In [2]:
import gensim
import nltk
import re
import string

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/davehiltbrand/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/davehiltbrand/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Create initial documents list

In [3]:
doc = [ ]
doc.append( 'It is a far, far better thing I do, than I have every done' )
doc.append( 'Call me Ishmael' )
doc.append( 'Is this a dagger I see before me?' )
doc.append( 'O happy dagger' )

Remove punctuation, then tokenize documents

In [6]:
punc = re.compile( '[%s]' % re.escape( string.punctuation ) )
term_vec = [ ]

for d in doc:
    d = d.lower()
    d = punc.sub( '', d )
    term_vec.append( nltk.word_tokenize( d ) )

Print resulting term vectors

In [8]:
for vec in term_vec:
    print(vec)

['it', 'is', 'a', 'far', 'far', 'better', 'thing', 'i', 'do', 'than', 'i', 'have', 'every', 'done']
['call', 'me', 'ishmael']
['is', 'this', 'a', 'dagger', 'i', 'see', 'before', 'me']
['o', 'happy', 'dagger']


Remove stop words from term vectors

In [11]:
stop_words = nltk.corpus.stopwords.words( 'english' )

for i in range( 0, len( term_vec ) ):
    term_list = [ ]

    for term in term_vec[ i ]:
        if term not in stop_words:
            term_list.append( term )

    term_vec[ i ] = term_list

Print term vectors with stop words removed

In [13]:
for vec in term_vec:
    print(vec)

['far', 'far', 'better', 'thing', 'every', 'done']
['call', 'ishmael']
['dagger', 'see']
['happy', 'dagger']


Porter stem remaining terms

In [15]:
porter = nltk.stem.porter.PorterStemmer()

for i in range( 0, len( term_vec ) ):
    for j in range( 0, len( term_vec[ i ] ) ):
        term_vec[ i ][ j ] = porter.stem( term_vec[ i ][ j ] )

Print term vectors with stop words removed

In [17]:
for vec in term_vec:
    print(vec)

['far', 'far', 'better', 'thing', 'everi', 'done']
['call', 'ishmael']
['dagger', 'see']
['happi', 'dagger']


misc notes

In [18]:
from math import *

In [20]:
log(4/12)

-1.0986122886681098

In [21]:
1/1.39

0.7194244604316548

Convert term vectors into gensim dictionary

In [22]:
dict = gensim.corpora.Dictionary( term_vec )

corp = [ ]
for i in range( 0, len( term_vec ) ):
    corp.append( dict.doc2bow( term_vec[ i ] ) )

Create TFIDF vectors based on term vectors bag-of-word corpora

In [23]:
tfidf_model = gensim.models.TfidfModel( corp )

tfidf = [ ]
for i in range( 0, len( corp ) ):
    tfidf.append( tfidf_model[ corp[ i ] ] )

Create pairwise document similarity index

In [24]:
n = len( dict )
index = gensim.similarities.SparseMatrixSimilarity( tfidf_model[ corp ], num_features = n )

Print TFIDF vectors and pairwise similarity per document

In [29]:
for i in range( 0, len( tfidf ) ):
    s = 'Doc ' + str( i + 1 ) + ' TFIDF:'

    for j in range( 0, len( tfidf[ i ] ) ):
        s = s + ' (' + dict.get( tfidf[ i ][ j ][ 0 ] ) + ','
        s = s + ( '%.3f' % tfidf[ i ][ j ][ 1 ] ) + ')'

    print(s)

for i in range( 0, len( corp ) ):
    print('Doc', ( i + 1 ), 'sim: [ ',)

    sim = index[ tfidf_model[ corp[ i ] ] ]
    for j in range( 0, len( sim ) ):
        print('%.3f ' % sim[ j ],)

    print(']')

Doc 1 TFIDF: (better,0.354) (done,0.354) (everi,0.354) (far,0.707) (thing,0.354)
Doc 2 TFIDF: (call,0.707) (ishmael,0.707)
Doc 3 TFIDF: (dagger,0.447) (see,0.894)
Doc 4 TFIDF: (dagger,0.447) (happi,0.894)
Doc 1 sim: [ 
1.000 
0.000 
0.000 
0.000 
]
Doc 2 sim: [ 
0.000 
1.000 
0.000 
0.000 
]
Doc 3 sim: [ 
0.000 
0.000 
1.000 
0.200 
]
Doc 4 sim: [ 
0.000 
0.000 
0.200 
1.000 
]
