- https://github.com/RaRe-Technologies/gensim/blob/d7f4c2c46aa7a16d2493d25f5830cdf267e573f4/docs/notebooks/Corpora_and_Vector_Spaces.ipynb
- https://github.com/RaRe-Technologies/gensim/blob/d7f4c2c46aa7a16d2493d25f5830cdf267e573f4/docs/notebooks/Topics_and_Transformations.ipynb
- https://github.com/RaRe-Technologies/gensim/blob/d7f4c2c46aa7a16d2493d25f5830cdf267e573f4/docs/notebooks/Similarity_Queries.ipynb

In [1]:
# Toy corpus: nine short document titles used by every later cell.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [2]:
from collections import defaultdict

# Common words that carry no topical information and are dropped.
stoplist = set('for a of the and to in'.split())

# Tokenize: lowercase each document, split on whitespace, drop stop words.
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# Count how many times each remaining token occurs across the whole corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Remove tokens that appear only once in the corpus.
texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]
texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [3]:
import os

# Directory (relative to the working directory) where the dictionary and
# corpus files written by the following cells are stored.
checkpoint_path = 'tmp'

# exist_ok=True makes this cell safe to re-run and removes the window between
# "check that the directory is missing" and "create it"; makedirs also creates
# intermediate directories should checkpoint_path ever become a nested path.
os.makedirs(checkpoint_path, exist_ok=True)

In [4]:
from gensim import corpora

# Build the token <-> integer id mapping from the filtered token lists.
dictionary = corpora.Dictionary(texts)

# Store the dictionary on disk, for future reference.
dictionary_file = os.path.join(checkpoint_path, 'deerwester.dict')
dictionary.save(dictionary_file)

print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [5]:
# Show the token -> id mapping held by the dictionary.
print(dictionary.token2id)

# Convert a new document to a bag-of-words vector of (token_id, count) pairs.
# In the recorded output only 'human' and 'computer' get an entry;
# 'interaction' is not in the dictionary and does not appear in the vector.
new_doc = "Human computer interaction"
new_doc_tokens = new_doc.lower().split()
new_vec = dictionary.doc2bow(new_doc_tokens)
print(new_vec)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}
[(0, 1), (1, 1)]


In [6]:
# Bag-of-words vector for every tokenized document.
corpus = list(map(dictionary.doc2bow, texts))

# Store the corpus on disk in Matrix Market format, for later use.
corpus_file = os.path.join(checkpoint_path, 'deerwester.mm')
corpora.MmCorpus.serialize(corpus_file, corpus)

for bow in corpus:
    print(bow)

[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]


In [7]:
from gensim import corpora, models, similarities

# Locations of the two artifacts written by the earlier cells.
dictionary_path = os.path.join(checkpoint_path, 'deerwester.dict')
corpus_path = os.path.join(checkpoint_path, 'deerwester.mm')

# Both files are loaded below, so both must be present; checking only the
# dictionary would let a missing corpus file fail inside the "found" branch.
if os.path.isfile(dictionary_path) and os.path.isfile(corpus_path):
    dictionary = corpora.Dictionary.load(dictionary_path)
    corpus = corpora.MmCorpus(corpus_path)
    print("Used files generated from first tutorial")
else:
    # Nothing is loaded here: the lookups below then rely on `dictionary`
    # still being defined by an earlier cell of this notebook.
    print("Please run first tutorial to generate data set")

# Sanity check: look up the tokens behind the first three ids.
print(dictionary[0])
print(dictionary[1])
print(dictionary[2])

Used files generated from first tutorial
computer
human
interface


In [8]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

# Re-weight a small hand-written bag-of-words vector (token ids 0 and 1).
doc_bow = [(0, 1), (1, 1)]
doc_tfidf = tfidf[doc_bow]
print(doc_tfidf)

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


In [9]:
# Apply the TF-IDF transformation to the whole corpus and show each document.
corpus_tfidf = tfidf[corpus]
for tfidf_doc in corpus_tfidf:
    print(tfidf_doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [10]:
# Initialize a 2-topic LSI transformation, trained on the TF-IDF corpus.
lsi = models.LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=2)

# Double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi.
corpus_lsi = lsi[corpus_tfidf]
corpus_lsi

<gensim.interfaces.TransformedCorpus at 0x10cce74a8>

In [11]:
# Display both LSI topics as weighted combinations of dictionary tokens.
lsi.print_topics(num_topics=2)

[(0,
  '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

In [12]:
# Query document, tokenized the same way as the new document earlier.
doc = "Human computer interaction"
query_tokens = doc.lower().split()
vec_bow = dictionary.doc2bow(query_tokens)

# Convert the query to LSI space.
# NOTE(review): `lsi` was fitted on TF-IDF vectors, while this projects the
# raw bag-of-words counts - confirm that skipping the TF-IDF step is intended.
vec_lsi = lsi[vec_bow]
print(vec_lsi)

[(0, 0.07910475117444804), (1, -0.5732835243079396)]


In [13]:
# Build an in-memory similarity index over the corpus projected into LSI space.
# NOTE(review): `lsi` was fitted on `corpus_tfidf`, but the index is built from
# `lsi[corpus]` (raw bag-of-words counts). The query vector `vec_lsi` is also
# projected from raw counts, so index and query agree with each other, but not
# with the TF-IDF input the model was trained on - confirm this is intended.
index = similarities.MatrixSimilarity(lsi[corpus])
# Disabled: persisting the index. TEMP_FOLDER is not defined in the visible
# cells (the directory variable used elsewhere is `checkpoint_path`).
# index.save(os.path.join(TEMP_FOLDER, 'deerwester.index'))
index

<gensim.similarities.docsim.MatrixSimilarity at 0x1104570f0>

In [14]:
# Perform a similarity query against the corpus.
sims = index[vec_lsi]

# Print (document_number, document_similarity) 2-tuples in corpus order.
print([(doc_number, similarity) for doc_number, similarity in enumerate(sims)])

[(0, 0.9999408), (1, 0.9946708), (2, 0.9999427), (3, 0.99987906), (4, 0.99935204), (5, -0.08804217), (6, -0.051574208), (7, -0.023664715), (8, 0.1938726)]


In [15]:
# Rank the documents by decreasing similarity to the query.
# The ranking is derived from a fresh `index[vec_lsi]` query rather than from
# the current value of `sims`: this cell rebinds `sims` to a list of tuples,
# so sorting `enumerate(sims)` again on a re-run would raise a TypeError.
# In the recorded output the top hit is document 2,
# "The EPS user interface management system".
sims = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])
print(sims) # print sorted (document number, similarity score) 2-tuples

[(2, 0.9999427), (0, 0.9999408), (3, 0.99987906), (4, 0.99935204), (1, 0.9946708), (8, 0.1938726), (7, -0.023664715), (6, -0.051574208), (5, -0.08804217)]


In [16]:
# vec1_tfidf = ksearch.getTfidfForDoc(73)
# vec2_tfidf = ksearch.getTfidfForDoc(results[0][0])    

# Interpret the top match.
# search.interpretMatch(vec1_tfidf, vec2_tfidf)

In [17]:
import numpy as np

def sparse2dense(sparse_vec, length):
    """
    Expand a sparse vector of (index, value) entries into a dense array.

    Parameters
    ----------
    sparse_vec : iterable
        Entries whose first item is a position and whose second item is the
        value to store there. Any iterable is accepted (list, generator,
        iterator); it is consumed exactly once.
    length : int
        Size of the dense vector to create.

    Returns
    -------
    numpy.ndarray
        Array of `length` zeros with each entry's value written at its
        position. If a position occurs more than once, the last entry wins.

    Raises
    ------
    IndexError
        If an entry's position is outside the range allowed by `length`.
    """
    vec = np.zeros(length)
    # Iterate directly instead of by index so that inputs without len() or
    # subscripting (e.g. generators) work as well.
    for entry in sparse_vec:
        vec[entry[0]] = entry[1]

    return vec


# TF-IDF vector of document 2, projected into LSI space and shown as a dense
# array with one slot per topic.
vec1_tfidf = corpus_tfidf[2]
vec1_topics = lsi[vec1_tfidf]
sparse2dense(vec1_topics, lsi.num_topics)

array([ 0.0899264 , -0.72418606])

In [18]:
# The two vectors compared in the following cells, both in sparse TF-IDF form:
# vec2 is the query, vec1 is document 2 of the corpus.
vec2_tfidf = tfidf[vec_bow]
vec1_tfidf = corpus_tfidf[2]

print(vec2_tfidf)
vec1_tfidf

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


[(2, 0.5710059809418182),
 (5, 0.4170757362022777),
 (7, 0.4170757362022777),
 (8, 0.5710059809418182)]

In [19]:
# Number of distinct tokens known to the dictionary.
vocab_size = len(dictionary.token2id)

# Dense LSI representations (one slot per topic) of both TF-IDF vectors.
# These must be computed before vec1_tfidf is densified below.
vec1_lsi = sparse2dense(lsi[vec1_tfidf], lsi.num_topics)
vec2_lsi = sparse2dense(lsi[vec2_tfidf], lsi.num_topics)

# From here on vec1_tfidf is a dense array with one slot per token, no longer
# a sparse list of pairs. Re-running this cell on its own therefore feeds the
# dense array back in; re-run the previous cell first.
vec1_tfidf = sparse2dense(vec1_tfidf, vocab_size)

print(vec1_lsi, vec2_lsi, vec1_tfidf, sep='\n')

[ 0.0899264  -0.72418606]
[ 0.05593551 -0.40537267]
[0.         0.         0.57100598 0.         0.         0.41707574
 0.         0.41707574 0.57100598 0.         0.         0.        ]


In [20]:
# Product of the two LSI vector lengths: the denominator of cosine similarity.
norms = np.linalg.norm(vec1_lsi) * np.linalg.norm(vec2_lsi)

# One slot per vocabulary word, holding that word's similarity contribution.
word_sims = np.zeros(vocab_size)

for word_id in range(vocab_size):
    # Row of the LSI projection for this word: one weight per topic.
    word_weights = np.asarray(lsi.projection.u[word_id, :]).flatten()

    # How strongly this word's topic weights line up with document 2's
    # LSI vector.
    alignment = np.dot(word_weights, vec2_lsi)

    # Contribution of this word in doc1 to the total similarity.
    word_sims[word_id] = vec1_tfidf[word_id] * alignment / norms

word_sims

array([ 0.        ,  0.        ,  0.25762474,  0.        ,  0.        ,
        0.26537098,  0.        ,  0.21555183,  0.2613603 , -0.        ,
       -0.        ,  0.        ])

In [21]:
# Shape of the LSI projection matrix. The recorded output is (12, 2): one row
# per dictionary token (12 unique tokens) and one column per topic
# (num_topics=2).
lsi.projection.u.shape

(12, 2)

In [22]:
# Same computation as the explicit per-word loop above, expressed as a single
# matrix-vector product over all rows of the projection matrix.
norms = np.linalg.norm(vec1_lsi) * np.linalg.norm(vec2_lsi)
topic_alignment = lsi.projection.u.dot(vec2_lsi)
word_sims = vec1_tfidf * topic_alignment / norms
word_sims

array([ 0.        ,  0.        ,  0.25762474,  0.        ,  0.        ,
        0.26537098,  0.        ,  0.21555183,  0.2613603 , -0.        ,
       -0.        ,  0.        ])

In [23]:
# Rank (word_id, contribution) pairs from largest to smallest contribution.
# This rebinds word_sims from an array to a list of pairs, so the cell that
# computes word_sims has to be re-run before this one is run again.
word_sims = sorted(enumerate(word_sims), key=lambda pair: pair[1], reverse=True)
word_sims

[(5, 0.2653709808317419),
 (8, 0.26136029647405956),
 (2, 0.25762473899614763),
 (7, 0.21555182985077137),
 (0, 0.0),
 (1, 0.0),
 (3, 0.0),
 (4, 0.0),
 (6, 0.0),
 (9, -0.0),
 (10, -0.0),
 (11, 0.0)]

In [24]:
def printWordSims(word_sims, dictionary, topn=10, min_pos=0.1, max_neg=0.01):
    """
    Print a two-column table of the words contributing most to a match.

    Row i shows the i-th entry from the front of `word_sims` in the left
    column and the i-th entry from the back in the right column, so the list
    is expected to be ordered from the largest to the smallest contribution.

    Parameters
    ----------
    word_sims : sequence of (word_id, value) pairs
        Per-word similarity contributions.
    dictionary : mapping
        Maps a word id to the word to display (looked up as `dictionary[id]`).
    topn : int
        Maximum number of rows. Fewer rows are printed when `word_sims` is
        shorter than `topn`.
    min_pos : float
        A left-column word is shown only if its value is greater than this.
    max_neg : float
        A right-column word is shown only if its value is less than this.

    Returns
    -------
    None
        The table is written to standard output.
    """
    # TODO - First create the list of results in interpretMatch, then
    #        in this function just do the printing, and adapt the column
    #        width to the maximum word length in the results...

    # Never index past the end of the list: with fewer than `topn` entries
    # the unclamped loop would raise IndexError.
    num_rows = min(topn, len(word_sims))

    # Build up the table of results to display, one string per row.
    rows = []
    for i in range(num_rows):
        pos_word_id, pos_word_val = word_sims[i]
        neg_word_id, neg_word_val = word_sims[-(i + 1)]

        pos_word = dictionary[pos_word_id]
        neg_word = dictionary[neg_word_id]

        # If neither word passes its threshold, stop.
        if pos_word_val <= min_pos and neg_word_val >= max_neg:
            break

        # Only display the positive word if the value passes the threshold.
        if pos_word_val > min_pos:
            left = '  %15s  +%.3f' % (pos_word, pos_word_val)
        else:
            # Blank padding as wide as a left-column entry,
            # e.g. '          freedom  +0.440' (25 characters).
            left = ' ' * 25

        # Only display the negative word if the value passes the threshold;
        # otherwise the row just ends after the left column.
        if neg_word_val < max_neg:
            right = '    %15s  %.3f' % (neg_word, neg_word_val)
        else:
            right = ''

        rows.append(left + right + '\n')

    print(''.join(rows))

In [25]:
# Show the strongest positive and negative word contributions. word_sims must
# be the sorted list of (word_id, value) pairs produced by the sorting cell.
printWordSims(word_sims=word_sims, dictionary=dictionary)

           system  +0.265             minors  0.000
              eps  +0.261              graph  -0.000
        interface  +0.258              trees  -0.000
             user  +0.216               time  0.000
                                      survey  0.000
                                    response  0.000
                                       human  0.000
                                    computer  0.000

