In [1]:
from collections import defaultdict
from gensim import corpora

# Toy corpus: each entry is one short document.
documents = [
    "Human machine social interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human social system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
    "Pocahontas is a social human who likes interaction"
]

# Lowercase and whitespace-split every document, then drop stop words.
stoplist = set('for a of the and to in'.split())
tokenized = [document.lower().split() for document in documents]
texts = [
    [word for word in words if word not in stoplist]
    for words in tokenized
]

# Count how often each surviving token occurs across the whole corpus.
frequency = defaultdict(int)
for token in (word for text in texts for word in text):
    frequency[token] += 1

# Keep only tokens seen more than once; tokens that occur a single time
# anywhere in the corpus are discarded.
repeated = {token for token, count in frequency.items() if count > 1}
texts = [
    [token for token in text if token in repeated]
    for text in texts
]

# Build the token -> integer id mapping from the filtered texts, then
# convert every document to its bag-of-words representation.
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [2]:
# Display a shallow copy of the filtered token lists.
list(texts)

[['human', 'social', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'social', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey'],
 ['social', 'human']]

In [3]:
# Display the filtered token lists, one inner list per document.
texts

[['human', 'social', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'social', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey'],
 ['social', 'human']]

In [4]:
from gensim import models
# Build a two-topic LSI model from the bag-of-words corpus; the dictionary
# is supplied so topic terms can be reported as words rather than ids.
lsi = models.LsiModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=2,
)

In [5]:
# Query text that will be compared against the indexed documents.
doc = "Pocahontas is a person with computer interaction"

In [6]:
# Sanity check: the query is a plain string.
type(doc)

str

In [7]:
# Lowercase and whitespace-split the query, then encode it as a
# bag-of-words vector using the corpus dictionary.
query_tokens = doc.lower().split()
vec_bow = dictionary.doc2bow(query_tokens)

# Project the bag-of-words query into the LSI topic space.
vec_lsi = lsi[vec_bow]
print(vec_lsi)

[(0, 0.2330918031540606), (1, -0.10156362666135244)]


In [8]:
# Display the query's LSI representation as (topic_id, weight) pairs.
vec_lsi

[(0, 0.2330918031540606), (1, -0.10156362666135244)]

In [9]:
# Display the dictionary's token -> integer id mapping.
dictionary.token2id

{'computer': 0,
 'human': 1,
 'interface': 2,
 'social': 3,
 'response': 4,
 'survey': 5,
 'system': 6,
 'time': 7,
 'user': 8,
 'eps': 9,
 'trees': 10,
 'graph': 11,
 'minors': 12}

In [10]:
from gensim import similarities
# Transform the whole corpus into LSI space, then build a similarity
# index over the transformed documents.
corpus_lsi = lsi[corpus]
index = similarities.MatrixSimilarity(corpus_lsi)

In [11]:
# Persist the index to disk, then read it back from the same file.
index_path = '/tmp/deerwester.index'
index.save(index_path)
index = similarities.MatrixSimilarity.load(index_path)

In [12]:
# Perform a similarity query of the LSI-space query vector against the corpus.
sims = index[vec_lsi]

# Print (document_number, document_similarity) 2-tuples in corpus order.
print([(doc_number, similarity) for doc_number, similarity in enumerate(sims)])

[(0, 0.55212295), (1, 0.96431756), (2, 0.9214136), (3, 0.65517575), (4, 0.8458402), (5, 0.43049815), (6, 0.45229146), (7, 0.46063223), (8, 0.59685427), (9, 0.3227335)]


In [13]:
# Rank the documents from highest to lowest similarity score.
# NOTE: this rebinds `sims` from the raw query scores to a list of
# (document_number, score) pairs, so re-running this cell without first
# re-running the query cell will fail when the key negates a tuple.
sims = sorted(enumerate(sims), key=lambda pair: -pair[1])

# Print each score next to the original text of the matching document.
for position, score in sims:
    print(score, documents[position])

0.96431756 A survey of user opinion of computer system response time
0.9214136 The EPS user interface management system
0.8458402 Relation of user perceived response time to error measurement
0.65517575 System and human social system engineering testing of EPS
0.59685427 Graph minors A survey
0.55212295 Human machine social interface for lab abc computer applications
0.46063223 Graph minors IV Widths of trees and well quasi ordering
0.45229146 The intersection graph of paths in trees
0.43049815 The generation of random binary unordered trees
0.3227335 Pocahontas is a social human who likes interaction


In [15]:
# Display the current value of `sims`.
sims

[(1, 0.96431756),
 (2, 0.9214136),
 (4, 0.8458402),
 (3, 0.65517575),
 (8, 0.59685427),
 (0, 0.55212295),
 (7, 0.46063223),
 (6, 0.45229146),
 (5, 0.43049815),
 (9, 0.3227335)]

In [16]:
# Sanity check: the raw corpus is a plain Python list.
type(documents)

list