In [1]:
from collections import defaultdict
from gensim import corpora

documents = [
    "Human machine social interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human social system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
    "Pocahontas is a social human who likes interaction",
    "There was a shooting in alabama"
]

# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# remove words that appear only once
frequency = defaultdict(int)
for text in texts:
    #print(f"ON {text}...")
    for token in text:
        #print(token)
        frequency[token] += 1 # frequency count

# list of unique words per document in corpus
texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

# returns gensim object of dictionary mapping
dictionary = corpora.Dictionary(texts)

# list of tuple(token_id, token_count)
corpus = [dictionary.doc2bow(text) for text in texts]

In [2]:
[text for text in texts]

[['human', 'social', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'social', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey'],
 ['social', 'human'],
 []]

In [3]:
# also dictionary.token2id()
print(dictionary.keys())
print([dictionary[i] for i in dictionary.keys()])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
['computer', 'human', 'interface', 'social', 'response', 'survey', 'system', 'time', 'user', 'eps', 'trees', 'graph', 'minors']


In [4]:
# (token_id, token_count)
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(2, 1), (6, 1), (8, 1), (9, 1)],
 [(1, 1), (3, 1), (6, 2), (9, 1)],
 [(4, 1), (7, 1), (8, 1)],
 [(10, 1)],
 [(10, 1), (11, 1)],
 [(10, 1), (11, 1), (12, 1)],
 [(5, 1), (11, 1), (12, 1)],
 [(1, 1), (3, 1)],
 []]

In [5]:
# Contents of dictionary
print(f"Contents of dictionary: {[dictionary[i] for i in dictionary.keys()]}")

Contents of dictionary: ['computer', 'human', 'interface', 'social', 'response', 'survey', 'system', 'time', 'user', 'eps', 'trees', 'graph', 'minors']


In [6]:
# Create LSI model using dictionary as training data
from gensim import models
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=1)

In [7]:
# Output most meaningful words in corpus
lsi.print_topics()

[(0,
  '0.624*"system" + 0.324*"user" + 0.323*"social" + 0.323*"human" + 0.300*"eps" + 0.233*"computer" + 0.209*"interface" + 0.204*"response" + 0.204*"time" + 0.161*"survey"')]

In [8]:
#test_doc = "Girard is a social science student interested in computer science and human computer interaction and natural language processing"
#test_doc = "Girard is a mechanic and a mountaineer that likes to cook."

# Add last sentence to bring similarity score up
test_doc = "We started off early, but by the time we got a glimpse of the Aberdeen glacier, there were two parties midway through the ice pitch already. We could barely make them out from a far, and it seemed like the wall was not that much of a deal. It was pretty daunting once you get close enought to appreciate the massive ice formation."

In [9]:
vec_bow = dictionary.doc2bow(test_doc.lower().split())
vec_lsi = lsi[vec_bow]  # convert the query test_doc to LSI matrix and calculate cosine similarity against the lsi model
print(vec_lsi)

[(0, 0.20380793062023445)]


In [10]:
from gensim import similarities
index = similarities.MatrixSimilarity(lsi[corpus])

index.get_similarities(test_doc, vec_lsi)

index.save('/tmp/deerwester.index')
index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')

In [11]:
documents

['Human machine social interface for lab abc computer applications',
 'A survey of user opinion of computer system response time',
 'The EPS user interface management system',
 'System and human social system engineering testing of EPS',
 'Relation of user perceived response time to error measurement',
 'The generation of random binary unordered trees',
 'The intersection graph of paths in trees',
 'Graph minors IV Widths of trees and well quasi ordering',
 'Graph minors A survey',
 'Pocahontas is a social human who likes interaction',
 'There was a shooting in alabama']

In [12]:
sims = index[vec_lsi]  # perform a similarity query against the corpus
print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples

[(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0), (9, 1.0), (10, 0.0)]


In [13]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for doc_position, doc_score in sims:
    print(doc_score, documents[doc_position])

1.0 Human machine social interface for lab abc computer applications
1.0 A survey of user opinion of computer system response time
1.0 The EPS user interface management system
1.0 System and human social system engineering testing of EPS
1.0 Relation of user perceived response time to error measurement
1.0 The generation of random binary unordered trees
1.0 The intersection graph of paths in trees
1.0 Graph minors IV Widths of trees and well quasi ordering
1.0 Graph minors A survey
1.0 Pocahontas is a social human who likes interaction
0.0 There was a shooting in alabama


In [14]:
sims

[(0, 1.0),
 (1, 1.0),
 (2, 1.0),
 (3, 1.0),
 (4, 1.0),
 (5, 1.0),
 (6, 1.0),
 (7, 1.0),
 (8, 1.0),
 (9, 1.0),
 (10, 0.0)]

In [15]:
type(documents)

list