In [16]:
import gensim
import numpy as np

In [21]:
# Toy corpus: nine short document titles, one string per document.
text_corpus = [
    "Human and human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [22]:
# Words that are dropped from every document during tokenisation.
# NOTE(review): 'and' is not in this list, so it survives into `texts` -- confirm that is intended.
stoplist = set("for a of the to in".split(" "))


def _tokenize(document):
    """Lowercase `document`, split it on whitespace and drop stoplist words."""
    return [word for word in document.lower().split() if word not in stoplist]


# One token list per document, in corpus order.
texts = [_tokenize(document) for document in text_corpus]
texts

[['human',
  'and',
  'human',
  'machine',
  'interface',
  'lab',
  'abc',
  'computer',
  'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'and', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph',
  'minors',
  'iv',
  'widths',
  'trees',
  'and',
  'well',
  'quasi',
  'ordering'],
 ['graph', 'minors', 'survey']]

In [23]:
# Tally how often each token occurs across the whole corpus.
from collections import defaultdict

frequency = defaultdict(int)
for tokens in texts:
    for token in tokens:
        frequency[token] += 1

# Keep only the tokens that occur more than once in the whole corpus.
processed_corpus = [
    [token for token in tokens if frequency[token] > 1]
    for tokens in texts
]

In [24]:
# Inspect the corpus after dropping tokens that occur only once.
processed_corpus

[['human', 'and', 'human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'and', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees', 'and'],
 ['graph', 'minors', 'survey']]

In [42]:
# Before proceeding, associate each word in the corpus with a unique integer ID
# using gensim's corpora.Dictionary.
from gensim import corpora

# prune_at is deliberately left at its default: it caps how many words the
# dictionary tries to keep, so a cap of 1 would throw away nearly the whole
# vocabulary as soon as the corpus is large enough for pruning to kick in.
dictionary = corpora.Dictionary(documents=processed_corpus)
print(dictionary)

Dictionary<13 unique tokens: ['and', 'computer', 'human', 'interface', 'response']...>


In [43]:
# token -> integer id mapping; 13 tokens, so each document maps to a 13-dimensional vector
print(dictionary.token2id)

{'and': 0, 'computer': 1, 'human': 2, 'interface': 3, 'response': 4, 'survey': 5, 'system': 6, 'time': 7, 'user': 8, 'eps': 9, 'trees': 10, 'graph': 11, 'minors': 12}


In [50]:
# Vectorize a new phrase with the existing dictionary.
new_doc= 'Human and computer used in survey INterface'
# Lowercase and split on whitespace; no stopword filtering is applied here.
new_doc_normalized = new_doc.lower().split()
# Bag-of-words as (token_id, count) pairs; 'used' and 'in' are not in the
# dictionary and do not appear in the result.
new_vec = dictionary.doc2bow(new_doc_normalized)
new_vec

[(0, 1), (1, 1), (2, 1), (3, 1), (5, 1)]

In [51]:
# Turn every document of the processed corpus into its bag-of-words vector.
bow_corpus = list(map(dictionary.doc2bow, processed_corpus))

In [52]:
# Inspect the bag-of-words vectors: one list of (token_id, count) pairs per document.
bow_corpus

[[(0, 1), (1, 1), (2, 2), (3, 1)],
 [(1, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(3, 1), (6, 1), (8, 1), (9, 1)],
 [(0, 1), (2, 1), (6, 2), (9, 1)],
 [(4, 1), (7, 1), (8, 1)],
 [(10, 1)],
 [(10, 1), (11, 1)],
 [(0, 1), (10, 1), (11, 1), (12, 1)],
 [(5, 1), (11, 1), (12, 1)]]