In [1]:
import logging
# Show INFO-level (and above) log records in the notebook, formatted as
# "<timestamp> : <level> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
import os
import tempfile
# Scratch directory used by the cells below to save and reload artifacts.
TEMP_FOLDER = tempfile.gettempdir()
folder_notice = 'Folder "{}" will be used to save temporary dictionary and corpus.'
print(folder_notice.format(TEMP_FOLDER))

Folder "/tmp" will be used to save temporary dictionary and corpus.


In [3]:
from gensim import corpora, models, similarities

# Locations of the previously saved artifacts inside the scratch folder.
# The .mm corpus comes from the first tutorial, "Corpora and Vector Space";
# the dictionary is presumably saved by the same tutorial (TODO confirm).
dictionary_path = os.path.join(TEMP_FOLDER, 'deerwester.dict')
corpus_path = os.path.join(TEMP_FOLDER, 'deerwester.mm')

dictionary = corpora.Dictionary.load(dictionary_path)
corpus = corpora.MmCorpus(corpus_path)
print(corpus)

2019-01-24 01:12:28,922 : INFO : 'pattern' package not found; tag filters are not available for English
2019-01-24 01:12:28,928 : INFO : loading Dictionary object from /tmp/deerwester.dict
2019-01-24 01:12:28,929 : INFO : loaded /tmp/deerwester.dict
2019-01-24 01:12:28,931 : INFO : loaded corpus index from /tmp/deerwester.mm.index
2019-01-24 01:12:28,932 : INFO : initializing cython corpus reader from /tmp/deerwester.mm
2019-01-24 01:12:28,933 : INFO : accepted corpus with 9 documents, 12 features, 28 non-zero entries


MmCorpus(9 documents, 12 features, 28 non-zero entries)


In [4]:
# Fit a 2-topic LSI model on the corpus; id2word lets the logged topics show
# words instead of integer ids.
lsi = models.LsiModel(
    corpus=corpus,
    num_topics=2,
    id2word=dictionary,
)

2019-01-24 01:12:53,706 : INFO : using serial LSI version on this node
2019-01-24 01:12:53,709 : INFO : updating model with new documents
2019-01-24 01:12:53,710 : INFO : preparing a new chunk of documents
2019-01-24 01:12:53,712 : INFO : using 100 extra samples and 2 power iterations
2019-01-24 01:12:53,712 : INFO : 1st phase: constructing (12, 102) action matrix
2019-01-24 01:12:53,713 : INFO : orthonormalizing (12, 102) action matrix
2019-01-24 01:12:53,727 : INFO : 2nd phase: running dense svd on (12, 9) matrix
2019-01-24 01:12:53,729 : INFO : computing the final decomposition
2019-01-24 01:12:53,730 : INFO : keeping 2 factors (discarding 43.156% of energy spectrum)
2019-01-24 01:12:53,731 : INFO : processed documents up to #9
2019-01-24 01:12:53,733 : INFO : topic #0(3.341): 0.644*"system" + 0.404*"user" + 0.301*"eps" + 0.265*"time" + 0.265*"response" + 0.240*"computer" + 0.221*"human" + 0.206*"survey" + 0.198*"interface" + 0.036*"graph"
2019-01-24 01:12:53,735 : INFO : topic #1(2

In [5]:
doc = "Human computer interaction"
query_tokens = doc.lower().split()  # lowercase, whitespace-separated tokens
vec_bow = dictionary.doc2bow(query_tokens)
vec_lsi = lsi[vec_bow]  # project the bag-of-words query into LSI space
print(vec_lsi)

[(0, 0.46182100453271574), (1, -0.07002766527899992)]


In [6]:
# Wrap the corpus in the LSI transformation, then index the transformed
# documents for similarity queries.
lsi_corpus_view = lsi[corpus]
index = similarities.MatrixSimilarity(lsi_corpus_view)

2019-01-24 01:14:11,429 : INFO : creating matrix with 9 documents and 2 features
  if np.issubdtype(vec.dtype, np.int):


In [7]:
# Persist the similarity index. The save and the (commented-out) reload share
# one path variable so they can never point at different files.
index_path = os.path.join(TEMP_FOLDER, 'deerwester.index')
index.save(index_path)
# index = similarities.MatrixSimilarity.load(index_path)

2019-01-24 01:15:09,306 : INFO : saving MatrixSimilarity object under /tmp/deerwester.index, separately None
2019-01-24 01:15:09,311 : INFO : saved /tmp/deerwester.index


In [8]:
sims = index[vec_lsi]  # similarity of the query against every indexed document
numbered_sims = list(enumerate(sims))  # (document_number, document_similarity) 2-tuples
print(numbered_sims)

[(0, 0.998093), (1, 0.93748635), (2, 0.9984453), (3, 0.9865886), (4, 0.90755945), (5, -0.12416792), (6, -0.10639259), (7, -0.09879464), (8, 0.050041765)]


In [None]:
# `corpus_lsi` was never defined in this notebook, so the loop raised a
# NameError on a fresh kernel. `lsi` was fitted directly on the bag-of-words
# `corpus`, so the wrapper is built from that corpus.
corpus_lsi = lsi[corpus]
for doc in corpus_lsi:  # the bow->lsi transformation is actually executed here, on the fly
    print(doc)

In [None]:
# Persist the fitted LSI model in the scratch folder.
lsi_model_path = os.path.join(TEMP_FOLDER, 'model.lsi')
lsi.save(lsi_model_path)  # same for tfidf, lda, ...
# lsi = models.LsiModel.load(lsi_model_path)

In [None]:
# NOTE(review): `MyCorpus` was not defined anywhere in the visible notebook, so
# this cell failed with a NameError on a fresh kernel. The definition below
# follows the streaming-corpus pattern; adjust the path if the data lives elsewhere.
class MyCorpus(object):
    """Stream 'datasets/mycorpus.txt' as bag-of-words vectors.

    Each iteration re-opens the file and yields ``dictionary.doc2bow(tokens)``
    for one line at a time, so the whole corpus is never held in memory.
    Assumes one document per line with whitespace-separated tokens (TODO
    confirm against the data file) and relies on the notebook-level
    ``dictionary`` being defined before iteration starts.
    """

    def __iter__(self):
        # `with` guarantees the file is closed even if iteration stops early.
        with open('datasets/mycorpus.txt') as corpus_file:
            for line in corpus_file:
                yield dictionary.doc2bow(line.lower().split())


corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
print(corpus_memory_friendly)

In [None]:
# Walk the streamed corpus; only the current vector is held in memory.
for bow_vector in corpus_memory_friendly:
    print(bow_vector)

In [None]:
from six import iteritems
from smart_open import smart_open

# Common English stop words to drop from the vocabulary. `stoplist` was used
# below without ever being defined in this notebook (NameError on a fresh kernel).
stoplist = set('for a of the and to in'.split())

# Collect statistics about all tokens. The `with` block closes the file handle
# once the dictionary has consumed every line.
with smart_open('datasets/mycorpus.txt', 'rb') as corpus_file:
    dictionary = corpora.Dictionary(line.lower().split() for line in corpus_file)

# ids of the stop words that actually occur in the dictionary
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
# ids of the tokens whose document frequency is exactly 1
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]

# drop both groups from the dictionary in a single call
dictionary.filter_tokens(stop_ids + once_ids)
print(dictionary)

In [None]:
# Display the dictionary's document-frequency mapping (token id -> number of
# documents that contain the token) as the cell output.
dictionary.dfs

In [None]:
# Toy corpus of two documents held in a plain Python list;
# the second document is deliberately left empty.
corpus = [
    [(1, 0.5)],
    [],
]

# Write the toy corpus to the scratch folder in Matrix Market format.
toy_corpus_path = os.path.join(TEMP_FOLDER, 'corpus.mm')
corpora.MmCorpus.serialize(toy_corpus_path, corpus)

In [None]:
# Re-open the serialized toy corpus from the scratch folder.
toy_corpus_path = os.path.join(TEMP_FOLDER, 'corpus.mm')
corpus = corpora.MmCorpus(toy_corpus_path)

In [None]:
# Print the corpus object's one-line summary rather than its documents.
print(corpus)

In [None]:
# Print every document of the corpus, one per line.
for document in corpus:
    print(document)

In [None]:
import gensim
import numpy as np
# Seeded local generator: the example matrix is identical on every
# Restart & Run All and the global NumPy random state is left untouched.
rng = np.random.RandomState(42)
numpy_matrix = rng.randint(10, size=[5, 2])

# Dense2Corpus treats each column as one document by default, so this
# 5x2 matrix becomes a corpus of 2 documents over 5 terms.
corpus = gensim.matutils.Dense2Corpus(numpy_matrix)

# num_terms is the number of rows of the rebuilt matrix. The previous
# hard-coded 10 did not match the 5 terms above and produced a zero-padded
# 10x2 array instead of reproducing the original matrix's shape.
numpy_matrix_dense = gensim.matutils.corpus2dense(corpus, num_terms=numpy_matrix.shape[0])

In [None]:
# Display the source integer matrix as the cell output.
numpy_matrix

In [None]:
# Materialize the corpus into a list so each document's contents are printed.
print(list(corpus))

In [None]:
# Display the dense matrix rebuilt from the corpus as the cell output.
numpy_matrix_dense