In [63]:
# Configure the root logger to emit records at INFO level and above, formatted as
# "timestamp : level : message", so library progress messages appear under each cell.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [64]:
from gensim import corpora, models, similarities

# Load the previously saved dictionary and corpus from disk.
dictionary = corpora.Dictionary.load('onet.dict')
corpus = corpora.MmCorpus('onet_corpus.mm') # same approach as the first gensim tutorial, "Corpora and Vector Space"
# Sanity check: prints the corpus summary (documents, features, non-zero entries).
print(corpus)

2017-10-08 13:46:04,798 : INFO : loading Dictionary object from onet.dict
2017-10-08 13:46:04,813 : INFO : loaded onet.dict
2017-10-08 13:46:04,815 : INFO : loaded corpus index from onet_corpus.mm.index
2017-10-08 13:46:04,816 : INFO : initializing corpus reader from onet_corpus.mm
2017-10-08 13:46:04,823 : INFO : accepted corpus with 974 documents, 17354 features, 457991 non-zero entries


MmCorpus(974 documents, 17354 features, 457991 non-zero entries)


In [65]:
# Create an LSI model with 300 topics, using the dictionary to map feature ids back to words.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=300)

2017-10-08 13:46:04,837 : INFO : using serial LSI version on this node
2017-10-08 13:46:04,838 : INFO : updating model with new documents
2017-10-08 13:46:06,090 : INFO : preparing a new chunk of documents
2017-10-08 13:46:06,177 : INFO : using 100 extra samples and 2 power iterations
2017-10-08 13:46:06,178 : INFO : 1st phase: constructing (17354, 400) action matrix
2017-10-08 13:46:06,340 : INFO : orthonormalizing (17354, 400) action matrix
2017-10-08 13:46:08,521 : INFO : 2nd phase: running dense svd on (400, 974) matrix
2017-10-08 13:46:08,704 : INFO : computing the final decomposition
2017-10-08 13:46:08,705 : INFO : keeping 300 factors (discarding 2.327% of energy spectrum)
2017-10-08 13:46:08,869 : INFO : processed documents up to #974
2017-10-08 13:46:08,875 : INFO : topic #0(973.809): 0.357*"time" + 0.285*"exposed" + 0.284*"spend" + 0.284*"equipment" + 0.230*"software" + 0.190*"safety" + 0.162*"microsoft" + 0.153*"work" + 0.134*"systems" + 0.101*"using"
2017-10-08 13:46:08,876

In [66]:
# Build a similarity index over the corpus projected into LSI space, then persist it.
# num_features is passed explicitly so MatrixSimilarity does not need a separate pass
# over the lazily transformed corpus just to discover the width of the vectors.
index = similarities.MatrixSimilarity(lsi[corpus], num_features=lsi.num_topics)
index.save('onet.index')

2017-10-08 13:46:10,481 : INFO : creating matrix with 974 documents and 300 features
2017-10-08 13:46:12,176 : INFO : saving MatrixSimilarity object under onet.index, separately None
2017-10-08 13:46:12,185 : INFO : saved onet.index


In [67]:
# Test example: a free-text job description used as the query document.
doc = "Apply principles of psychology to human resources, administration, management, sales, and marketing problems. Activities may include policy planning; employee testing and selection, training and development; and organizational development and analysis. May work with management to organize the work setting to improve worker productivity"
# Lowercase, split on whitespace, and convert to the dictionary's bag-of-words form.
# NOTE(review): split() leaves punctuation attached to tokens (e.g. "resources,", "problems.");
# presumably the dictionary was built with the same tokenization -- TODO confirm, otherwise
# such tokens may not be found in the dictionary.
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] # convert the query to LSI space

In [68]:
# Score the LSI-space query vector against the indexed corpus.
sims = index[vec_lsi] # perform a similarity query against the corpus

In [69]:
# Rank (document position, score) pairs from the best match to the worst.
# The scores are queried from the index directly instead of being read back from `sims`,
# so re-running this cell does not try to sort an already-sorted list of pairs
# (which would raise TypeError when the key function meets a tuple).
sims = sorted(enumerate(index[vec_lsi]), key=lambda item: item[1], reverse=True)

In [70]:
import pickle

# Load the job lookup table; it is indexed by corpus document position further down.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open('lookuptable', 'rb') as lookup_file:  # context manager guarantees the handle is closed
    jobs = pickle.load(lookup_file)

In [71]:
# Print the titles of the 10 best-matching jobs, best match first.
for doc_idx, _score in sims[:10]:
    # Each entry of sims is a (document position, similarity score) pair.
    print(jobs[doc_idx]['title'])

Intelligence Analysts
Wind Energy Operations Managers
First-Line Supervisors of Retail Sales Workers
Dental Hygienists
Water/Wastewater Engineers
Naturopathic Physicians
Sound Engineering Technicians
Wind Energy Project Managers
Mechatronics Engineers
Cardiovascular Technologists and Technicians
