In [86]:
import pickle 
import pandas as pd
from gensim import corpora, models, similarities

jobs = pickle.load(open('lookuptable', 'rb'))

# Import the dictionaryy and corpus 
dictionary = corpora.Dictionary.load('onet.dict')
words = pd.DataFrame([{'keys':v, 'value':k} for k, v in dictionary.token2id.items()]) # Word lookup
corpus = corpora.MmCorpus('onet_corpus.mm') # comes from the first tutorial, "Corpora and Vector Space"

print(corpus)

MmCorpus(974 documents, 25184 features, 451904 non-zero entries)


In [87]:
# Build a model 

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=200)

index = similarities.MatrixSimilarity(lsi[corpus])
index.save('onet.index')
index = similarities.MatrixSimilarity.load('onet.index')

In [88]:
# Example doc
# not bad - "using a hammer and working with tools"

# Bad  - "dancing, music, paint, art"

doc = "microsoft, adobe"
vec_bow = dictionary.doc2bow(doc.split())
vec_lsi = lsi[vec_bow]

In [89]:
# Turn vec_lsi (ie list of words) into a dataframe
df = pd.DataFrame([{'keys':x[0], 'values':x[1]} for x in vec_lsi])

In [90]:
# Merge in the words and sort in descending order
df = df.merge(words, 
         left_on = 'keys', 
         right_on = 'keys', 
         how = 'left', 
        )

df.sort_values('values', ascending = False)[:20]

Unnamed: 0,keys,values,value
9,9,0.284346,focus
7,7,0.169996,preferences
5,5,0.156024,service
32,32,0.110256,profitability
11,11,0.082093,direct
25,25,0.075074,discount
33,33,0.068091,accounting
24,24,0.060665,schedules
57,57,0.0601,business
54,54,0.056819,ensure


In [91]:
def lookupJob(doc): 
    # Test example 
    doc = doc.lower().replace(",","").replace(".","").replace(";","").replace("\n","").replace("\\", "")
    print("Using string: {0}\n".format(doc))
    vec_bow = dictionary.doc2bow(doc.split())
    vec_lsi = lsi[vec_bow] # convert the query to LSI space

    sims = index[vec_lsi] # perform a similarity query against the corpus
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    
    # Print out the top 5 jobs 
    for x in sims[:10]: 
        idx = x[0]
        job = jobs[idx]
        print(job['title'], (x[0], x[1]))

In [92]:
lookupJob(doc)

Using string: microsoft adobe

Aircraft Mechanics and Service Technicians (768, 0.62871486)
Animal Scientists (214, 0.62647748)
Methane/Landfill Gas Collection System Operators (18, 0.61470193)
Instructional Coordinators (354, 0.61350328)
Stationary Engineers and Boiler Operators (878, 0.5824883)
Computer Hardware Engineers (154, 0.5618853)
Business Teachers, Postsecondary (296, 0.55614662)
Refractory Materials Repairers, Except Brickmasons (791, 0.54400772)
Insurance Underwriters (99, 0.53936934)
Security Managers (54, 0.53617871)
