In [2]:
import pickle, gensim, logging
import pandas as pd
import numpy as np
from nltk.corpus import stopwords 
from gensim.models.keyedvectors import KeyedVectors

In [49]:
# Import the data
# NOTE: pickle.load can execute arbitrary code; only load these files from a trusted source.

# Context managers make sure both file handles are closed once loading finishes
with open('onetsoccode.p', 'rb') as soc_file:
    data = pickle.load(soc_file)
with open('lookuptable', 'rb') as lookup_file:
    jobtitles = pickle.load(lookup_file)

# Stop words
stopWords = stopwords.words('english')

In [50]:
# Convert to DF

# One frame per pickled object; the job-title lookup is transposed before use
df = pd.DataFrame(data)
df2 = pd.DataFrame(jobtitles).T

# Left join on the shared 'soc' column so every row of df is kept
data = df.merge(df2, how='left', on='soc')

In [51]:
# Function to preprocess the text 

def preprocessText(txt, stopwords):
    """Tokenize ``txt`` with gensim's ``simple_preprocess`` and drop stop words.

    Parameters
    ----------
    txt : str
        Raw text to tokenize.
    stopwords : iterable of str
        Words to remove from the token list. (Inside this function the
        parameter shadows any module-level name ``stopwords``.)

    Returns
    -------
    list of str
        The tokens produced by ``gensim.utils.simple_preprocess`` that are
        not in ``stopwords``, in their original order.
    """
    # Build the lookup set once so each membership test is O(1)
    # instead of a scan over the whole stop-word list
    stop_set = frozenset(stopwords)
    return [word for word in gensim.utils.simple_preprocess(txt) if word not in stop_set]
    

In [52]:
# Preprocess

# Tokenize each row's 'text' field and strip the stop words
data['processed'] = data.apply(
    lambda row: preprocessText(row['text'], stopWords),
    axis=1,
)

In [53]:
# import modules & set up logging

# Emit INFO-and-above messages as "<time> : <level> : <message>"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [10]:
# Pretrained vectors from google
### https://code.google.com/archive/p/word2vec/

# Location of the binary word2vec file on the local machine
google_vectors_path = '/home/craig/Documents/googledata/GoogleNews-vectors-negative300.bin'
word_vectors = KeyedVectors.load_word2vec_format(google_vectors_path, binary=True)

2017-11-04 07:22:09,125 : INFO : loading projection weights from /home/craig/Documents/googledata/GoogleNews-vectors-negative300.bin
2017-11-04 07:22:59,735 : INFO : loaded (3000000, 300) matrix from /home/craig/Documents/googledata/GoogleNews-vectors-negative300.bin


In [54]:
# Function to create a numpy array of the model word_vec 

def average_word_embedding(data, model, keyed = False):
    """Return the average word embedding of the tokens in ``data``.

    Parameters
    ----------
    data : iterable of str
        Tokens whose vectors are averaged. Tokens missing from the
        vocabulary are skipped.
    model : object
        With ``keyed=True``: an object supporting ``word in model`` and
        ``model.word_vec(word)``. Otherwise: an object whose ``wv``
        attribute supports ``word in model.wv`` and ``model.wv[word]``.
    keyed : bool, default False
        Selects which of the two lookup styles above is used.

    Returns
    -------
    pandas.Series
        One entry per embedding dimension, labelled ``'d0'``, ``'d1'``, ...
        in that order. When none of the tokens is in the vocabulary the
        series is all-NaN, with length taken from the lookup object's
        ``vector_size`` attribute (empty if that attribute is absent).
    """
    # Resolve the vocabulary object and the accessor once, outside the loop.
    # Membership is tested on the same object the vectors are read from.
    if keyed:
        vocab = model
        find_vector = model.word_vec
    else:
        vocab = model.wv
        find_vector = vocab.__getitem__

    # Collect the vectors of every in-vocabulary token
    vectors = [find_vector(word) for word in data if word in vocab]

    if vectors:
        avg = np.mean(np.array(vectors), axis=0)
    else:
        # Nothing to average: return NaNs instead of failing on a scalar mean
        avg = np.full(getattr(vocab, 'vector_size', 0), np.nan)

    # Explicit index keeps the labels in d0, d1, ... order
    labels = ['d' + str(idx) for idx in range(len(avg))]
    return pd.Series(avg, index=labels)

In [55]:
# Create the avg word vectors

# Column labels d0 ... d299
variables = ['d{}'.format(dim) for dim in range(300)]

# Average the pretrained vectors of each row's processed tokens
data[variables] = data.apply(
    lambda job: average_word_embedding(job['processed'], word_vectors, keyed=True),
    axis=1,
)

# Make a function that finds the most and least similar jobs

In [57]:
def cosine_similarity(target_vector, row):
    """Cosine similarity between ``target_vector`` and the embedding in ``row``.

    Parameters
    ----------
    target_vector : array-like of numbers
        Reference embedding.
    row : pandas.Series
        A row whose entries from label ``'d0'`` onward hold the embedding
        to compare; everything before ``'d0'`` is ignored.

    Returns
    -------
    pandas.Series
        A single-element series holding the similarity. The value is NaN
        when either vector has zero length (norm of 0) or contains NaN.
    """
    # Cast explicitly: a row sliced from a mixed-type frame has object dtype
    job1 = np.asarray(target_vector, dtype=float)
    job2 = np.asarray(row.loc['d0':].values, dtype=float)

    numerator = np.dot(job1, job2)
    denominator = np.sqrt(np.dot(job1, job1)) * np.sqrt(np.dot(job2, job2))

    # The similarity is undefined for a zero vector; report NaN rather than divide by zero
    if denominator == 0:
        return pd.Series([np.nan])
    return pd.Series([numerator / denominator])

In [58]:
def find_jobs(data, soc, topN, bottomN):
    """ Given the ONET job DF and a SOC code find the top and bottom similar jobs

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'soc'`` column, a ``'title'`` column, and the
        embedding columns starting at ``'d0'`` and running to the last column.
    soc : str
        SOC code of the target job.
    topN : int
        Number of most similar jobs to print.
    bottomN : int
        Number of least similar jobs to print.

    Returns
    -------
    None
        The results are printed.

    Raises
    ------
    IndexError
        If no row of ``data`` has the requested SOC code.
    """
    # Split the frame into the target job and all other jobs
    is_target = data['soc'] == soc
    target = data[is_target]
    if target.empty:
        raise IndexError("No job with SOC code {0!r} found in data".format(soc))
    df = data[~is_target]

    # Embedding of the (first) matching target row
    target_vector = target.loc[:, 'd0':].values[0]

    # Similarity of every other job to the target
    s = df.apply(lambda x: cosine_similarity(target_vector, x), axis = 1)

    # Rank from most to least similar. Rows with an undefined (NaN) similarity
    # are dropped first; otherwise they sort last and would be reported as the
    # "least similar" jobs.
    ranked = (
        df.assign(similarity = s)
          .dropna(subset = ['similarity'])
          .sort_values(by = 'similarity', ascending = False)
    )

    top = ranked['title'].head(topN).tolist()
    bottom = ranked['title'].tail(bottomN).tolist()

    # Print results
    print("For the job of {0}...".format(target['title'].values[0]))

    # Top
    print("The most similiar jobs are...")
    for title in top:
        print("\t {0}".format(title))

    # Bottom
    print("The least similar jobs are...")
    for title in bottom:
        print('\t {0}'.format(title))
        

In [59]:
# Let's look at singers
find_jobs(data, soc='27-2042.01', topN=5, bottomN=5)

For the job of Singers...
The most similiar jobs are...
	 Music Directors
	 Choreographers
	 Public Address System and Other Announcers
	 Actors
	 Talent Directors
The least similar jobs are...
	 Methane/Landfill Gas Generation System Technicians
	 Green Marketers
	 Methane/Landfill Gas Collection System Operators
	 Fuel Cell Technicians
	 Data Warehousing Specialists


In [21]:
# Now let's look at a mechanical job
find_jobs(data, soc='49-3023.01', topN=5, bottomN=5)

For the job of Automotive Master Mechanics...
The most similiar jobs are...
	 Bus and Truck Mechanics and Diesel Engine Specialists
	 Automotive Specialty Technicians
	 Mobile Heavy Equipment Mechanics, Except Engines
	 Outdoor Power Equipment and Other Small Engine Mechanics
	 Recreational Vehicle Service Technicians
The least similar jobs are...
	 Software Developers, Applications
	 Green Marketers
	 Data Warehousing Specialists
	 Investment Underwriters
	 Legislators


In [22]:
# Now let's look at I/O psychologists
find_jobs(data, soc='19-3032.00', topN=5, bottomN=5)

For the job of Industrial-Organizational Psychologists...
The most similiar jobs are...
	 Natural Sciences Managers
	 Sociologists
	 Logisticians
	 Social and Community Service Managers
	 Counseling Psychologists
The least similar jobs are...
	 Landscaping and Groundskeeping Workers
	 Bakers
	 Musical Instrument Repairers and Tuners
	 Helpers--Pipelayers, Plumbers, Pipefitters, and Steamfitters
	 Agricultural Equipment Operators


In [23]:
# Now let's look at computer programmers
find_jobs(data, soc='15-1131.00', topN=5, bottomN=5)

For the job of Computer Programmers...
The most similiar jobs are...
	 Software Developers, Applications
	 Software Developers, Systems Software
	 Software Quality Assurance Engineers and Testers
	 Computer Systems Engineers/Architects
	 Database Administrators
The least similar jobs are...
	 Landscaping and Groundskeeping Workers
	 Surgeons
	 Hunters and Trappers
	 Helpers--Pipelayers, Plumbers, Pipefitters, and Steamfitters
	 Agricultural Equipment Operators


In [24]:
# Now let's look at surgeons
find_jobs(data, soc='29-1067.00', topN=5, bottomN=5)

For the job of Surgeons...
The most similiar jobs are...
	 Surgical Technologists
	 Oral and Maxillofacial Surgeons
	 Obstetricians and Gynecologists
	 Surgical Assistants
	 Veterinarians
The least similar jobs are...
	 Software Developers, Applications
	 Green Marketers
	 Investment Underwriters
	 Legislators
	 Data Warehousing Specialists


# Roll my own model

In [31]:
# Train a word2vec model on the preprocessed token lists
mymodel = gensim.models.Word2Vec(
    data['processed'],
    size=300,
    window=5,
    iter=15,
    workers=4,
)

2017-11-04 07:36:54,181 : INFO : collecting all words and their counts
2017-11-04 07:36:54,183 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-11-04 07:36:54,350 : INFO : collected 24588 word types from a corpus of 666976 raw words and 974 sentences
2017-11-04 07:36:54,351 : INFO : Loading a fresh vocabulary
2017-11-04 07:36:54,378 : INFO : min_count=5 retains 7523 unique words (30% of original 24588, drops 17065)
2017-11-04 07:36:54,380 : INFO : min_count=5 leaves 639263 word corpus (95% of original 666976, drops 27713)
2017-11-04 07:36:54,410 : INFO : deleting the raw counts dictionary of 24588 items
2017-11-04 07:36:54,414 : INFO : sample=0.001 downsamples 46 most-common words
2017-11-04 07:36:54,414 : INFO : downsampling leaves estimated 584754 word corpus (91.5% of prior 639263)
2017-11-04 07:36:54,415 : INFO : estimated required memory for 7523 words and 300 dimensions: 21816700 bytes
2017-11-04 07:36:54,437 : INFO : resetting layer weights
2017-11

In [60]:
# Keep the columns up to and including 'processed'. Take an explicit copy so
# that adding embedding columns to model2 is independent of `data` and does
# not trigger SettingWithCopyWarning.
model2 = data.loc[:, :'processed'].copy()

In [61]:
# Column labels d0 ... d299
variables = ['d{}'.format(dim) for dim in range(300)]

# Average the self-trained vectors of each row's processed tokens
model2[variables] = model2.apply(
    lambda job: average_word_embedding(job['processed'], mymodel),
    axis=1,
)

In [62]:
# Surgeons again, this time with the self-trained embeddings
find_jobs(model2, soc='29-1067.00', topN=5, bottomN=5)

For the job of Surgeons...
The most similiar jobs are...
	 Veterinarians
	 Oral and Maxillofacial Surgeons
	 Surgical Technologists
	 Veterinary Technologists and Technicians
	 Acute Care Nurses
The least similar jobs are...
	 Methane/Landfill Gas Collection System Operators
	 Fuel Cell Technicians
	 Investment Underwriters
	 Data Warehousing Specialists
	 Green Marketers
