In [3]:
import logging
import os
import pickle

import gensim
import numpy as np
import pandas as pd
from gensim.models.keyedvectors import KeyedVectors
from nltk.corpus import stopwords

from functions.word2vec import preprocessText, average_word_embedding, \
     cosine_similarity, find_jobs, listcomparision, model_comparison

In [2]:
# Configure the root logger at INFO level with a timestamped format so that
# gensim's progress messages are shown while models load and train.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Import O*NET Data

In [3]:
# Load the two pickled inputs: 'onetsoccode.p' into `data` and 'lookuptable'
# into `jobtitles`. The context managers close each file handle as soon as
# it has been read instead of leaving it open for the life of the kernel.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load files that come from a trusted source.
with open('onetsoccode.p', 'rb') as onet_file:
    data = pickle.load(onet_file)
with open('lookuptable', 'rb') as lookup_file:
    jobtitles = pickle.load(lookup_file)

In [4]:
# Turn both pickled objects into DataFrames; the job-title lookup is
# transposed after construction.
df = pd.DataFrame(data)
df2 = pd.DataFrame(jobtitles).transpose()

# Left-join on the shared 'soc' column. Note that `data` is rebound here from
# the raw pickled object to the merged DataFrame.
data = df.merge(df2, how='left', left_on='soc', right_on='soc')

In [5]:
# English stop-word list from NLTK; it is handed to preprocessText for every row
stopWords = stopwords.words('english')

# Run preprocessText on each row's 'text' value and store the result in a
# new 'processed' column.
data['processed'] = data.apply(
    lambda row: preprocessText(row['text'], stopWords),
    axis='columns',
)

# Model creation 

Create two different models for comparison: the first uses Google's pretrained model, and the second is trained from scratch on the O*NET data.

In [6]:
# Pretrained vectors from google (https://code.google.com/archive/p/word2vec/)
# The location of the binary can be overridden with the GOOGLE_WORD2VEC_PATH
# environment variable; the previous local path is kept as the default so
# existing setups behave exactly as before.
GOOGLE_WORD2VEC_PATH = os.environ.get(
    'GOOGLE_WORD2VEC_PATH',
    '/home/craig/Documents/googledata/GoogleNews-vectors-negative300.bin',
)
googlemodel = KeyedVectors.load_word2vec_format(GOOGLE_WORD2VEC_PATH, binary=True)

2017-11-04 09:53:42,466 : INFO : loading projection weights from /home/craig/Documents/googledata/GoogleNews-vectors-negative300.bin
2017-11-04 09:54:27,298 : INFO : loaded (3000000, 300) matrix from /home/craig/Documents/googledata/GoogleNews-vectors-negative300.bin


In [7]:
# Train a Word2Vec model on the 'processed' column and save it to disk so it
# can be reloaded later.
# `size` and `iter` are the keyword names of the gensim release used here
# (gensim 4 renamed them to `vector_size` and `epochs`).
onetmodel = gensim.models.Word2Vec(
    data['processed'],
    size=300,
    window=5,
    iter=15,
    workers=4,
)
onetmodel.save("onetmodel.model")

2017-11-04 09:54:27,304 : INFO : collecting all words and their counts
2017-11-04 09:54:27,316 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-11-04 09:54:27,529 : INFO : collected 24588 word types from a corpus of 666976 raw words and 974 sentences
2017-11-04 09:54:27,530 : INFO : Loading a fresh vocabulary
2017-11-04 09:54:27,561 : INFO : min_count=5 retains 7523 unique words (30% of original 24588, drops 17065)
2017-11-04 09:54:27,562 : INFO : min_count=5 leaves 639263 word corpus (95% of original 666976, drops 27713)
2017-11-04 09:54:27,586 : INFO : deleting the raw counts dictionary of 24588 items
2017-11-04 09:54:27,588 : INFO : sample=0.001 downsamples 46 most-common words
2017-11-04 09:54:27,589 : INFO : downsampling leaves estimated 584754 word corpus (91.5% of prior 639263)
2017-11-04 09:54:27,590 : INFO : estimated required memory for 7523 words and 300 dimensions: 21816700 bytes
2017-11-04 09:54:27,614 : INFO : resetting layer weights
2017-11

# Create average word embeddings 


Now we create two datasets, one per model: `googledata` is built with the Google model and `onetdata` is built with the O*NET model.

In [8]:
# Column labels d0 ... d299, one per dimension of the 300-dimensional vectors
variables = ['d{}'.format(i) for i in range(300)]

# Google frame: a copy of `data` extended with the embedding columns computed
# from the pretrained Google vectors. The extra True argument is only passed
# for this model; its meaning is defined by average_word_embedding.
googledata = data.copy()
googledata[variables] = googledata.apply(
    lambda row: average_word_embedding(row['processed'], googlemodel, True),
    axis='columns',
)

# O*NET frame: the same construction using the model trained on O*NET text
onetdata = data.copy()
onetdata[variables] = onetdata.apply(
    lambda row: average_word_embedding(row['processed'], onetmodel),
    axis='columns',
)

In [9]:
# Write both embedding frames to disk so later sessions can reload them
for frame, pickle_path in ((googledata, "googledata.p"), (onetdata, "onetdata.p")):
    frame.to_pickle(pickle_path)

# Let's run some examples to see what jobs are similar using both models

In [10]:
# Singers (SOC 27-2042.01): list the most and least similar jobs under each model
soc_code = '27-2042.01'
print("***************\nGoogle data\n***************")
find_jobs(googledata, soc_code, 5, 5)
print("\n***************\nO*NET data\n***************")
find_jobs(onetdata, soc_code, 5, 5)

***************
Google data
***************
For the job of Singers...
The most similiar jobs are...
	 Music Directors
	 Choreographers
	 Public Address System and Other Announcers
	 Actors
	 Talent Directors
The least similar jobs are...
	 Methane/Landfill Gas Generation System Technicians
	 Green Marketers
	 Methane/Landfill Gas Collection System Operators
	 Fuel Cell Technicians
	 Data Warehousing Specialists

***************
O*NET data
***************
For the job of Singers...
The most similiar jobs are...
	 Actors
	 Broadcast News Analysts
	 Public Address System and Other Announcers
	 Talent Directors
	 Music Directors
The least similar jobs are...
	 Green Marketers
	 Data Warehousing Specialists
	 Methane/Landfill Gas Collection System Operators
	 Methane/Landfill Gas Generation System Technicians
	 Fuel Cell Technicians


In [11]:
# Automotive Master Mechanics (SOC 49-3023.01): list the most and least
# similar jobs under each model
soc_code = '49-3023.01'
print("***************\nGoogle data\n***************")
find_jobs(googledata, soc_code, 5, 5)
print("\n***************\nO*NET data\n***************")
find_jobs(onetdata, soc_code, 5, 5)

***************
Google data
***************
For the job of Automotive Master Mechanics...
The most similiar jobs are...
	 Bus and Truck Mechanics and Diesel Engine Specialists
	 Automotive Specialty Technicians
	 Mobile Heavy Equipment Mechanics, Except Engines
	 Outdoor Power Equipment and Other Small Engine Mechanics
	 Recreational Vehicle Service Technicians
The least similar jobs are...
	 Software Developers, Applications
	 Green Marketers
	 Data Warehousing Specialists
	 Investment Underwriters
	 Legislators

***************
O*NET data
***************
For the job of Automotive Master Mechanics...
The most similiar jobs are...
	 Automotive Specialty Technicians
	 Recreational Vehicle Service Technicians
	 Mobile Heavy Equipment Mechanics, Except Engines
	 Bus and Truck Mechanics and Diesel Engine Specialists
	 Farm Equipment Mechanics and Service Technicians
The least similar jobs are...
	 Data Warehousing Specialists
	 Special Education Teachers, Preschool
	 Green Marketers
	 Inve

In [12]:
# Industrial-Organizational Psychologists (SOC 19-3032.00): list the most and
# least similar jobs under each model
soc_code = '19-3032.00'
print("***************\nGoogle data\n***************")
find_jobs(googledata, soc_code, 5, 5)
print("\n***************\nO*NET data\n***************")
find_jobs(onetdata, soc_code, 5, 5)

***************
Google data
***************
For the job of Industrial-Organizational Psychologists...
The most similiar jobs are...
	 Natural Sciences Managers
	 Sociologists
	 Logisticians
	 Social and Community Service Managers
	 Counseling Psychologists
The least similar jobs are...
	 Landscaping and Groundskeeping Workers
	 Bakers
	 Musical Instrument Repairers and Tuners
	 Helpers--Pipelayers, Plumbers, Pipefitters, and Steamfitters
	 Agricultural Equipment Operators

***************
O*NET data
***************
For the job of Industrial-Organizational Psychologists...
The most similiar jobs are...
	 Chief Sustainability Officers
	 Natural Sciences Managers
	 Financial Quantitative Analysts
	 Risk Management Specialists
	 Education Administrators, Preschool and Childcare Center/Program
The least similar jobs are...
	 Craft Artists
	 Fuel Cell Technicians
	 Helpers--Pipelayers, Plumbers, Pipefitters, and Steamfitters
	 Musical Instrument Repairers and Tuners
	 Methane/Landfill Gas Ge

In [13]:
# Computer Programmers (SOC 15-1131.00): list the most and least similar jobs
# under each model
soc_code = '15-1131.00'
print("***************\nGoogle data\n***************")
find_jobs(googledata, soc_code, 5, 5)
print("\n***************\nO*NET data\n***************")
find_jobs(onetdata, soc_code, 5, 5)

***************
Google data
***************
For the job of Computer Programmers...
The most similiar jobs are...
	 Software Developers, Applications
	 Software Developers, Systems Software
	 Software Quality Assurance Engineers and Testers
	 Computer Systems Engineers/Architects
	 Database Administrators
The least similar jobs are...
	 Landscaping and Groundskeeping Workers
	 Surgeons
	 Hunters and Trappers
	 Helpers--Pipelayers, Plumbers, Pipefitters, and Steamfitters
	 Agricultural Equipment Operators

***************
O*NET data
***************
For the job of Computer Programmers...
The most similiar jobs are...
	 Software Developers, Applications
	 Computer Systems Engineers/Architects
	 Software Developers, Systems Software
	 Database Administrators
	 Software Quality Assurance Engineers and Testers
The least similar jobs are...
	 Welders, Cutters, and Welder Fitters
	 Glaziers
	 Solderers and Brazers
	 Musical Instrument Repairers and Tuners
	 Helpers--Pipelayers, Plumbers, Pipefi