In [1]:
import pickle, gensim, logging, os
import pandas as pd
import numpy as np
from nltk.corpus import stopwords 
from gensim.models.keyedvectors import KeyedVectors
from functions.word2vec import preprocessText, average_word_embedding, \
     cosine_similarity, find_jobs, listcomparision, model_comparison

In [2]:
# Configure root logging at INFO level so gensim's progress messages
# (model loading / training) are shown in the notebook output.
log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)

# Import O*NET Data

In [3]:
# Load the O*NET data from its pickle. The file is opened in a context manager
# so the handle is closed as soon as the load finishes.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open('onetdata.p', 'rb') as onet_file:
    df = pickle.load(onet_file)

In [4]:
# Preprocess the text data
# Each entry of the 'text' column is passed through preprocessText together with
# the English stop-word list; the result is stored in a new 'processed' column.
stopWords = stopwords.words('english')
# Apply over the single column instead of row-wise over the whole frame: only
# 'text' is needed, and Series.apply always yields exactly one value per row.
df['processed'] = df['text'].apply(lambda text: preprocessText(text, stopWords))

# Model creation 

Create two different models for comparison: the first uses Google's pretrained model, and the second is trained from scratch on the O*NET data.

In [5]:
# Pretrained vectors from google (https://code.google.com/archive/p/word2vec/)
# The file location can be overridden with the GOOGLE_W2V_PATH environment
# variable so the notebook can run on other machines; when the variable is not
# set, the original path is used.
google_vectors_path = os.environ.get(
    'GOOGLE_W2V_PATH',
    '/home/craig/Documents/googledata/GoogleNews-vectors-negative300.bin',
)
googlemodel = KeyedVectors.load_word2vec_format(google_vectors_path, binary=True)

2017-11-18 21:19:14,594 : INFO : loading projection weights from /home/craig/Documents/googledata/GoogleNews-vectors-negative300.bin
2017-11-18 21:20:06,553 : INFO : loaded (3000000, 300) matrix from /home/craig/Documents/googledata/GoogleNews-vectors-negative300.bin


In [6]:
# Obtain the O*NET word2vec model: train and save it on the first run,
# otherwise reuse the copy that is already on disk.
onet_model_file = "onetmodel.model"

if not os.path.isfile(onet_model_file):
    # No saved model yet: train on the preprocessed text and save it for later runs.
    onetmodel = gensim.models.Word2Vec(
        df.processed,
        size=300,
        window=5,
        iter=15,
        workers=4,
    )
    onetmodel.save(onet_model_file)
else:
    print("Found O*NET model....")
    onetmodel = gensim.models.Word2Vec.load(onet_model_file)

2017-11-18 21:20:06,560 : INFO : loading Word2Vec object from onetmodel.model


Found O*NET model....


2017-11-18 21:20:06,778 : INFO : loading wv recursively from onetmodel.model.wv.* with mmap=None
2017-11-18 21:20:06,780 : INFO : setting ignored attribute syn0norm to None
2017-11-18 21:20:06,780 : INFO : setting ignored attribute cum_table to None
2017-11-18 21:20:06,781 : INFO : loaded onetmodel.model


# Create average word embeddings 


Now we create two datasets using the two different models. The first dataset (googledata) uses the Google model and the second (onetdata) uses the O*NET model.

In [7]:
# Column names d0 ... d299 that receive the embedding values.
variables = ['d{}'.format(i) for i in range(300)]


def _with_embeddings(frame, *embedding_args):
    """Return a copy of `frame` with the `variables` columns filled in.

    Each row's 'processed' value is passed to average_word_embedding followed
    by `embedding_args` (the model, plus any extra positional argument).
    """
    embedded = frame.copy()
    embedded[variables] = embedded.apply(
        lambda row: average_word_embedding(row['processed'], *embedding_args),
        axis=1,
    )
    return embedded


# Data frame built with the Google model (called with the extra True argument).
googledata = _with_embeddings(df, googlemodel, True)

# Data frame built with the O*NET model.
onetdata = _with_embeddings(df, onetmodel)

In [8]:
# Persist each embedding frame for later use; a pickle that already exists on
# disk is left untouched.
# NOTE(review): 'onetdata.p' is also the file the input data is loaded from
# earlier in this notebook, so it presumably already exists when this cell runs
# and onetdata is then never written -- confirm whether a separate output
# filename was intended.
for frame, pickle_path in ((googledata, "googledata.p"), (onetdata, "onetdata.p")):
    if not os.path.isfile(pickle_path):
        frame.to_pickle(pickle_path)