In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '4'

Using TensorFlow backend.


In [2]:
import gensim

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re

In [3]:
stop_words = set(stopwords.words('english'))

In [4]:
EMBEDDING_DIM = 300 # how big is each word vector
MAX_VOCAB_SIZE = 150000 # how many unique words to use (i.e num rows in embedding vector)
MAX_SEQUENCE_LENGTH = 200 # max number of words in a comment to use

#training params
batch_size = 256 
num_epochs = 2 

In [5]:
df1 = pd.read_csv('./data/data_data_scientist.csv')
df2 = pd.read_csv('./data/data_engineer.csv', nrows =25000)

In [6]:
df =pd.concat([df1, df2], sort=True)

In [7]:
df.reset_index(drop =True, inplace =True)

In [8]:
df.head()

Unnamed: 0,additional_information,job_ID,job_company,job_date,job_description,job_title,person_ID,skills
0,Core CompetenciesData Modeling & Visualization...,1,"TRILLO INC - San Francisco Bay Area, CA",January 2018 to Present,* Incorporate Machine Learning solutions to fa...,Data Scientist,1,"CLUSTERING (Less than 1 year), DATA MINING (Le..."
1,TECHNICAL EXPERTISEStatistics/Machine Learning...,1,"Pepsico - Plano, TX",September 2018 to Present,Fleet Analysis: Exploring the Truck repair and...,Data Scientist,2,"Hadoop (Less than 1 year), PYTHON (Less than 1..."
2,TECHNICAL EXPERTISEStatistics/Machine Learning...,2,"CISCO - San Jose, CA",May 2018 to July 2018,Developed Python scripts to automate the load...,Data Scientist,2,"Hadoop (Less than 1 year), PYTHON (Less than 1..."
3,TECHNICAL EXPERTISEStatistics/Machine Learning...,3,"PHOTON INFOTECH - Bohemia, NY",September 2017 to March 2018,Developed a personalized recommender engine fo...,Data Scientist,2,"Hadoop (Less than 1 year), PYTHON (Less than 1..."
4,TECHNICAL EXPERTISEStatistics/Machine Learning...,4,"APPLE INC - Cupertino, CA",January 2016 to August 2017,Played a key role in developing and maintainin...,Data Scientist,2,"Hadoop (Less than 1 year), PYTHON (Less than 1..."


In [9]:
def find_data_scientist(title_value):
    target = 'data scientist'
    if type(title_value) == float:
        return -1
    elif type(title_value) ==str:
        if title_value.lower().find(target) >=0:
            return 1
        else:
            return 0

In [10]:
is_data_scientist = df.job_title.map(find_data_scientist)

In [11]:
sum(is_data_scientist==-1)

1858

In [12]:
def find_data_scientist_binary(title_value):
    target = 'data scientist'
    if type(title_value) == float:
        return 0
    elif type(title_value) ==str:
        if title_value.lower().find(target) >=0:
            return 1
        else:
            return 0

In [13]:
is_data_scientist = df.job_title.map(find_data_scientist_binary)

In [14]:
sum(is_data_scientist==-1)

0

### Add is_data_scientist column to indicate whether the title has data scientist

In [15]:
df['is_data_scientist']=is_data_scientist
df_new = df.dropna(subset =['job_description'])
df_new.reset_index(drop =True, inplace =True)

In [16]:
def standardize_text(df, text_field, job_label):
    text_first =df[text_field].map(lambda x: x.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "))
    text_second = text_first.map(lambda x: x.lower())
    df_copy = pd.concat([text_second, df[job_label]], axis =1)
    return df_copy

In [17]:
train_comments =standardize_text(df_new, 'job_description', 'is_data_scientist')
train_comments.head()

Unnamed: 0,job_description,is_data_scientist
0,* incorporate machine learning solutions to fa...,1
1,fleet analysis: exploring the truck repair and...,1
2,developed python scripts to automate the load...,1
3,developed a personalized recommender engine fo...,1
4,played a key role in developing and maintainin...,1


In [18]:
tokenizer = RegexpTokenizer(r'\w+')
train_comments['job_description'] = train_comments['job_description'].astype('str') 
#clean_train_comments.dtypes
train_comments["tokens"] = train_comments["job_description"].apply(tokenizer.tokenize)
# delete Stop Words
train_comments["tokens"] = train_comments["tokens"].apply(lambda vec: [word for word in vec if word not in stop_words])
   
train_comments.head()

Unnamed: 0,job_description,is_data_scientist,tokens
0,* incorporate machine learning solutions to fa...,1,"[incorporate, machine, learning, solutions, fa..."
1,fleet analysis: exploring the truck repair and...,1,"[fleet, analysis, exploring, truck, repair, ma..."
2,developed python scripts to automate the load...,1,"[developed, python, scripts, automate, loading..."
3,developed a personalized recommender engine fo...,1,"[developed, personalized, recommender, engine,..."
4,played a key role in developing and maintainin...,1,"[played, key, role, developing, maintaining, s..."


In [19]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [20]:
all_training_words = [word for tokens in train_comments["tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in train_comments["tokens"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

3928510 words total, with a vocabulary size of 94918
Max sentence length is 2846


In [21]:
MAX_VOCAB_SIZE = 94918

In [22]:
word2vec_path = "./data/GoogleNews-vectors-negative300.bin.gz"
#Due to memory limit, load first frequent 300000 words
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True, limit=300000)

In [23]:
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(train_comments["job_description"].tolist())
training_sequences = tokenizer.texts_to_sequences(train_comments["job_description"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

train_GRU_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)


#Why has shape len(train_word_index)+1
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

Found 106070 unique tokens.
(106071, 300)


In [24]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [25]:
def get_model_GRU():
    inp = Input(shape=(MAX_SEQUENCE_LENGTH, ))
    x = Embedding(len(train_word_index)+1, EMBEDDING_DIM, weights=[train_embedding_weights])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(1, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [26]:
model_GRU = get_model_GRU()

In [27]:
x_train = train_GRU_data
y_train = train_comments['is_data_scientist'].values

In [28]:
train_comments['is_data_scientist'].unique()

array([1, 0])

In [29]:
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

In [30]:
hist = model_GRU.fit(X_tra, y_tra, batch_size=batch_size, epochs=num_epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)

Train on 44434 samples, validate on 2339 samples
Epoch 1/2
 - 768s - loss: 0.2665 - acc: 0.8907 - val_loss: 0.2006 - val_acc: 0.9124

 ROC-AUC - epoch: 1 - score: 0.946939 

Epoch 2/2
 - 747s - loss: 0.1732 - acc: 0.9255 - val_loss: 0.1872 - val_acc: 0.9201

 ROC-AUC - epoch: 2 - score: 0.954993 



In [32]:
model_GRU.save('./data/GRU_model.h5')

###  Now try LSTM model

In [33]:
from keras.layers import LSTM

In [34]:
def get_model_LSTM():
    inp = Input(shape=(MAX_SEQUENCE_LENGTH, ))
    x = Embedding(len(train_word_index)+1, EMBEDDING_DIM, weights=[train_embedding_weights])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(LSTM(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(1, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [35]:
model_LSTM = get_model_LSTM()

In [36]:
hist_LSTM = model_LSTM.fit(X_tra, y_tra, batch_size=batch_size, epochs=num_epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)

Train on 44434 samples, validate on 2339 samples
Epoch 1/2
 - 684s - loss: 0.2622 - acc: 0.8933 - val_loss: 0.2233 - val_acc: 0.9085

 ROC-AUC - epoch: 1 - score: 0.946409 

Epoch 2/2
 - 683s - loss: 0.1748 - acc: 0.9251 - val_loss: 0.1877 - val_acc: 0.9213

 ROC-AUC - epoch: 2 - score: 0.953895 



In [37]:
model_LSTM.save('./data/LSTM_model.h5')