In [1]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE


Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# read in preprocessed resume dataset
resume_df = pd.read_csv('../../data-science-for-good-careervillage/data/resume_dataset_preprocessed.csv')

# drop NA rows - some didn't get dropped in preprocessing, but come in as NA after writing to csv
resume_df = resume_df.dropna()

In [3]:
# get unique values of categories
Counter(resume_df['category'].tolist())

Counter({"['1Amy.docx']": 31,
         "['8Holly.docx']": 40,
         "['6Faye.docx']": 29,
         "['2Ben.docx']": 41,
         "['Hedge Fund Accountant JD.docx']": 14,
         "['9Ignatius.docx']": 69,
         "['3Carrie.docx']": 25,
         'Personal': 849,
         'Investment': 637,
         'Sales': 1112,
         'Compliance': 913,
         'Operations': 1396,
         'Audit': 103,
         'Corp Accounting, finance': 3680,
         'Cosec': 1941,
         'Investor Relations': 268,
         'Fund Accounting': 12592,
         'Trust': 1047,
         'Administration': 1143,
         'Random': 205,
         'Legal': 616})

In [4]:
# there were some uncategorized resumes in the dataset. filter them out
cats = ['Personal','Investment','Sales','Compliance','Operations','Audit','Corp Accounting, finance',
              'Cosec','Investor Relations','Fund Accounting','Trust','Administration','Random','Legal']

resume_df = resume_df[resume_df['category'].map(lambda x: x in cats)]

In [5]:
# extract list of categories and list of resume lines from resume_df
cat_list = resume_df['category'].tolist()
resume_list = resume_df['resume_line_pp'].tolist()

## Tokenize data and split into train/test

In [6]:
#test
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(resume_df['resume_line_pp'])


tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

train_x, test_x, train_y, test_y = train_test_split(x_train_tfidf, resume_df['category'], test_size=0.3)

In [7]:
train_x

<18551x10550 sparse matrix of type '<class 'numpy.float64'>'
	with 159479 stored elements in Compressed Sparse Row format>

## Try Upsampling Data (imbalanced classes)

In [58]:
print("Before OverSampling, counts of label 'Fund Accounting': {}".format(sum(train_y=='Fund Accounting')))
print("Before OverSampling, counts of label 'Cosec': {} \n".format(sum(train_y=='Cosec')))


Before OverSampling, counts of label 'Fund Accounting': 8788
Before OverSampling, counts of label 'Cosec': 1379 



In [60]:
sm = SMOTE(random_state=2)
train_x_res, train_y_res = sm.fit_sample(train_x, train_y)

In [61]:
print('After OverSampling, the shape of train_X: {}'.format(train_x_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(train_y_res.shape))

After OverSampling, the shape of train_X: (123088, 10550)
After OverSampling, the shape of train_y: (123088,) 



## Baseline Model Training and Results

In [62]:
# Naive Bayes Model
mnb = MultinomialNB().fit(train_x_res, train_y_res)
y_mnb = mnb.predict(test_x)
acc_mnb = accuracy_score(test_y, y_mnb)

# SVM Model
svm = SVC(kernel='linear').fit(train_x_res, train_y_res)
y_svm = svm.predict(test_x)
acc_svm = accuracy_score(test_y, y_svm)

# RandomForest Model
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=0).fit(train_x_res, train_y_res)
y_rf = rf.predict(test_x)
acc_rf = accuracy_score(test_y, y_rf)

In [63]:
print("Naive Bayes Acc: %.2f%%" % acc_mnb)
print("SVM Acc: %.2f%%" % acc_svm)
print("RF Acc: %.2f%%" % acc_rf)

Naive Bayes Acc: 0.29%
SVM Acc: 0.35%
RF Acc: 0.31%


In [64]:
# compare results of Naive Bayes, SVM with actuals
results_df = pd.DataFrame({'actual':test_y, 'naivebayes':y_mnb, 'svm':y_svm, 'rf':y_rf})

In [65]:
results_df.head(-50)

Unnamed: 0,actual,naivebayes,svm,rf
19224,Fund Accounting,Fund Accounting,Fund Accounting,Fund Accounting
23093,Fund Accounting,Audit,Audit,Fund Accounting
1070,Personal,Trust,Compliance,Cosec
22926,Fund Accounting,"Corp Accounting, finance",Cosec,Fund Accounting
8078,"Corp Accounting, finance","Corp Accounting, finance","Corp Accounting, finance","Corp Accounting, finance"
18852,Fund Accounting,Investment,Fund Accounting,Investment
5587,"Corp Accounting, finance","Corp Accounting, finance","Corp Accounting, finance",Legal
1073,Personal,Operations,Operations,Sales
1841,Sales,Investment,Fund Accounting,Random
22934,Fund Accounting,Investment,Fund Accounting,Investment


## Deep Learning Model Train and Results

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

Using TensorFlow backend.
  return f(*args, **kwds)


In [12]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 5000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 30
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(resume_df['resume_line_pp'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 10580 unique tokens.


In [13]:
X = tokenizer.texts_to_sequences(resume_df['resume_line_pp'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (26502, 30)


In [14]:
Y = pd.get_dummies(resume_df['category']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (26502, 14)


In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)


(23851, 30) (23851, 14)
(2651, 30) (2651, 14)


In [22]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(13, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

UnboundLocalError: local variable 'a' referenced before assignment

In [20]:
epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])


ValueError: Error when checking target: expected dense_2 to have 3 dimensions, but got array with shape (23851, 14)

In [None]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))