In [1]:
import pandas as pd
import re
import datetime
import numpy as np
import matplotlib.pyplot as plt
import string 
import warnings
warnings.simplefilter('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import recall_score, roc_auc_score, accuracy_score, plot_confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from regressors import stats
from sklearn.feature_selection import chi2

Using TensorFlow backend.


In [2]:
def show_scores(model,
                X_train, y_train,
                X_valid, y_valid):
    """
    Print the misclassification rate of a fitted model on two data splits.

    The error is reported as ``1 - model.score(X, y)``, rounded to two
    decimals, first for the training split and then for the validation split.

    Parameters
    ----------
    model: sklearn classifier model
        A fitted estimator exposing ``score(X, y)``.
    X_train: numpy.ndarray
        Feature matrix of the training split.
    y_train: numpy.ndarray
        Labels of the training split.
    X_valid: numpy.ndarray
        Feature matrix of the validation split.
    y_valid: numpy.ndarray
        Labels of the validation split.

    Returns
    -------
        None
        The two error values are written to standard output.
    """
    # Each entry pairs the exact output template with the split it reports on;
    # scoring and printing stay interleaved, training split first.
    reports = (
        ("Training error:   %.2f", X_train, y_train),
        ("Validation error: %.2f", X_valid, y_valid),
    )
    for template, features, labels in reports:
        print(template % (1 - model.score(features, labels)))

In [8]:
def show_sig(num):
    """
    Map a p-value to the conventional significance stars.

    Returns "***" below 0.001, "**" below 0.01, "*" below 0.05 and an empty
    string otherwise. All comparisons are strict, so a value that fails every
    ``<`` test (including NaN) yields "".
    """
    # Cutoffs are checked from strictest to loosest; the first match wins.
    for cutoff, stars in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if num < cutoff:
            return stars
    return ""

In [28]:
# Load the raw inputs (paths are relative to the notebook's working directory).
# All five tables carry an `employee_code` column, which the next cell uses as
# the join key.
train_original = pd.read_csv("../data/train_dataset.csv")
test_original = pd.read_csv("../data/test_dataset.csv")
# Cleaned resume text. Presumably one row per employee -- TODO confirm, because
# duplicate employee_code values would multiply rows in the left joins below.
resume_original = pd.read_csv("../data/05182020_cleaned_english_resumes_V1.0.csv")
# Tables read from ../result (presumably produced by an earlier analysis step).
avg_work = pd.read_csv("../result/avg_work_exp.csv")
competitor = pd.read_csv("../result/competitor_experience.csv")

In [29]:
# Build the modelling frames. Every labelled row is left-joined to its resume on
# employee_code; the training frame is additionally joined to the avg_work and
# competitor tables (presumably the sources of job_hopper and
# competitor_experience -- verify against those files).
train = (
    train_original
    .merge(resume_original, how="left", on="employee_code")
    .merge(avg_work, how="left", on="employee_code")
    .merge(competitor, how="left", on="employee_code")
)[[
    "employee_code", "raw_resume", "resume_text", "resume_bline",
    "clean_text", "job_hopper", "competitor_experience", "hp_class",
]]

test = (
    test_original
    .merge(resume_original, how="left", on="employee_code")
)[[
    "employee_code", "raw_resume", "resume_text", "resume_bline",
    "clean_text", "hp_class",
]]

# Only the cleaned resume text is used as the model input; hp_class is the target.
X_train, y_train = train["clean_text"], train["hp_class"]
X_test, y_test = test["clean_text"], test["hp_class"]

In [30]:
# Bag-of-bigrams representation: ngram_range=(2, 2) extracts only two-word
# sequences, and max_features=5000 caps the vocabulary size.
vec = CountVectorizer(max_features=5000, ngram_range=(2, 2))
# Learn the vocabulary from the training text only, then reuse it for the test
# text so both matrices share the same columns.
X_train_counts = vec.fit_transform(X_train) 
X_test_counts = vec.transform(X_test)

In [31]:
# Baseline classifier on the bigram counts, using scikit-learn's default
# LogisticRegression settings.
lr = LogisticRegression()
# fit() is the cell's last expression, so the fitted estimator's repr is displayed.
lr.fit(X_train_counts, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
# Rank the bigram features of the count model by logistic-regression coefficient
# and attach a chi-squared p-value (feature vs. class label) to each.
vocab = vec.get_feature_names()
weights = lr.coef_.flatten()
scores, pvalues = chi2(X_train_counts, y_train)

# np.argsort sorts ascending: the most negative coefficients come first and the
# most positive ones last.
inds = np.argsort(weights)

# 20 most negative coefficients: feature name, weight and p-value per feature.
negative_words, neg_words_weights, neg_words_pvalues = (
    [column[i] for i in inds[:20]] for column in (vocab, weights, pvalues)
)

# 20 most positive coefficients, reversed so the largest weight is listed first.
positive_words, pos_words_weights, pos_words_pvalues = (
    [column[i] for i in inds[-20:][::-1]] for column in (vocab, weights, pvalues)
)

df = pd.DataFrame({
    'Neg feats': negative_words,
    'Neg weights': neg_words_weights,
    "Neg pvalues": neg_words_pvalues,
    'Pos feats': positive_words,
    'Pos weights': pos_words_weights,
    "Pos pvalues": pos_words_pvalues,
})
# Significance stars derived from the chi-squared p-values.
df["Neg sig."] = df["Neg pvalues"].apply(show_sig)
df["Pos sig."] = df["Pos pvalues"].apply(show_sig)

# Display with each significance column placed next to its p-value.
df[[
    'Neg feats', 'Neg weights', 'Neg pvalues', 'Neg sig.',
    'Pos feats', 'Pos weights', 'Pos pvalues', 'Pos sig.',
]]

Unnamed: 0,Neg feats,Neg weights,Neg pvalues,Neg sig.,Pos feats,Pos weights,Pos pvalues,Pos sig.
0,high school,-0.513144,0.014402,*,efficient service,0.440615,7.9e-05,***
1,sale associate,-0.467281,0.085937,,rogers communications,0.418416,0.005993,**
2,team player,-0.426519,0.21979,,work ethic,0.382055,0.248989,
3,health safety,-0.355103,0.019471,*,sale closing,0.373974,0.000416,***
4,excellent service,-0.307044,0.254905,,university fraser,0.357177,1.5e-05,***
5,cash register,-0.296875,0.045346,*,sale target,0.352404,0.041786,*
6,ms office,-0.295314,0.284395,,university england,0.336289,0.002239,**
7,food prep,-0.28978,0.060191,,marketing business,0.335721,0.413257,
8,project management,-0.272953,0.049651,*,support technician,0.328337,0.002239,**
9,time management,-0.261679,0.474262,,sale representative,0.325761,0.13021,


In [68]:
# Dense, labelled view of the training count matrix: one column per bigram in
# the fitted vocabulary, so individual features can be inspected by name.
X_train_count_df = pd.DataFrame(
    X_train_counts.toarray(),
    columns=vec.get_feature_names(),
)

In [78]:
# Spot-check the per-resume counts of a handful of bigrams (several of them
# appear in the coefficient table above).
# NOTE(review): "high school" is listed twice, so the result carries that column
# twice -- this looks unintentional; confirm before relying on this view.
X_train_count_df[["efficient service", "mobile expert", "high school", "information system", "cash register", "high school"]]

Unnamed: 0,efficient service,mobile expert,high school,information system,cash register,high school.1
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,1,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
283,0,0,1,0,0,1
284,0,0,1,0,0,1
285,0,0,0,0,1,0
286,0,0,0,0,0,0


In [63]:
# Display the dense form of the training count matrix for a quick visual check.
X_train_counts.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

Solution:
    1. Use forward feature selection.

In [59]:
# Same bigram configuration as the count-based vectorizer `vec`
# (max_features=5000, ngram_range=(2, 2)), but with TF-IDF weighting.
vec2 = TfidfVectorizer(max_features=5000, ngram_range=(2,2))
# Fit on the training text only; the test text is transformed with the fitted
# vocabulary and IDF weights.
X_train_tfidf = vec2.fit_transform(X_train) 
X_test_tfidf = vec2.transform(X_test)

In [60]:
# Refit the default logistic regression, this time on the TF-IDF features.
lr2 = LogisticRegression()
lr2.fit(X_train_tfidf, y_train)
# NOTE(review): a disabled call here scored lr2 against X_test_counts, which was
# produced by the CountVectorizer `vec` rather than by `vec2`. If scoring is
# re-enabled, evaluate on the TF-IDF test matrix instead:
# show_scores(lr2, X_train_tfidf, y_train, X_test_tfidf, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [61]:
# Rank the bigram features of the TF-IDF model by logistic-regression
# coefficient and attach a chi-squared p-value (feature vs. class label) to each.
vocab = vec2.get_feature_names()
weights = lr2.coef_.flatten()

# The p-values are computed on the raw-count matrix produced by `vec`, yet they
# are indexed below with column positions taken from the TF-IDF model. That is
# only valid while both vectorizers assign the same bigram to the same column,
# so fail loudly instead of silently reporting p-values for the wrong features.
if list(vec.get_feature_names()) != list(vocab):
    raise ValueError(
        "vec and vec2 vocabularies differ: chi2 p-values from X_train_counts "
        "cannot be matched to lr2 coefficients by column index"
    )
scores, pvalues = chi2(X_train_counts, y_train)

# np.argsort sorts ascending: the most negative coefficients come first and the
# most positive ones last.
inds = np.argsort(weights)

# 20 most negative coefficients: feature name, weight and p-value per feature.
negative_words, neg_words_weights, neg_words_pvalues = (
    [column[i] for i in inds[:20]] for column in (vocab, weights, pvalues)
)

# 20 most positive coefficients, reversed so the largest weight is listed first.
positive_words, pos_words_weights, pos_words_pvalues = (
    [column[i] for i in inds[-20:][::-1]] for column in (vocab, weights, pvalues)
)

df = pd.DataFrame({
    'Neg feats': negative_words,
    'Neg weights': neg_words_weights,
    "Neg pvalues": neg_words_pvalues,
    'Pos feats': positive_words,
    'Pos weights': pos_words_weights,
    "Pos pvalues": pos_words_pvalues,
})
# Significance stars derived from the chi-squared p-values.
df["Neg sig."] = df["Neg pvalues"].apply(show_sig)
df["Pos sig."] = df["Pos pvalues"].apply(show_sig)

# Display with each significance column placed next to its p-value.
df[[
    'Neg feats', 'Neg weights', 'Neg pvalues', 'Neg sig.',
    'Pos feats', 'Pos weights', 'Pos pvalues', 'Pos sig.',
]]

Unnamed: 0,Neg feats,Neg weights,Neg pvalues,Neg sig.,Pos feats,Pos weights,Pos pvalues,Pos sig.
0,high school,-0.453811,0.014402,*,university fraser,0.489146,1.5e-05,***
1,sale associate,-0.38712,0.085937,,sale target,0.435773,0.041786,*
2,cash register,-0.307305,0.045346,*,efficient service,0.43528,7.9e-05,***
3,health safety,-0.305364,0.019471,*,experience company,0.419116,7.9e-05,***
4,associate cashier,-0.286725,0.049651,*,sales manager,0.369597,0.010504,*
5,excellent service,-0.253886,0.254905,,sale closing,0.361915,0.000416,***
6,human resource,-0.245584,0.028188,*,product knowledge,0.361513,0.006649,**
7,food prep,-0.238484,0.060191,,rogers communications,0.340873,0.005993,**
8,mohawk college,-0.234491,0.060191,,university england,0.330333,0.002239,**
9,fast paced,-0.230238,0.052972,,new student,0.32599,7.9e-05,***
