In [1]:
import sys
sys.path.append('../app/')

In [2]:
from pymongo import MongoClient
import pandas as pd
from feature_extractor import get_ohe_features, get_portuguese_stop_words
import os

In [3]:
from sklearn.feature_extraction import text
# Combined English + Portuguese stop words for the TF-IDF vectorizers below.
# Materialised as a list: recent scikit-learn releases validate `stop_words`
# and accept only 'english', a list, or None, while older releases accept a
# list just as well as a frozenset.
stop_words = list(text.ENGLISH_STOP_WORDS.union(get_portuguese_stop_words()))

In [5]:
# The connection string is read from the environment; .get() yields None when
# MONGO_URI is not set, and that value is handed to MongoClient unchanged.
mongo_uri = os.environ.get('MONGO_URI')
client = MongoClient(mongo_uri)
db = client['linkedin']
collection = db['jobIds']

In [6]:
# Pull every document of the collection (empty filter) into one DataFrame.
job_cursor = collection.find({})
df = pd.DataFrame(list(job_cursor))

In [7]:
# Running list of column names; later cells append to it as they add feature groups.
feature_names = list()

In [8]:
from sklearn.model_selection import train_test_split

# Length of each posting's description (len() applied element-wise).
df['Description Length'] = df['description'].map(len)

# Rows with a label feed training/validation; rows without one become the test set.
has_label = df['label'].notna()
df_train_val = df[has_label]
df_test = df[~has_label]
X_test = df_test

X = df_train_val.drop(columns='label')
y = df_train_val[['label']]

usedFeatures = ['applicants', 'applicants_per_day', 'Description Length']

# Stratified 75/25 split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=777
)

# Each list starts with the numeric columns; later cells append more blocks.
final_features_train, final_features_val, final_features_test = (
    [split[usedFeatures]] for split in (X_train, X_val, X_test)
)

# Text-vectorizer outputs are collected separately, one list per split.
final_vects_train, final_vects_val, final_vects_test = [], [], []

feature_names.extend(usedFeatures)

In [9]:
from feature_extractor import all_columns_dict
categorical_fields = ['Seniority level', 'Employment type', 'Job function', 'Industries']

# For every categorical field: record its column names, then append the
# get_ohe_features() output for the train, validation and test frames (in that order).
for field in categorical_fields:
    feature_names.extend(all_columns_dict[field])
    for bucket, frame in (
        (final_features_train, X_train),
        (final_features_val, X_val),
        (final_features_test, X_test),
    ):
        bucket.append(get_ohe_features(frame, field))

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over job titles: unigrams only, terms must occur in at least 10 training titles.
# The vectorizer is fitted on the training split and only applied to the other splits.
title_vec = TfidfVectorizer(min_df=10, ngram_range=(1,1), stop_words=stop_words)
title_bow_train = title_vec.fit_transform(X_train['title'])
title_bow_val = title_vec.transform(X_val['title'])
title_bow_test = title_vec.transform(X_test['title'])

print(title_bow_train.shape)

final_vects_train.append(title_bow_train)
final_vects_val.append(title_bow_val)
final_vects_test.append(title_bow_test)

# Newer scikit-learn releases expose get_feature_names_out() and no longer
# provide get_feature_names(); use whichever the installed version offers.
if hasattr(title_vec, "get_feature_names_out"):
    feature_names += list(title_vec.get_feature_names_out())
else:
    feature_names += title_vec.get_feature_names()

(511, 25)


In [11]:
# TF-IDF over job descriptions: 1- to 4-grams, terms must occur in at least 5 training descriptions.
# The vectorizer is fitted on the training split and only applied to the other splits.
desc_vec = TfidfVectorizer(min_df=5, ngram_range=(1,4), stop_words=stop_words)
desc_bow_train = desc_vec.fit_transform(X_train['description'])
desc_bow_val = desc_vec.transform(X_val['description'])
desc_bow_test = desc_vec.transform(X_test['description'])

print(desc_bow_train.shape)

final_vects_train.append(desc_bow_train)
final_vects_val.append(desc_bow_val)
final_vects_test.append(desc_bow_test)

# Newer scikit-learn releases expose get_feature_names_out() and no longer
# provide get_feature_names(); use whichever the installed version offers.
if hasattr(desc_vec, "get_feature_names_out"):
    feature_names += list(desc_vec.get_feature_names_out())
else:
    feature_names += desc_vec.get_feature_names()

(511, 7686)


In [12]:
from scipy.sparse import hstack

# Column-wise concatenation per split: tabular feature blocks first, then the TF-IDF blocks.
X_train_full = hstack([*final_features_train, *final_vects_train])
X_val_full = hstack([*final_features_val, *final_vects_val])
X_test_full = hstack([*final_features_test, *final_vects_test])

In [13]:
#feature_names

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: 1000 trees, balanced class weights, at least 5 samples per leaf.
mdl_rf = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=5,
    class_weight="balanced",
    random_state=0,
    n_jobs=-1,
)
mdl_rf.fit(X_train_full, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [15]:
from sklearn.metrics import roc_auc_score, average_precision_score

# Second predict_proba column, scored against the validation labels.
p = mdl_rf.predict_proba(X_val_full)[:, 1]

auc = roc_auc_score(y_val, p)
# Printed as "Precision", but the value comes from average_precision_score.
precision = average_precision_score(y_val, p)

print("Precision: {:.2f}%\tAUC: {:.3f}".format(100 * precision, auc))

Precision: 73.76%	AUC: 0.871


In [16]:
# Display the dimensions of the stacked validation matrix.
X_val_full.shape

(171, 7841)

In [17]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_regression, f_classif
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

# Feature-selection experiment: keep the k columns with the highest f_classif
# score, fit a random-forest regressor on the reduced matrix and score its
# validation predictions with average precision.
k_vs_score = []

k = 1400
selector = SelectKBest(score_func=f_classif, k=k)

# The selector is fitted on the training split only and then applied to validation.
Xtrain2 = selector.fit_transform(X_train_full, y_train)
Xval2 = selector.transform(X_val_full)

mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0, min_samples_leaf=5)
mdl.fit(Xtrain2, y_train)

p = mdl.predict(Xval2)

# The value is an average-precision score (higher is better), so the message
# is labelled accordingly rather than as a mean absolute error.
score = average_precision_score(y_val, p)
print("k = {} - average precision = {}".format(k, score))

k_vs_score.append(score)


k = 1400 - MAE = 0.7230305212479924


In [18]:
# Precision: 68.71%	AUC: 0.900 - title_vec = (min_df=3, ngram_range=(1,2)), desc_vec = (min_df=4, ngram_range=(1,3))
# Precision: 71.35%	AUC: 0.875

In [19]:
import itertools

# Support mask of the fitted selector, as returned by get_support().
mask = selector.get_support()
# Uncomment to list the names of the selected features:
#list(itertools.compress(feature_names, mask))

In [20]:
from lightgbm import LGBMClassifier

In [21]:
# LightGBM baseline on the full feature matrix; only seed, class weighting and n_jobs are set.
mdl = LGBMClassifier(
    class_weight="balanced",
    random_state=0,
    n_jobs=-1,
)
mdl.fit(X_train_full, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
               objective=None, random_state=0, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [22]:
# Second predict_proba column, scored against the validation labels.
p = mdl.predict_proba(X_val_full)[:, 1]

auc = roc_auc_score(y_val, p)
# Printed as "Precision", but the value comes from average_precision_score.
precision = average_precision_score(y_val, p)

print("Precision: {:.2f}%\tAUC: {:.3f}".format(100 * precision, auc))

Precision: 76.42%	AUC: 0.877


In [23]:
from skopt import forest_minimize

In [24]:
# Compare the mean label of the training and validation splits (the split was stratified on y).
y_train.mean(), y_val.mean()

(label    0.232877
 dtype: float64,
 label    0.233918
 dtype: float64)

In [25]:
def tune_lgbm(params):
    """Objective for the hyper-parameter search.

    Rebuilds the TF-IDF blocks, fits a LightGBM classifier on the training
    split and returns the negated average precision on the validation split,
    so that a minimiser maximises average precision.

    ``params`` is read positionally:
        0 learning rate, 1 max depth, 2 min child samples, 3 subsample,
        4 colsample_bytree, 5 ``min_df`` of the description vectorizer,
        6 upper n-gram length of the description vectorizer.

    Reads the notebook globals ``X_train``, ``X_val``, ``y_train``, ``y_val``,
    ``final_features_train``, ``final_features_val`` and ``stop_words``.
    Prints ``params`` and the validation ROC AUC as a side effect.
    """
    print(params)
    lr, max_depth, min_child_samples, subsample, colsample_bytree = (
        params[i] for i in range(5)
    )
    desc_min_df = params[5]
    desc_ngrams = (1, params[6])

    # Title vectorizer is fixed; only the description vectorizer is tuned.
    title_tfidf = TfidfVectorizer(min_df=10, ngram_range=(1,1), stop_words=stop_words)
    train_blocks = [title_tfidf.fit_transform(X_train['title'])]
    val_blocks = [title_tfidf.transform(X_val['title'])]

    desc_tfidf = TfidfVectorizer(min_df=desc_min_df, ngram_range=desc_ngrams, stop_words=stop_words)
    train_blocks.append(desc_tfidf.fit_transform(X_train['description']))
    val_blocks.append(desc_tfidf.transform(X_val['description']))

    train_matrix = hstack(final_features_train + train_blocks)
    val_matrix = hstack(final_features_val + val_blocks)

    # num_leaves is tied to the depth as 2 ** max_depth.
    # NOTE(review): subsample_freq is left at its default here; LightGBM's
    # documentation describes bagging as active only when the bagging frequency
    # is non-zero, so `subsample` may have no effect on this model - confirm.
    booster = LGBMClassifier(
        learning_rate=lr,
        num_leaves=2 ** max_depth,
        max_depth=max_depth,
        min_child_samples=min_child_samples,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        class_weight="balanced",
        random_state=0,
        n_jobs=-1,
    )
    booster.fit(train_matrix, y_train)

    val_scores = booster.predict_proba(val_matrix)[:, 1]

    print(roc_auc_score(y_val, val_scores))

    return -average_precision_score(y_val, val_scores)


# Search space; entries are positionally aligned with the indices read in tune_lgbm.
space = [
    (1e-2, 1e-1, 'log-uniform'),  # lr
    (1, 12),                      # max_depth
    (10, 30),                     # min_child_samples
    (0.05, 1.),                   # subsample
    (0.05, 1.),                   # colsample_bytree
    (2, 8),                       # min_df desc
    (1, 4),                       # ngram_range desc
]

# 100 evaluations in total, the first 40 at random points, with a fixed seed.
res = forest_minimize(
    tune_lgbm,
    space,
    n_calls=100,
    n_random_starts=40,
    random_state=7777,
    verbose=1,
)

0.7912
Iteration No: 35 started. Evaluating function at random point.
[0.03485552780947716, 3, 16, 0.9824024873308342, 0.6279493869140963, 5, 2]
0.8591603053435114
Iteration No: 35 ended. Evaluation done at random point.
Time taken: 1.1655
Function value obtained: -0.7029
Current minimum: -0.7912
Iteration No: 36 started. Evaluating function at random point.
[0.028412685616154454, 6, 13, 0.196771888771214, 0.7526360135581259, 3, 3]
0.8520992366412213
Iteration No: 36 ended. Evaluation done at random point.
Time taken: 1.7530
Function value obtained: -0.7131
Current minimum: -0.7912
Iteration No: 37 started. Evaluating function at random point.
[0.048938517872981674, 10, 11, 0.0697130003282783, 0.21710212089182218, 3, 4]
0.8607824427480916
Iteration No: 37 ended. Evaluation done at random point.
Time taken: 2.4205
Function value obtained: -0.7606
Current minimum: -0.7912
Iteration No: 38 started. Evaluating function at random point.
[0.010700332067545254, 7, 20, 0.8026304928396812, 0.60

In [26]:
# Display the `x` attribute of the search result (one entry per dimension of `space`).
res.x

[0.09602533443444468, 11, 15, 0.815626604502644, 0.9231700046193302, 5, 1]

In [27]:
# Refit LightGBM with the point returned by the search and rebuild the TF-IDF
# blocks with the matching description-vectorizer settings. The indices follow
# the order of `space`.
params = res.x
lr, max_depth, min_child_samples, subsample, colsample_bytree = params[:5]

min_df = params[5]
ngram_range = (1, params[6])

final_vects_train, final_vects_val = [], []

# Title vectorizer is fixed; it is refitted here so the fitted object is available afterwards.
title_vec = TfidfVectorizer(min_df=10, ngram_range=(1,1), stop_words=stop_words)
title_bow_train = title_vec.fit_transform(X_train['title'])
title_bow_val = title_vec.transform(X_val['title'])

final_vects_train.append(title_bow_train)
final_vects_val.append(title_bow_val)

desc_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range, stop_words=stop_words)
desc_bow_train = desc_vec.fit_transform(X_train['description'])
desc_bow_val = desc_vec.transform(X_val['description'])

final_vects_train.append(desc_bow_train)
final_vects_val.append(desc_bow_val)

# These replace the matrices built earlier in the notebook; only the train and
# validation splits are rebuilt in this cell.
X_train_full = hstack(final_features_train + final_vects_train)
X_val_full = hstack(final_features_val + final_vects_val)

# num_leaves is tied to the depth as 2 ** max_depth.
mdl_lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
                        min_child_samples=min_child_samples, subsample=subsample, random_state=0,
                        colsample_bytree=colsample_bytree, class_weight="balanced", n_jobs=-1)

mdl_lgbm.fit(X_train_full, y_train)

p_lgbm = mdl_lgbm.predict_proba(X_val_full)[:, 1]

# Printed as "Precision", but the value comes from average_precision_score.
precision = average_precision_score(y_val, p_lgbm)
auc = roc_auc_score(y_val, p_lgbm)

print("Precision: {:.2f}%\tAUC: {:.3f}".format(precision*100, auc))


Precision: 79.12%	AUC: 0.877


In [28]:
# Random forest refit on the current X_train_full, this time with at least 2 samples per leaf.
mdl_rf = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=2,
    class_weight="balanced",
    random_state=0,
    n_jobs=-1,
)
mdl_rf.fit(X_train_full, y_train)

p_rf = mdl_rf.predict_proba(X_val_full)[:, 1]

auc = roc_auc_score(y_val, p_rf)
# Printed as "Precision", but the value comes from average_precision_score.
precision = average_precision_score(y_val, p_rf)

print("Precision: {:.2f}%\tAUC: {:.3f}".format(100 * precision, auc))

Precision: 74.26%	AUC: 0.852


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import make_pipeline

In [30]:
# Linear model: MaxAbsScaler followed by an L2-penalised logistic regression (C=2.0).
lr_pipe = make_pipeline(
    MaxAbsScaler(),
    LogisticRegression(penalty='l2', C=2.0, random_state=777, n_jobs=-1),
)
lr_pipe.fit(X_train_full, y_train)

p_lr = lr_pipe.predict_proba(X_val_full)[:, 1]

auc = roc_auc_score(y_val, p_lr)
# Printed as "Precision", but the value comes from average_precision_score.
precision = average_precision_score(y_val, p_lr)

print("Precision: {:.2f}%\tAUC: {:.3f}".format(100 * precision, auc))

Precision: 67.79%	AUC: 0.821


In [31]:
# Pairwise correlation between the three models' validation scores.
val_scores_by_model = pd.DataFrame({"RF": p_rf, "LGBM": p_lgbm, "LR": p_lr})
val_scores_by_model.corr()

Unnamed: 0,RF,LGBM,LR
RF,1.0,0.872379,0.853337
LGBM,0.872379,1.0,0.852574
LR,0.853337,0.852574,1.0


In [34]:
# Weighted blend of the three validation score vectors.
part_lgbm = .6
part_rf = .2
part_lr = .2
p = (part_lgbm * p_lgbm) + (part_rf * p_rf) + (part_lr * p_lr)
(average_precision_score(y_val, p), roc_auc_score(y_val, p))

(0.7696668417358516, 0.8598282442748092)

In [35]:
import joblib as jb

In [36]:
import datetime

In [37]:
# Run date rendered with the '%d_%b_%Y' pattern; used as a suffix for the saved artefacts.
run_moment = datetime.datetime.now()
timestamp = run_moment.strftime('%d_%b_%Y')

In [38]:
# Persist the three fitted models and the two fitted TF-IDF vectorizers under
# ../app/models/, each file name suffixed with `timestamp`.
jb.dump(mdl_lgbm, f'../app/models/lgbm_{timestamp}.jb')
jb.dump(mdl_rf, f'../app/models/rf_{timestamp}.jb')
jb.dump(lr_pipe, f'../app/models/lr_{timestamp}.jb')
# The vectorizers are saved as well so the same text transformation can be reapplied.
jb.dump(desc_vec, f'../app/models/desc_vec_{timestamp}.jb')
# Last expression of the cell: its return value is what the cell displays.
jb.dump(title_vec, f'../app/models/title_vec_{timestamp}.jb')

['../app/models/title_vec_28_Jul_2020.jb']