In [1]:
from pymongo import MongoClient
import pandas as pd
from feature_extractor import get_ohe_features

In [2]:
from sklearn.feature_extraction import text
from portuguese_stop_words import pt_br_stopwords
stop_words = text.ENGLISH_STOP_WORDS.union(pt_br_stopwords)

In [3]:
client = MongoClient()
db = client.linkedin
collection = db.jobIds

In [4]:
df = pd.DataFrame(list(collection.find({})))

In [5]:
from sklearn.model_selection import train_test_split

df['Description Length'] = df['description'].apply(len)

df_train_val = df[pd.notna(df['label'])]
df_test = df[~pd.notna(df['label'])]
X_test = df_test

X = df_train_val.drop(columns=['label'])
y = df_train_val[['label']]

usedFeatures = ['applicants', 'applicants_per_day', 'Description Length']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=777, stratify=y)

final_features_train = [X_train[usedFeatures]]
final_features_val = [X_val[usedFeatures]]
final_features_test = [X_test[usedFeatures]]

final_vects_train = []
final_vects_val = []
final_vects_test = []

In [6]:
categorical_fields = ['Seniority level', 'Employment type', 'Job function', 'Industries']

for field in categorical_fields:
    final_features_train.append(get_ohe_features(X_train, field))
    final_features_val.append(get_ohe_features(X_val, field))
    final_features_test.append(get_ohe_features(X_test, field))

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

title_vec = TfidfVectorizer(min_df=10, ngram_range=(1,2), stop_words=stop_words)
title_bow_train = title_vec.fit_transform(X_train['title'])
title_bow_val = title_vec.transform(X_val['title'])
title_bow_test = title_vec.transform(X_test['title'])

print(title_bow_train.shape)

final_vects_train.append(title_bow_train)
final_vects_val.append(title_bow_val)
final_vects_test.append(title_bow_test)

(498, 34)


In [8]:
desc_vec = TfidfVectorizer(min_df=5, ngram_range=(1,4), stop_words=stop_words)
desc_bow_train = desc_vec.fit_transform(X_train['description'])
desc_bow_val = desc_vec.transform(X_val['description'])
desc_bow_test = desc_vec.transform(X_test['description'])

print(desc_bow_train.shape)

final_vects_train.append(desc_bow_train)
final_vects_val.append(desc_bow_val)
final_vects_test.append(desc_bow_test)

(498, 6784)


In [9]:
from scipy.sparse import hstack

X_train_full = hstack(final_features_train + final_vects_train)
X_val_full = hstack(final_features_val + final_vects_val)
X_test_full = hstack(final_features_test + final_vects_test)

In [10]:
from sklearn.ensemble import RandomForestClassifier

mdl_rf = RandomForestClassifier(n_estimators=1000, random_state=0, class_weight="balanced", n_jobs=-1, min_samples_leaf=5)
mdl_rf.fit(X_train_full, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [11]:
from sklearn.metrics import roc_auc_score, average_precision_score

p = mdl_rf.predict_proba(X_val_full)[:, 1]

precision = average_precision_score(y_val, p)
auc = roc_auc_score(y_val, p)

print("Precision: {:.2f}%\tAUC: {:.3f}".format(precision*100, auc))

Precision: 57.97%	AUC: 0.855


In [12]:
# Precision: 68.71%	AUC: 0.900 - title_vec = (min_df=3, ngram_range=(1,2) desc_vec =(min_df=4, ngram_range=(1,3)
# Precision: 71.35%	AUC: 0.875 

In [13]:
from lightgbm import LGBMClassifier

In [14]:
mdl = LGBMClassifier(random_state=0, class_weight="balanced", n_jobs=-1)
mdl.fit(X_train_full, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
               objective=None, random_state=0, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [15]:
p = mdl.predict_proba(X_val_full)[:, 1]

precision = average_precision_score(y_val, p)
auc = roc_auc_score(y_val, p)

print("Precision: {:.2f}%\tAUC: {:.3f}".format(precision*100, auc))

Precision: 56.15%	AUC: 0.847


In [16]:
from skopt import forest_minimize

In [17]:
y_train.mean(), y_val.mean()

(label    0.2249
 dtype: float64,
 label    0.222892
 dtype: float64)

In [18]:
def tune_lgbm(params):
    print(params)
    lr = params[0]
    max_depth = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    
    min_df = params[5]
    ngram_range = (1, params[6])
    
    min_df_desc = params[7]
    ngram_range_desc = (1, params[8])

    final_vects_train, final_vects_val = [], []
    
    title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range, stop_words=stop_words)
    title_bow_train = title_vec.fit_transform(X_train['title'])
    title_bow_val = title_vec.transform(X_val['title'])
    
    final_vects_train.append(title_bow_train)
    final_vects_val.append(title_bow_val)

    desc_vec = TfidfVectorizer(min_df=min_df_desc, ngram_range=ngram_range_desc, stop_words=stop_words)
    desc_bow_train = desc_vec.fit_transform(X_train['description'])
    desc_bow_val = desc_vec.transform(X_val['description'])

    final_vects_train.append(desc_bow_train)
    final_vects_val.append(desc_bow_val)

    
    
    X_train_full = hstack(final_features_train + final_vects_train)
    X_val_full = hstack(final_features_val + final_vects_val)


    mdl_lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
                            min_child_samples=min_child_samples, subsample=subsample, random_state=0, 
                            colsample_bytree=colsample_bytree, class_weight="balanced", n_jobs=-1)

    mdl_lgbm.fit(X_train_full, y_train)
    
    p = mdl_lgbm.predict_proba(X_val_full)[:, 1]
    
    print(roc_auc_score(y_val, p))
    
    return -average_precision_score(y_val, p)


space = [(1e-2, 1e-1, 'log-uniform'), # lr
          (1, 12), # max_depth
          (10, 30), # min_child_samples
          (0.05, 1.), # subsample
          (0.05, 1.), # colsample_bytree
          (2,15), # min_df title
          (1,3), # ngram_range title
          (2,8), # min_df desc
          (1,4)] # ngram_range desc

res = forest_minimize(tune_lgbm, space, random_state=7777, n_random_starts=40, n_calls=100, verbose=1)

eration No: 36 ended. Evaluation done at random point.
Time taken: 2.8215
Function value obtained: -0.5603
Current minimum: -0.5990
Iteration No: 37 started. Evaluating function at random point.
[0.03277820499942151, 1, 27, 0.3296239518032571, 0.5943463171315009, 15, 3, 5, 4]
0.8117536140791956
Iteration No: 37 ended. Evaluation done at random point.
Time taken: 1.5585
Function value obtained: -0.5614
Current minimum: -0.5990
Iteration No: 38 started. Evaluating function at random point.
[0.01278532962202386, 3, 26, 0.4412646381559916, 0.6196014465731267, 12, 3, 2, 1]
0.8201340875759481
Iteration No: 38 ended. Evaluation done at random point.
Time taken: 0.4135
Function value obtained: -0.5436
Current minimum: -0.5990
Iteration No: 39 started. Evaluating function at random point.
[0.021458348781111743, 8, 17, 0.8738509201164921, 0.7492052899412638, 13, 3, 3, 2]
0.822962497381102
Iteration No: 39 ended. Evaluation done at random point.
Time taken: 1.0480
Function value obtained: -0.5225

In [19]:
res.x

[0.012677230579130063,
 6,
 28,
 0.7966812636852204,
 0.1073814522666183,
 4,
 3,
 3,
 4]

In [20]:
tune_lgbm([0.012,
 6,
 28,
 0.8,
 0.1,
 4,
 3,
 3,
 4])

[0.012, 6, 28, 0.8, 0.1, 4, 3, 3, 4]
0.8799497171590195


-0.6307312739439663

In [21]:
params = [0.012, 6, 28, 0.8, 0.1, 4, 3, 3, 4]
#params = res.x
lr = params[0]
max_depth = params[1]
min_child_samples = params[2]
subsample = params[3]
colsample_bytree = params[4]

min_df = params[5]
ngram_range = (1, params[6])

min_df_desc = params[7]
ngram_range_desc = (1, params[8])

final_vects_train, final_vects_val = [], []

title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range, stop_words=stop_words)
title_bow_train = title_vec.fit_transform(X_train['title'])
title_bow_val = title_vec.transform(X_val['title'])

final_vects_train.append(title_bow_train)
final_vects_val.append(title_bow_val)

desc_vec = TfidfVectorizer(min_df=min_df_desc, ngram_range=ngram_range_desc, stop_words=stop_words)
desc_bow_train = desc_vec.fit_transform(X_train['description'])
desc_bow_val = desc_vec.transform(X_val['description'])

final_vects_train.append(desc_bow_train)
final_vects_val.append(desc_bow_val)



X_train_full = hstack(final_features_train + final_vects_train)
X_val_full = hstack(final_features_val + final_vects_val)


mdl_lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
                        min_child_samples=min_child_samples, subsample=subsample, random_state=0, 
                        colsample_bytree=colsample_bytree, class_weight="balanced", n_jobs=-1)

mdl_lgbm.fit(X_train_full, y_train)

p_lgbm = mdl_lgbm.predict_proba(X_val_full)[:, 1]

precision = average_precision_score(y_val, p_lgbm)
auc = roc_auc_score(y_val, p_lgbm)

print("Precision: {:.2f}%\tAUC: {:.3f}".format(precision*100, auc))


Precision: 63.07%	AUC: 0.880


In [41]:
mdl_rf = RandomForestClassifier(n_estimators=1000, random_state=0, class_weight="balanced", n_jobs=-1, min_samples_leaf=2)
mdl_rf.fit(X_train_full, y_train)


p_rf = mdl_rf.predict_proba(X_val_full)[:, 1]

precision = average_precision_score(y_val, p_rf)
auc = roc_auc_score(y_val, p_rf)

print("Precision: {:.2f}%\tAUC: {:.3f}".format(precision*100, auc))

Precision: 58.42%	AUC: 0.845


In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import make_pipeline

In [43]:
lr_pipe =  make_pipeline(MaxAbsScaler(), LogisticRegression(C=2.0, penalty='l2', n_jobs=-1, random_state=777))
lr_pipe.fit(X_train_full, y_train)

p_lr = lr_pipe.predict_proba(X_val_full)[:, 1]


precision = average_precision_score(y_val, p_lr)
auc = roc_auc_score(y_val, p_lr)

print("Precision: {:.2f}%\tAUC: {:.3f}".format(precision*100, auc))

Precision: 59.95%	AUC: 0.824


In [44]:
pd.DataFrame({"RF": p_rf, "LGBM" : p_lgbm, "LR" : p_lr}).corr()

Unnamed: 0,RF,LGBM,LR
RF,1.0,0.890783,0.890488
LGBM,0.890783,1.0,0.723396
LR,0.890488,0.723396,1.0


In [45]:
part_lgbm, part_rf, part_lr = .8, .0, .2
p = part_lgbm * p_lgbm + part_rf * p_rf + part_lr * p_lr
average_precision_score(y_val, p), roc_auc_score(y_val, p)

(0.6308494129133824, 0.8701026608003353)

In [46]:
import joblib as jb

In [47]:
import datetime

In [48]:
timestamp = datetime.datetime.now().strftime('%d_%b_%Y')

In [49]:
jb.dump(mdl_lgbm, f'./models/lgbm_{timestamp}.jb')
jb.dump(mdl_rf, f'./models/rf_{timestamp}.jb')
jb.dump(lr_pipe, f'./models/lr_{timestamp}.jb')
jb.dump(desc_vec, f'./models/desc_vec_{timestamp}.jb')
jb.dump(title_vec, f'./models/title_vec_{timestamp}.jb')

['./models/title_vec_19_Jul_2020.jb']