## Notebook Magic

In [None]:
%matplotlib inline
%load_ext autoreload

## Imports

In [None]:
import os
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc, roc_curve

from sklearn.linear_model import LogisticRegression, Lasso

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from scipy.stats import randint

import itertools

from IPython.display import display
# Display configuration: no cap on the number of columns shown, and a width of 0.
display_options = pd.options.display
display_options.max_columns = None
display_options.width = 0

In [None]:
# Load the modelling table from the parquet file on S3.
df = pd.read_parquet("s3://iefp-unemployment/modelling/modelling.parquet")

# Only the last expression of a cell is rendered, so the shape has to be
# displayed explicitly or it is silently discarded.
display(df.shape)
df.head(10)

In [None]:
# Intervention feature preparation

# Intervention columns carry the "i_" prefix. Match on the prefix rather than on a
# substring so that columns that merely contain "i_" somewhere are not selected.
interv_cols = [col for col in df.columns if col.startswith("i_")]
X = df[interv_cols].copy()

# Make df boolean: 1 where the intervention column has a value, 0 where it is missing
X = X.notna().astype('int')

# Filter for frequent interventions: keep those present in more than 1% of the rows
intervention_rate = X.mean()
frequent_i = intervention_rate[intervention_rate > 0.01].index.tolist()

# Take an explicit copy so that columns added to X in later cells are written to an
# independent frame rather than to a selection of the previous one.
X = X[frequent_i].copy()

In [None]:
# Preview the first ten rows of the intervention indicator matrix.
X.iloc[:10]

In [None]:
# Number of intervention columns that passed the frequency filter.
len(X.columns)

In [None]:
# Demographics prep

dems = [
    "d_age", "d_gender", "d_civil_status", "d_rsi", "d_desired_work_time", "d_desired_contract",
    "d_school_qualification", "d_college_qualification", "d_disabled", "d_subsidy", "d_previous_job_sector",
    "d_desired_job_sector", "d_previous_job_experience",
]

# Missing demographic values are replaced with 0 before encoding.
# NOTE(review): because the NAs are filled first, `dummy_na=True` below presumably only
# adds all-zero NaN-indicator columns for the encoded columns - confirm this is intended.
X[dems] = df[dems].fillna(0)

# Calendar features derived from the registration date.
X["register_month"] = df.register_date.dt.month
X["register_year"] = df.register_date.dt.year

# One-hot encode the remaining categorical columns, dropping the first level of each.
X = pd.get_dummies(X, drop_first=True, dummy_na=True)

# Only the last expression of a cell is rendered, so show the preview explicitly.
display(X.head())

# Set output variable

Y = df["success"]

In [None]:
# Full list of model feature names after encoding.
X.columns.to_list()

In [None]:
# Test/Train split

# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=0,
)

In [None]:
# Prepare Random Forest pipeline

# The single step is named "rf"; the `rf__*` keys of the grid-search parameter grid
# address this step's hyper-parameters.
rf_pipeline = Pipeline([
    ('rf', RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=0)),
])

In [None]:
# Grid search

# Wider search space kept for reference; it is not passed to GridSearchCV below.
big_param_grid = [
        {'rf__n_estimators': [1,10,100,1000,10000],
         'rf__max_depth': [1,5,10,20,50,100],
         'rf__max_features': ['sqrt','log2'],
         'rf__min_samples_split': [2,5,10],
         'rf__n_jobs': [-1]
        },
    ]

# Small grid that is actually searched (2 x 2 candidates, 3-fold CV).
param_grid = [
        {'rf__n_estimators': [100,1000],
         'rf__max_depth': [2, 3]
        },
    ]


rf_grid_search = GridSearchCV(rf_pipeline, param_grid, cv=3, refit=True)

rf_grid_search.fit(X_train, y_train)

# With refit=True the best parameter combination is refitted on all of X_train.
rf_final_model = rf_grid_search.best_estimator_

# Predict on the held-out set once and reuse the result for both metrics.
rf_test_pred = rf_final_model.predict(X_test)

print(confusion_matrix(y_test, rf_test_pred))

print(accuracy_score(y_test, rf_test_pred))

In [None]:
# Permutations

# Spot check: predictions for the first ten test rows next to their true labels.
print(rf_final_model.predict(X_test.iloc[:10]))
print(y_test.iloc[:10])

In [None]:
# Every 0/1 assignment over 16 positions that switches on at most five of them,
# in the order itertools.product generates them.
combinations = []
for candidate in itertools.product((0, 1), repeat=16):
    if sum(candidate) <= 5:
        combinations.append(candidate)

In [None]:
# One row per candidate combination, one integer-labelled column per position.
combo_df = pd.DataFrame(data=combinations)

In [None]:
# Sanity check: (number of candidate combinations, 16 indicator columns).
combo_df.shape

In [None]:
# Use the first row of the test set as the example user.
user_data = X_test.iloc[:1]

In [None]:
# Inspect the single test row used as the example user.
user_data

In [None]:
# Drop the first 16 columns so that only the remaining features of the example user
# are kept.
# NOTE(review): assumes the first 16 columns are exactly the intervention indicators - confirm.
user_data_fixed = user_data.drop(columns=user_data.columns[:16])

In [None]:
# Inspect the example user's features that remain after the first 16 columns were dropped.
user_data_fixed

In [None]:
# Sanity check: a single row, with 16 fewer columns than user_data.
user_data_fixed.shape

In [None]:
# Repeat the example user's fixed features once per candidate combination.
n_candidates = len(combo_df.index)
n_fixed_features = len(user_data_fixed.columns)
tiled_values = np.tile(user_data_fixed.values, n_candidates).reshape(-1, n_fixed_features)
user_fixed_df = pd.DataFrame(tiled_values, columns=user_data_fixed.columns)

In [None]:
# Sanity check: the row count should equal the number of rows in combo_df.
user_fixed_df.shape

In [None]:
# Put each candidate combination side by side with the example user's fixed features
# (index-aligned left join).
predict_df = combo_df.join(other=user_fixed_df, how="left")

In [None]:
# Sanity check: combo_df columns plus user_fixed_df columns, one row per combination.
predict_df.shape

In [None]:
# predict_df carries integer labels for the combination columns followed by string
# labels, which do not match the feature names the model was fitted on; recent
# scikit-learn versions reject such input. The columns are in the same positional order
# as the training matrix, so relabel a copy with the training column names.
model_input = predict_df.copy()
model_input.columns = X_train.columns

# One row per candidate combination, one column per class probability.
probabilities = pd.DataFrame(rf_final_model.predict_proba(model_input))

In [None]:
# Label the two probability columns.
# NOTE(review): assumes the model's class order is (unsuccessful, successful) - confirm
# against the fitted model's classes_.
proba_labels = ["unsuccessful", "successful"]
probabilities.columns = proba_labels

In [None]:
# Attach the predicted probabilities to the rows they were computed from
# (index-aligned left join).
results_df = predict_df.join(other=probabilities, how="left")

In [None]:
# Readable names for the 16 leading intervention columns of results_df.
intervention_names = ['i_job_search_techn',
 'i_train_active_life',
 'i_tutoring_in_individual_job_search',
 'i_professional_internships',
 'i_internship_job',
 'i_employment-insertion_contract',
 'i_employment_contract_insertion',
 'i_information_job_eval_orientat',
 'i_collective_orientation_session',
 'i_tutoring_in_collective_job_search',
 'i_efa_s3_type_a',
 'i_modular',
 'i_efa_n3_vocational_train',
 'i_assertive_communication_job_search_techn',
 'i_entrepreneurship_skills_job_search_techn',
 'i_train_external_entities']

# `results_df.iloc[:, 0:16]` builds a new frame, so assigning `.columns` on it renames
# only that temporary object. Replace the leading labels on results_df itself and keep
# every remaining label as it is.
n_interventions = len(intervention_names)
results_df.columns = intervention_names + results_df.columns[n_interventions:].tolist()

In [None]:
# Rank the candidate combinations by predicted probability of success, best first.
ranked_results = results_df.sort_values(by=['successful'], ascending=False)
display(ranked_results)

In [None]:
# Prepare LogReg pipeline

# The solver is set explicitly: liblinear supports both the 'l1' and 'l2' penalties,
# whereas the default solver of current scikit-learn releases (lbfgs) rejects 'l1'.
logreg_pipeline = Pipeline([
    ('logreg', LogisticRegression(penalty='l1', C=1e5, solver='liblinear')),
])


In [None]:
# Grid search

# Penalty type and inverse regularisation strength C (2 x 7 candidates, 3-fold CV).
param_grid = [
        {'logreg__penalty': ['l1','l2'],
         'logreg__C': [0.00001,0.0001,0.001,0.01,0.1,1,10]
        },
    ]

logreg_grid_search = GridSearchCV(logreg_pipeline, param_grid, cv=3, refit=True)

logreg_grid_search.fit(X_train, y_train)

# With refit=True the best parameter combination is refitted on all of X_train.
logreg_final_model = logreg_grid_search.best_estimator_

# Predict on the held-out set once and reuse the result for both metrics.
logreg_test_pred = logreg_final_model.predict(X_test)

print(confusion_matrix(y_test, logreg_test_pred))

print(accuracy_score(y_test, logreg_test_pred))

In [None]:
# Prepare Gradient boost pipeline

# subsample < 1.0 makes the boosting stochastic, so the seed is fixed to keep results
# repeatable, in line with the random forest and the train/test split.
gboost_pipeline = Pipeline([
    ('gboost', GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6,
                                          n_estimators=10, random_state=0)),
])

In [None]:
# Grid search

# NOTE(review): this grid has 5 * 5 * 3 * 7 = 525 candidates, i.e. 1575 fits with
# 3-fold CV, including 10000-estimator / depth-100 models - expect a very long run.
param_grid = [
        {'gboost__n_estimators': [1,10,100,1000,10000],
         'gboost__learning_rate' : [0.001,0.01,0.05,0.1,0.5],
         'gboost__subsample' : [0.1,0.5,1.0],
         'gboost__max_depth': [1,3,5,10,20,50,100]
        },
    ]

gboost_grid_search = GridSearchCV(gboost_pipeline, param_grid, cv=3, refit=True)

gboost_grid_search.fit(X_train, y_train)

# With refit=True the best parameter combination is refitted on all of X_train.
gboost_final_model = gboost_grid_search.best_estimator_

# Predict on the held-out set once and reuse the result for both metrics.
gboost_test_pred = gboost_final_model.predict(X_test)

print(confusion_matrix(y_test, gboost_test_pred))

print(accuracy_score(y_test, gboost_test_pred))

In [None]:
# Precision / Recall graph


def plot_precision_recall(y_test, y_pred, y_prob):
    """Plot the precision-recall curve and report the average precision.

    Prints the highest precision among the curve points whose recall exceeds 0.8,
    then draws the curve as a filled step plot on the current matplotlib figure.

    Parameters
    ----------
    y_test : array-like
        True labels of the two-class problem.
    y_pred : array-like
        Hard class predictions. Accepted for backward compatibility with existing
        callers; not used, because the metrics need continuous scores.
    y_prob : array-like
        Scores for the positive class, e.g. predicted probabilities.
    """
    # Average precision summarises the curve drawn below, so it must be computed from
    # the same continuous scores; thresholded 0/1 predictions would reduce the ranking
    # to a single operating point and the title would not describe the plotted curve.
    average_precision = average_precision_score(y_test, y_prob)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    print("{} Precision at {} Recall".format(precision[recall>0.8].max(), 0.8))

    # Draw the curve as a post-step line and shade the area underneath it.
    step_kwargs = {'step': 'post'}
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))

# Evaluate the tuned random forest. The name `pipeline` is not defined anywhere in this
# notebook, so a fresh Restart & Run All would stop here with a NameError.
y_pred = rf_final_model.predict(X_test)
y_prob = rf_final_model.predict_proba(X_test)[:,1]
plot_precision_recall(y_test, y_pred, y_prob)

In [None]:
# Roc curve

# ROC points and the area under the curve, from the positive-class scores in y_prob.
fpr, tpr, threshold = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')

# Dashed red diagonal for visual reference.
plt.plot([0, 1], [0, 1], 'r--')

plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Feature importance

# The fitted forest is the "rf" step of the tuned pipeline; a bare `rf` name is not
# defined anywhere in this notebook.
rf_estimator = rf_final_model.named_steps['rf']

# Twenty most important features, highest importance first.
pd.Series(rf_estimator.feature_importances_, X_train.columns).sort_values(ascending=False).head(20)