## Notebook Magic

In [None]:
%matplotlib inline
%load_ext autoreload

## Imports

In [None]:
import os
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc, roc_curve

from sklearn.linear_model import LogisticRegression, Lasso

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import randint

import itertools

from IPython.display import display
# Display configuration: no limit on the number of columns shown, width option set to 0.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 0)

In [None]:
# Location of the modelling table (parquet file on S3).
MODELLING_DATA_URI = "s3://iefp-unemployment/modelling/modelling.parquet"
df = pd.read_parquet(MODELLING_DATA_URI)

In [None]:
# Only the last expression of a cell is echoed, so a bare `df.shape` placed
# before another statement is evaluated and silently discarded. Display both.
display(df.shape)
display(df.head(10))

In [None]:
# Print the column dtypes and non-null counts of the raw table.
df.info()

In [None]:
# Intervention feature preparation
#
# Builds X with one 0/1 column per intervention (1 = a value is present for
# that person) and keeps only interventions received by more than 1 % of rows.

INTERVENTION_PREFIX = "i_"
MIN_INTERVENTION_SHARE = 0.01

# Select on the prefix: a substring test (`"i_" in col`) would also pick up
# any column that merely contains "i_" somewhere in its name.
interv_cols = [col for col in df.columns if col.startswith(INTERVENTION_PREFIX)]
X = df[interv_cols].copy()

# Strip col names: remove the leading prefix only (str.replace would also
# delete any later "i_" inside the name).
X.columns = [col[len(INTERVENTION_PREFIX):] for col in X.columns]

# Make df boolean: 1 where a value is present, 0 where it is missing
X = X.notna().astype('int')

# Filter for frequent interventions (the column mean is the share of rows with a 1)
intervention_share = X.mean()
frequent_i = intervention_share[intervention_share > MIN_INTERVENTION_SHARE].index.tolist()
X = X[frequent_i]

In [None]:
# Demographics prep
#
# Demographic columns copied unchanged from the raw table into the feature
# matrix, in the order listed here.
dems = [
    "d_age",
    "d_gender",
    "d_civil_status",
    "d_rsi",
    "d_desired_work_time",
    "d_desired_contract",
    "d_school_qualification",
    "d_college_qualification",
    "d_disabled",
    "d_subsidy",
    "d_previous_job_sector",
    "d_desired_job_sector",
    "d_previous_job_experience",
]

for dem_col in dems:
    X[dem_col] = df[dem_col]

In [None]:
# Preview the first ten rows of the feature matrix after adding demographics.
display(X.head(10))

In [None]:
# Dtypes and non-null counts per feature, printed before missing values are filled.
X.info()

In [None]:
# Dealing with missing values
#
# - college qualification: left untouched, its missing values stay as they are
# - school qualification and previous job experience: missing -> 0

for zero_filled_col in ("d_school_qualification", "d_previous_job_experience"):
    X[zero_filled_col] = X[zero_filled_col].fillna(0)


In [None]:
# Re-check non-null counts after the two columns were filled with 0.
X.info()

In [None]:
# Create temporal features
#
# Month and year of registration are stored as strings so that the dummy
# encoding below treats them as categories rather than numbers.
register_date = df["register_date"]
X["register_month"] = register_date.dt.month.astype(str)
X["register_year"] = register_date.dt.year.astype(str)

# One-hot encode the categorical columns: the first level of each is dropped
# and an explicit NaN indicator column is added per encoded column.
X = pd.get_dummies(X, dummy_na=True, drop_first=True)

display(X.head(10))

In [None]:
# Set output variable
# The target is the `success` column of the raw table; X was built from df
# without dropping rows in the cells above, so both share the same index.

Y = df["success"]

In [None]:
# List every feature name of the encoded matrix (the column order matters for the
# positional slicing done further down).
X.columns.tolist()

In [None]:
# Test/Train split
#
# 40 % of the rows are held out for testing; the fixed seed makes the shuffle
# repeatable.
HOLDOUT_FRACTION = 0.4
SPLIT_SEED = 0

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=HOLDOUT_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:

# Prepare Random Forest pipeline with scaling (for Age and job experience)
# This is the base estimator handed to the grid search.

rf__scale_pipeline = Pipeline([
    ('scale', MinMaxScaler()),
    ('rf', RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=0))
])

# Best model from big grid search
# The `min_impurity_split=None` argument is intentionally omitted: it was
# removed from scikit-learn in 1.0 and raises TypeError there, while on older
# versions None was already the default, so dropping it changes nothing.

best_rf__scale_pipeline = Pipeline([
    ('scale', MinMaxScaler()),
    ('rf', RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=20,
                                        max_features='sqrt',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_samples_leaf=1,
                                        min_samples_split=20,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=-1,
                                        oob_score=False, random_state=0,
                                        verbose=0, warm_start=False))
])


In [None]:
# Best model
# Fit the tuned pipeline on the training split and score it on the held-out split.

best_model = best_rf__scale_pipeline.fit(X_train, y_train)

# Predict once and reuse the result for both metrics instead of running
# inference over the test set twice.
best_test_pred = best_model.predict(X_test)

print(confusion_matrix(y_test, best_test_pred))

print(accuracy_score(y_test, best_test_pred))

In [None]:
# Grid search
# Tunes the scaled random-forest pipeline with 3-fold cross-validation and
# evaluates the refitted best estimator on the held-out split.

# Exhaustive grid; not passed to the search in this cell.
big_param_grid = [
        {'rf__n_estimators': [1,10,100,1000,10000],
         'rf__max_depth': [1,5,10,20,50,100],
         'rf__max_features': ['sqrt','log2'],
         'rf__min_samples_split': [2,5,10],
         'rf__n_jobs': [-1]
        },
    ]

# Small grid actually searched below.
param_grid = [
        {'rf__n_estimators': [500, 1000],
         'rf__max_depth': [2, 3, 6]
        },
    ]


rf_grid_search = GridSearchCV(rf__scale_pipeline, param_grid, cv=3, refit=True)

rf_grid_search.fit(X_train, y_train)

rf_final_model = rf_grid_search.best_estimator_

# Predict once; both metrics (and the optional log) reuse the same predictions.
rf_test_pred = rf_final_model.predict(X_test)
rf_confusion = confusion_matrix(y_test, rf_test_pred)
rf_accuracy = accuracy_score(y_test, rf_test_pred)

print(rf_confusion)

print(rf_accuracy)

# Optional result log. This used to be disabled by wrapping it in a bare string
# literal, which made the string itself the cell's output. It is now an explicit
# opt-in switch, and the file is closed by the context manager even on error.
SAVE_RESULTS = False
if SAVE_RESULTS:
    with open("results.txt", "a") as results_file:
        results_file.write(str(rf_final_model) + "\n")
        results_file.write(str(rf_confusion) + "\n")
        results_file.write(str(rf_accuracy) + "\n")
        results_file.write("\n")

In [None]:
# Intervention combinations - full
#
# Every 0/1 vector of length 16 with at most two entries switched on, in the
# order itertools.product generates them; one row per vector.

all_binary_vectors = itertools.product((0, 1), repeat=16)
combinations = list(filter(lambda bits: sum(bits) <= 2, all_binary_vectors))

combo_df = pd.DataFrame(combinations)

# Optional on-disk cache of the combination table (disabled):
#combo_df.to_pickle("./full_combo.pkl")
#unpickled_df = pd.read_pickle("./full_combo.pkl")

In [None]:
# Scratch check: a list indexer (double brackets) on .iloc yields a one-row DataFrame.
type(X_test.iloc[[75]])

In [None]:
# Same scratch check against the full feature matrix.
type(X.iloc[[75]])

In [None]:
# Row at position 93182 of the full feature matrix; a scalar indexer returns a Series.
# NOTE(review): not referenced by any later cell visible here — confirm it is still needed.
user_2_data = X.iloc[93182]

In [None]:
# Held-out row at position 5631, kept as a one-row DataFrame (list indexer) so the
# column labels survive for the steps below.
user_data = X_test.iloc[[5631]]

In [None]:
# Keep only the columns after the first 16. The labels to drop are passed
# explicitly; handing a DataFrame slice to `drop` only worked because iterating
# a DataFrame happens to yield its column labels.
# NOTE(review): assumes the first 16 columns are the intervention flags — confirm against frequent_i.
user_data_fixed = user_data.drop(columns=user_data.columns[:16])

In [None]:
# Create user dataframe to match combosize
#
# Repeat the user's fixed features once per combination row so that the two
# tables can be joined row by row.
n_combo_rows = len(combo_df.index)
repeated_user_values = np.repeat(user_data_fixed.values, n_combo_rows, axis=0)

user_fixed_df = pd.DataFrame(repeated_user_values, columns=user_data_fixed.columns)

In [None]:
# The combination table carries integer column labels (0, 1, ...) while the model
# was fitted on named columns. Recent scikit-learn versions reject input whose
# feature names are of mixed type or differ from those seen during fit, so the
# combination columns are given the names of the leading training columns.
# NOTE(review): assumes the leading columns of X_train are the intervention flags — confirm.
combo_named = combo_df.copy()
combo_named.columns = X_train.columns[:combo_named.shape[1]]

predict_df = combo_named.join(user_fixed_df)

# The model expects exactly the training layout.
assert list(predict_df.columns) == list(X_train.columns), "predict_df columns differ from the training columns"

In [None]:
# Predicted class probabilities for every candidate row, wrapped in a DataFrame.
class_probabilities = best_model.predict_proba(predict_df)
probabilities = pd.DataFrame(class_probabilities)

In [None]:
# Name the two probability columns.
# NOTE(review): predict_proba orders its columns like the estimator's classes_; this
# labelling assumes the unsuccessful class comes first — confirm.
probabilities.columns = ["unsuccessful", "successful"]

In [None]:
# Attach the probabilities to the rows they were predicted from (join on the row index).
results_df = predict_df.join(probabilities)

In [None]:
# Give the leading result columns readable intervention names.
# Assigning `.columns` on `results_df.iloc[:, 0:16]` only renames a temporary
# copy and leaves results_df untouched, so the full column index is rebuilt instead.
# NOTE(review): labels are applied by position — confirm the order matches the
# order of the intervention columns used to build the table.
intervention_labels = ['i_job_search_techn',
 'i_train_active_life',
 'i_tutoring_in_individual_job_search',
 'i_professional_internships',
 'i_internship_job',
 'i_employment-insertion_contract',
 'i_employment_contract_insertion',
 'i_information_job_eval_orientat',
 'i_collective_orientation_session',
 'i_tutoring_in_collective_job_search',
 'i_efa_s3_type_a',
 'i_modular',
 'i_efa_n3_vocational_train',
 'i_assertive_communication_job_search_techn',
 'i_entrepreneurship_skills_job_search_techn',
 'i_train_external_entities']

assert results_df.shape[1] >= len(intervention_labels), "fewer columns than intervention labels"

# Positional relabelling: re-running the cell gives the same result.
results_df.columns = intervention_labels + results_df.columns[len(intervention_labels):].tolist()

In [None]:
# Rank the candidate rows by predicted probability of success, highest first.
ranked_results = results_df.sort_values(by="successful", ascending=False)
display(ranked_results)

In [None]:
# Same ranked view as the preceding cell (the call is repeated verbatim).
display(results_df.sort_values(by=['successful'], ascending=False))