## Notebook Magic

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension.
# NOTE(review): no `%autoreload` mode is selected in this cell, so automatic
# module reloading may not actually be enabled -- confirm.
%load_ext autoreload

## Imports

In [None]:
import os
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import pickle
import boto3
import time

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc, roc_curve

from sklearn.linear_model import LogisticRegression, Lasso

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import randint

import itertools

from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.width = 0

In [None]:
# Read the training split from the project S3 bucket into a DataFrame.
df_train = pd.read_parquet("s3://iefp-unemployment/copy_test_train/train.parquet")

In [None]:
# Read the test split from the project S3 bucket into a DataFrame.
# NOTE(review): df_test is not referenced again in the cells visible here.
df_test = pd.read_parquet("s3://iefp-unemployment/copy_test_train/test.parquet")

In [None]:
# Separate the target column from the features; the raw `ttj` column is
# removed from the features together with the target itself.
y_train = df_train["ttj_sub_12"]
X_train = df_train.drop(columns=["ttj", "ttj_sub_12"])

In [None]:
# Download the pickled model object from S3 and deserialize it in memory.
# NOTE: pickle.loads can execute arbitrary code while deserializing, so this
# is only safe as long as the contents of the bucket are trusted.
s3 = boto3.client('s3')
response = s3.get_object(Bucket='iefp-unemployment', Key='models/2019/08/07/random_forest_T175519.pkl')
model = pickle.loads(response["Body"].read())


In [None]:
# Load the raw intervention permutations from a local pickle file.
# NOTE(review): presumably a collection of `bin()`-style strings, judging by
# how generate_permutation_df consumes them -- confirm against the file.

with open('../src/iefp/modelling/100k_combinations.pkl', 'rb') as f:
   a = pickle.load(f)

In [None]:
def generate_permutation_df(raw_permutations, zero_columns, max_interventions):
    """Build a DataFrame of candidate intervention combinations.

    Each entry of ``raw_permutations`` is a string whose first two characters
    are discarded (the ``0b`` prefix of a ``bin()``-style literal) and whose
    remaining characters are each converted to a float, giving one row per
    string.

    Args:
        raw_permutations: Iterable of prefixed digit strings, e.g. ``"0b101"``.
        zero_columns: Number of all-zero float columns to append on the right.
            A falsy value (``0`` or ``None``) appends nothing.
        max_interventions: Combinations whose digits sum to more than this
            value are dropped.

    Returns:
        A ``pandas.DataFrame`` of floats with one row per kept combination
        and integer column labels.
    """
    # Strip the two-character binary prefix. Local names deliberately avoid
    # shadowing the ``bin`` builtin.
    digit_strings = [raw[2:] for raw in raw_permutations]

    # One float array per combination, one element per digit.
    digit_arrays = [
        np.array([float(digit) for digit in digits]) for digits in digit_strings
    ]

    # Limit the number of interventions per journey; ndarray.sum avoids a
    # Python-level loop over every element.
    limited_combos = [
        combo for combo in digit_arrays if combo.sum() <= max_interventions
    ]

    # Rows shorter than the longest one are padded on the RIGHT with NaN by the
    # DataFrame constructor and then zero-filled.
    # NOTE(review): right-padding means "0b1" and "0b100" yield the same row;
    # confirm this is intended rather than left-padding to a fixed width.
    permutations = pd.DataFrame(limited_combos).fillna(0.0)

    # Append the requested number of all-zero columns and renumber the columns.
    if zero_columns:
        height = len(permutations.index)
        zero_block = pd.DataFrame(
            0.0, index=range(height), columns=range(zero_columns), dtype=float
        )
        permutations = pd.concat([permutations, zero_block], axis=1, ignore_index=True)

    return permutations

In [None]:
def get_user_recommendations(journey_series, index, model, permutations, number_results):
    """Score every intervention permutation for one journey and keep the best.

    Args:
        journey_series: One journey row as a ``pandas.Series``. It must contain
            the ``ttj`` and ``ttj_sub_12`` entries, which are removed before
            prediction together with every entry whose label contains ``"i_"``.
        index: Identifier written to the ``journey_id`` column of the result.
        model: Object exposing ``predict_proba``; its output is labelled as
            two columns, ``unsuccessful`` and ``successful``, in that order.
        permutations: DataFrame with one candidate combination per row. It is
            joined on the index, so a default 0..n-1 index is expected.
        number_results: Number of highest-scoring rows to return.

    Returns:
        A DataFrame holding the ``number_results`` rows with the highest
        ``successful`` value, including the model inputs, both probability
        columns and a ``journey_id`` column.
    """
    # Repeat the journey once per permutation so both frames have equal height.
    n_permutations = len(permutations.index)
    tiled_values = np.tile(journey_series.values, (n_permutations, 1))
    journey_fixed_df = pd.DataFrame(tiled_values, columns=journey_series.index)

    # Strip interventions and output variables from the journey frame.
    # NOTE(review): `"i_" in col` matches the substring anywhere in the label,
    # not only as a prefix -- confirm no other column contains "i_".
    intervention_cols = [col for col in journey_fixed_df.columns if "i_" in col]
    journey_fixed_df = journey_fixed_df.drop(
        intervention_cols + ["ttj", "ttj_sub_12"], axis="columns"
    )

    # Join the remaining journey features with the permutation frame.
    predict_df = journey_fixed_df.join(permutations)

    # Predict for all permutations.
    probabilities = pd.DataFrame(model.predict_proba(predict_df))
    probabilities.columns = ["unsuccessful", "successful"]

    results_df = predict_df.join(probabilities)

    # Take an explicit copy before adding a column: assigning onto an iloc
    # slice of the sorted frame is a chained-assignment hazard.
    top_n = (
        results_df.sort_values(by=["successful"], ascending=False)
        .iloc[0:number_results, :]
        .copy()
    )
    top_n["journey_id"] = index

    return top_n

In [None]:
def gen_top_recommendations_table(df, model, permutations, number_results):
    """Collect the top recommendations for every journey in ``df``.

    Args:
        df: DataFrame with one journey per row; each row is passed to
            ``get_user_recommendations`` together with its index label.
        model: Model forwarded unchanged to ``get_user_recommendations``.
        permutations: Candidate combinations forwarded unchanged.
        number_results: Number of recommendations kept per journey.

    Returns:
        The per-journey result frames stacked vertically, or an empty
        DataFrame when ``df`` has no rows.
    """
    # Gather the per-journey frames first and concatenate once; calling
    # pd.concat inside the loop re-copies all accumulated rows on every pass.
    per_journey = [
        get_user_recommendations(journey, index, model, permutations, number_results)
        for index, journey in df.iterrows()
    ]

    # pd.concat raises on an empty list, so keep the empty-input result explicit.
    if not per_journey:
        return pd.DataFrame()

    return pd.concat(per_journey)

In [None]:
# Take the first 10 rows as a small sample for a quick end-to-end run.
# NOTE: despite the name, this sample comes from df_train, not df_test.
small_test = df_train.head(10)

In [None]:
# Time one full recommendation run over the small sample and print the elapsed
# seconds. perf_counter() is monotonic and intended for measuring intervals,
# unlike the wall-clock time.time().
# Arguments used: 20 zero-padding columns, at most 3 interventions per
# combination, and 20 rows kept per journey.
# NOTE(review): the name `top_5_10` suggests 5 results per journey, but 20 are
# requested here -- confirm which is intended.
start = time.perf_counter()
top_5_10 = gen_top_recommendations_table(small_test, model, generate_permutation_df(a, 20, 3), 20)
print(time.perf_counter() - start)

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the recommendations table.
top_5_10.info()

In [None]:
# Sum every column per journey across that journey's recommended rows.
# NOTE(review): the sum also covers the repeated journey feature columns and
# the two probability columns, which may not be meaningful -- confirm.
matrix = top_5_10.groupby("journey_id").sum()

In [None]:
# Display the per-journey aggregate as the cell output.
matrix