## Notebook Magic

In [None]:
%matplotlib inline
%load_ext autoreload

## Imports

In [None]:
import os
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import pickle
import boto3
import time

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc, roc_curve

from sklearn.linear_model import LogisticRegression, Lasso

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import randint

import itertools

from iefp.recommendation import *

from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.width = 0

from iefp.data.postgres import *
from iefp.data.constants import *
# Fetch five rows from the evaluation table via query_db. `results` is not
# referenced again in the visible part of this notebook (presumably a
# connectivity check -- confirm).
results = query_db("SELECT * FROM {} LIMIT 5;".format(Database.EVALUATION_TABLE))

In [None]:
# Read the T180509 train and test parquet files from S3 into DataFrames.
df_train = pd.read_parquet("s3://iefp-unemployment/copy_test_train/train_T180509.parquet")
df_test = pd.read_parquet("s3://iefp-unemployment/copy_test_train/test_T180509.parquet")

# Download the pickled object stored under the same prefix and deserialize it
# into `model`.
# NOTE(review): pickle.loads can execute arbitrary code embedded in the
# payload -- this is only safe while the bucket contents are trusted.
s3 = boto3.client('s3')
response = s3.get_object(Bucket='iefp-unemployment', Key='copy_test_train/random_forest_T180509.pkl')
model = pickle.loads(response["Body"].read())

In [None]:
# NOTE(review): the triple-quoted string below holds disabled copies of
# generate_combinations, get_user_recommendations and
# gen_top_recommendations_table. Nothing inside it is executed; the cell only
# evaluates a string literal. The gen_top_recommendations_table called later
# in this notebook presumably comes from `from iefp.recommendation import *`
# -- confirm, and check whether it still matches this copy.
'''

def generate_combinations(n,k):
    result = list()
    for i in range(1, k + 1):
        for bits in itertools.combinations(range(n), i):
                s = [0] * n
                for bit in bits:
                    s[bit] = 1
                result.append(s)

    return pd.DataFrame(result)

def get_user_recommendations(journey_series, model, max_interventions=3, number_results=10, index=1):
    # Get interventions to permute over
    interventions = [col for col in journey_series.axes[0] if col[0:2] == "i_"]
    journey_series.drop(labels=interventions + ["ttj", "ttj_sub_12"], inplace=True)
    
    # Generate permutation dataframe
    permutations = generate_combinations(len(interventions), max_interventions)
    
    # Create user dataframe to match permutation size and join with permutation df
    predict_df = pd.DataFrame(np.tile(journey_series.values,
                              len(permutations.index)).reshape(-1,len(journey_series.index)),
                              columns=journey_series.index).join(permutations)
    
    # Predict for all permutations
    probabilities = pd.DataFrame(model.predict_proba(predict_df))
    probabilities.columns = ["unsuccessful", "successful"]

    #Create final top_n dataframe
    results_df = predict_df.join(probabilities)
    
    top_n = results_df.sort_values(by=['successful'], ascending=False).iloc[0:number_results,:]
    top_n["journey_id"] = index

    return top_n

def gen_top_recommendations_table(df, model):
    # Prepare empty dataframe
    recommendations_df = pd.DataFrame()
    
    # Iterate through each user and append the top 5 recommendations to an empty dataframe
    for index, journey in df.iterrows():
        user_recommendations = get_user_recommendations(journey, model, index=index)
        recommendations_df = pd.concat([recommendations_df,user_recommendations])

    return recommendations_df

'''

In [None]:
# Time recommendation generation on a small sample of the test split.

# Only the last 10 test rows are used.
small_test = df_test.tail(10)

# Print the elapsed wall-clock seconds spent building the recommendations.
start = time.time()
top_5_10 = gen_top_recommendations_table(small_test, model)
print(time.time() - start)

In [None]:
# Display the recommendations generated for the 10-row sample.
top_5_10

In [None]:
# Sum the recommendation columns per journey_id, prefix the summed columns with
# "sum_", and attach them to the sample rows. DataFrame.join aligns on the
# index, so journey_id is assumed to hold small_test's row labels -- confirm.
matrix = small_test.join(top_5_10.groupby("journey_id").sum().add_prefix('sum_'))

In [None]:
# Display the sample rows together with their per-journey "sum_" columns.
matrix

In [None]:
# Stack the train and test splits into one frame, keeping each row's original
# index label. pd.concat is used because DataFrame.append was removed in
# pandas 2.0; with default arguments both produce the same result.
full_set = pd.concat([df_train, df_test])

In [None]:
# (column name, inclusive lower bound, exclusive upper bound) for each d_age
# band; an upper bound of None leaves the band open-ended. The thresholds are
# fractions, so d_age is presumably a scaled age -- confirm.
_AGE_BANDS = (
    ("youngest", 0.0, 0.2),
    ("young_adult", 0.2, 0.3),
    ("adult", 0.3, 0.4),
    ("middle_adult", 0.4, 0.5),
    ("older_adult", 0.5, 0.6),
    ("senior", 0.6, 0.7),
    ("old_senior", 0.7, 0.8),
    ("old", 0.8, None),
)


def _add_age_bands(frame):
    """Return a copy of ``frame`` with one boolean column per ``d_age`` band."""
    age = frame["d_age"]
    bands = {}
    for name, lower, upper in _AGE_BANDS:
        in_band = age >= lower
        if upper is not None:
            in_band = in_band & (age < upper)
        bands[name] = in_band
    # assign() builds a new frame, so the caller's object is left untouched.
    return frame.assign(**bands)


def get_sub_group(df, observation):
    """Select the rows of ``df`` that share ``observation``'s demographics.

    Both inputs get boolean age-band columns derived from ``d_age``; a row
    belongs to the sub group when every demographic column listed below,
    including the age bands, equals the observation's value.

    Args:
        df: DataFrame containing ``d_age`` and the demographic columns.
            It is not modified.
        observation: One row of such a frame (a Series indexed by column
            name).

    Returns:
        A new DataFrame holding the matching rows of ``df`` plus the age-band
        columns, in their original order and with their original index labels.
    """
    journey = _add_age_bands(pd.DataFrame(observation).T)
    candidates = _add_age_bands(df)

    dems = ["d_gender_M", "d_disabled", "d_subsidy", "d_rsi_True",
            "youngest", "young_adult", "adult", "middle_adult",
            "older_adult", "senior", "old_senior", "old", "d_nationality_other",
            "d_school_qualification_2.0", "d_school_qualification_3.0",
            "d_school_qualification_4.0", "d_school_qualification_5.0",
            "d_school_qualification_6.0", "d_school_qualification_nan"]

    # Merge on the key columns only. Passing `on` together with
    # `right_index=True` is rejected by current pandas (MergeError), so the
    # matching row positions are carried through the merge instead and used to
    # slice `candidates`, which keeps the rows' own index labels.
    keys = candidates[dems].copy()
    keys["_row"] = range(len(keys))
    matched = keys.merge(journey[dems], on=dems, how="inner")

    return candidates.iloc[matched["_row"].to_numpy()]

In [None]:
# Display matrix again before a row is picked from it below.
matrix

In [None]:
# Rows of full_set whose demographic columns and age band match the sample
# row at position 8 of matrix.
sub = get_sub_group(full_set, matrix.iloc[8])

In [None]:
# Keep only the sub-group rows whose i_professional_internships value is 1.0.
interns = sub[sub["i_professional_internships"] == 1.0]

In [None]:
# Mean of ttj_sub_12 (cast to int) over those rows.
interns['ttj_sub_12'].astype(int).mean()

In [None]:
def eval_recommendations(journey, full_dataset, recommendations):
    """Average the observed success rate of each recommended intervention.

    The journey's demographic sub group is looked up with ``get_sub_group``.
    For every column name in ``recommendations`` the sub-group rows where that
    column equals 1.0 are selected and the mean of their ``ttj_sub_12`` values
    (cast to int) is taken; the result is the plain average of those means.

    Args:
        journey: Single observation passed on to ``get_sub_group``.
        full_dataset: DataFrame searched for the sub group.
        recommendations: Iterable of column names present in ``full_dataset``.

    Returns:
        The averaged rate. NaN when ``recommendations`` is empty, and also
        when no sub-group row has one of the recommendations set (the mean of
        an empty selection is NaN and propagates through the sum).
    """
    sub_group = get_sub_group(full_dataset, journey)
    success_rates = [
        sub_group.loc[sub_group[rec] == 1.0, "ttj_sub_12"].astype(int).mean()
        for rec in recommendations
    ]
    # Nothing to average: report NaN rather than dividing by zero.
    if not success_rates:
        return float("nan")

    return sum(success_rates) / len(success_rates)