In [None]:
import os
import yaml


import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import numpy as np
import pickle
import boto3
import time
import math

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc, roc_curve

from sklearn.linear_model import LogisticRegression, Lasso

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import randint

import itertools

from iefp.recommendation import *

from IPython.display import display
# DataFrame display configuration: no limit on the number of columns shown.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 0)

In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
# Load the train and test parquet files from S3 (train first, then test).
df_train, df_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "s3://iefp-unemployment/copy_test_train/train_T180509.parquet",
        "s3://iefp-unemployment/copy_test_train/test_T180509.parquet",
    )
)

In [None]:
# Load a second test parquet file from the modelling/ prefix of the same bucket.
# NOTE(review): `new_df_test` is not referenced by any later cell visible here —
# confirm whether this load is still needed.
new_df_test = pd.read_parquet("s3://iefp-unemployment/modelling/test.parquet")

In [None]:
# Download the pickled object from S3 and deserialize it into `model`.
# SECURITY: pickle.loads executes arbitrary code embedded in the payload; only
# load objects from a bucket whose writers are trusted.
s3 = boto3.client('s3')
response = s3.get_object(Bucket='iefp-unemployment', Key='copy_test_train/random_forest_T180509.pkl')
model = pickle.loads(response["Body"].read())

In [None]:
def _add_age_buckets(frame):
    """Return a copy of ``frame`` with one boolean column per ``d_age`` bucket.

    Each bucket is the half-open interval ``[lower, upper)`` on ``d_age``;
    the last bucket ("old") has no upper bound. ``frame`` itself is left
    untouched.
    """
    age_buckets = (
        ("youngest", 0.0, 0.2),
        ("young_adult", 0.2, 0.3),
        ("adult", 0.3, 0.4),
        ("middle_adult", 0.4, 0.5),
        ("older_adult", 0.5, 0.6),
        ("senior", 0.6, 0.7),
        ("old_senior", 0.7, 0.8),
        ("old", 0.8, None),
    )
    age = frame["d_age"]
    flags = {}
    for name, lower, upper in age_buckets:
        in_bucket = age >= lower
        if upper is not None:
            in_bucket = in_bucket & (age < upper)
        flags[name] = in_bucket
    return frame.assign(**flags)


def get_sub_group(df, observation):
    """Select the rows of ``df`` that share the observation's demographics.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate rows. Must contain ``d_age`` and every demographic flag
        column listed in ``dems`` below (except the age buckets, which are
        derived here). It is NOT modified.
    observation : pandas.Series
        A single row with the same ``d_age`` / demographic flag columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose demographic flags and age bucket all equal
        those of ``observation``. The frame carries the derived age-bucket
        columns and a fresh integer index.
    """
    # Turn the row Series into a one-row frame so it can be merged.
    journey = _add_age_buckets(pd.DataFrame(observation).T)
    # Work on a bucketed copy: the caller's frame must not grow extra columns
    # as a side effect of every lookup.
    candidates = _add_age_buckets(df)

    dems = ["d_gender_M", "d_disabled", "d_subsidy", "d_rsi_True",
            "youngest", "young_adult", "adult", "middle_adult",
            "older_adult", "senior", "old_senior", "old", "d_nationality_other",
            "d_school_qualification_2.0", "d_school_qualification_3.0",
            "d_school_qualification_4.0", "d_school_qualification_5.0",
            "d_school_qualification_6.0", "d_school_qualification_nan"]

    # Column-on-column inner join. `on` must not be combined with
    # `right_index=True`: current pandas rejects that with a MergeError.
    sub_group = candidates.merge(journey[dems], on=dems, how="inner")

    return sub_group

In [None]:
# Stack train and test rows into one frame (train first, original index labels
# kept). pd.concat is used because DataFrame.append no longer exists in
# pandas >= 2.0.
full_set = pd.concat([df_train, df_test])

In [None]:
def eval_recommendations(journey, full_dataset, recommendation_lst):
    """Absolute gap between the predicted and the observed success rate.

    Parameters
    ----------
    journey : pandas.Series
        The row the recommendations were produced for; forwarded to
        ``get_sub_group`` to find demographically matching rows.
    full_dataset : pandas.DataFrame
        Rows with the ``i_*`` intervention columns and ``ttj_sub_12``.
    recommendation_lst : list
        Intervention names followed by the predicted success probability as
        the LAST element. The list is not modified.

    Returns
    -------
    number
        ``1`` when ``recommendation_lst`` is empty; otherwise
        ``abs(predicted - average)``, where ``average`` is the mean over the
        recommended interventions of the ``ttj_sub_12`` rate among matching
        rows that took that intervention. Interventions nobody in the
        sub-group took are skipped; if none is left, a warning is printed and
        ``average`` is taken as ``0.0``.
    """
    if not recommendation_lst:
        return 1

    # Unpack rather than pop(): the caller usually passes a list that lives in
    # a DataFrame cell, and consuming it would corrupt that cell.
    *interventions, predicted_prob_success = recommendation_lst

    sub_group = get_sub_group(full_dataset, journey)

    mean_success_rate = []
    for intervention in interventions:
        # "some intervention name" -> "i_some_intervention_name"
        rec = "i_" + "_".join(intervention.split())
        took_rec = sub_group[sub_group[rec] == 1.0]
        if took_rec.empty:
            # The mean of an empty selection is NaN and would poison the
            # average below; there is simply no evidence for this one.
            continue
        mean_success_rate.append(took_rec["ttj_sub_12"].astype(int).mean())

    if not mean_success_rate:
        print("warning: no successful examples")
        average = 0.0
    else:
        average = sum(mean_success_rate) / len(mean_success_rate)

    return abs(predicted_prob_success - average)

In [None]:
# Remove the ttj / ttj_sub_12 columns from the test frame.
# errors="ignore" keeps this cell idempotent: running it again after the
# columns are gone is a no-op instead of a KeyError.
df_test = df_test.drop(columns=["ttj", "ttj_sub_12"], errors="ignore")

In [None]:
# Draw a reproducible 0.5% random sample of the test rows (fixed random_state).
sample = df_test.sample(frac=0.005, random_state=1)

In [None]:
# Sanity check: show the first row of what get_top_recommendations returns for
# the first sampled journey.
# NOTE(review): get_top_recommendations presumably comes from the star import of
# iefp.recommendation; the meaning of the positional arguments 3 and 1 is not
# visible here — confirm before changing them.
get_top_recommendations(model, sample.iloc[0], 3, 1).iloc[0]

In [None]:
# For every sampled journey, store the first row returned by
# get_top_recommendations as a plain Python list. eval_recommendations reads the
# last element of that list as the predicted success probability and the
# preceding elements as intervention names.
# NOTE(review): on a re-run the rows passed in also carry the "recommendations"
# (and later "rec_error") columns — confirm get_top_recommendations tolerates them.
sample["recommendations"] = sample.apply(lambda x: list(get_top_recommendations(model, x, 3, 1).iloc[0]), axis=1)

In [None]:
# Score each sampled journey against the outcomes observed in full_set.
# A copy of the stored list is handed over so that, whatever the callee does
# with its argument, the lists kept in sample["recommendations"] stay intact and
# this cell can be re-run safely.
sample["rec_error"] = sample.apply(
    lambda row: eval_recommendations(row, full_set, list(row["recommendations"])),
    axis=1,
)

In [None]:
# Save the sample (including the recommendations and rec_error columns) as a
# pickle file in the current working directory.
sample.to_pickle("./test_set_with_rec_error.pkl")

In [None]:
# Average rec_error over the sampled journeys.
sample['rec_error'].mean()