## Notebook Magic

In [2]:
%matplotlib inline
%load_ext autoreload

## Imports

In [1]:
import os
import itertools
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
from random import random
import seaborn as sns
import numpy as np

from iefp.data import postgres
from iefp.data import s3

In [2]:
def generate_combinations(n, k):
    """Enumerate every 0/1 vector of length n that has between 1 and k ones.

    Rows are ordered by number of ones (1 first, then 2, ... up to k) and,
    within each size, in the order produced by itertools.combinations over
    the positions 0..n-1.

    Args:
        n: Length of each indicator vector (number of columns).
        k: Largest number of positions set to 1 in a single row.

    Returns:
        DataFrame with one row per combination and n integer columns
        labelled 0..n-1 (an empty DataFrame when no combination exists).
    """
    indicator_rows = [
        [1 if position in chosen else 0 for position in range(n)]
        for size in range(1, k + 1)
        for chosen in itertools.combinations(range(n), size)
    ]
    return pd.DataFrame(indicator_rows)

In [3]:
# Load best model and test set.
# The middle element of the returned triple is not needed here. It is bound to
# a throwaway name instead of `_`, because assigning to `_` in IPython rebinds
# the last-output variable and stops it from being updated for the session.
model_path, _unused_path, test_path = postgres.get_best_model_paths()

rf = s3.read_object_from_s3(model_path)
df_test = pd.read_parquet(test_path)

In [4]:
# Generate intervention combinations: every way of switching on 1 to 3 of the
# intervention columns (those whose name starts with "i_").
intervention_names = [name for name in df_test.columns if name.startswith("i_")]
df_intervention_combinations = generate_combinations(
    len(intervention_names), 3
).rename(columns=dict(enumerate(intervention_names)))

In [5]:
# Input: one observation drawn at random from the test set.
# The draw is seeded so that Restart & Run All reproduces the same row and
# therefore the same recommendations; set the seed to None for a fresh draw.
OBSERVATION_SEED = 42
df_observation = (
    df_test
    .sample(1, random_state=OBSERVATION_SEED)
    # Remove the intervention flags (each candidate combination is attached
    # in a later cell) and the two ttj columns (presumably the outcome
    # columns - confirm against the training pipeline).
    .drop(columns=intervention_names + ["ttj_sub_12", "ttj"])
)

In [6]:
# Generate dataframe with fixed features and intervention combinations.
# pd.concat is used instead of DataFrame.append (deprecated in pandas 1.4 and
# removed in 2.0); it stacks one copy of the observation per candidate
# combination and renumbers the rows from 0.
# NOTE: this cell rebinds df_observation, so re-running it without first
# re-running the sampling cell stacks the already-expanded frame again.
df_observation = pd.concat(
    [df_observation] * len(df_intervention_combinations), ignore_index=True
)
# Index-on-index join: row i of the repeated observation is paired with
# combination i.
df_observation = df_observation.merge(df_intervention_combinations, right_index=True, left_index=True)

In [7]:
# Score every candidate row (the observation paired with one intervention
# combination) using the model loaded from S3. The result is consumed below as
# a two-column array labelled ("false", "true").
# NOTE(review): assumes the column order of df_observation matches the order
# the model was trained on - confirm.
result = rf.predict_proba(df_observation)

In [14]:
def get_top_interventions(X, y_probs, n):
    """Return the n intervention combinations with the highest predicted probability.

    Args:
        X: DataFrame of candidate rows. Columns whose name starts with "i_"
            are treated as intervention flags; a value of 1 marks an applied
            intervention.
        y_probs: Two-column array-like of class probabilities, one row per
            row of X and in the same order. The second column is the one
            used for ranking.
        n: Number of top-ranked rows to return.

    Returns:
        DataFrame with one row per recommendation, ordered by descending
        probability. It has one "intervention" column per slot (as many slots
        as the largest recommended combination; shorter combinations are
        padded with None) followed by a single "probability" column.
    """
    top_n_rec = (
        pd.DataFrame(y_probs, columns=["false", "true"])
        .sort_values("true", ascending=False)
        .head(n)
    )

    intervention_cols = [col for col in X.columns if col.startswith("i_")]
    # The index of top_n_rec holds row *positions* within y_probs, so select
    # positionally; label-based lookup is only right when X has a 0..N-1 index.
    df_result = X[intervention_cols].iloc[list(top_n_rec.index)]

    recommendations = []
    for flags, probability in zip(df_result.values, top_n_rec["true"]):
        # Pretty formatting: drop only the leading "i_" marker (interior
        # occurrences of "i_" belong to the name) and use spaces for underscores.
        names = [
            col[2:].replace("_", " ")
            for col, flag in zip(intervention_cols, flags)
            if flag == 1
        ]
        recommendations.append((names, probability))

    # Rows may recommend different numbers of interventions. Pad every row to
    # the widest one so the probability always lands in the last column.
    width = max((len(names) for names, _prob in recommendations), default=0)
    rec_interv = [
        names + [None] * (width - len(names)) + [probability]
        for names, probability in recommendations
    ]

    return pd.DataFrame(rec_interv, columns=(["intervention"] * width + ["probability"]))
    
# Display the 5 highest-probability intervention combinations for the sampled observation.
get_top_interventions(df_observation, result, 5)

Unnamed: 0,intervention,intervention.1,intervention.2,probability
0,tutoring in individual job search,employment-insertion contract,tutoring in collective job search,0.583261
1,tutoring in individual job search,employment-insertion contract,assertive communication job search techn,0.580327
2,tutoring in individual job search,paper subsidy ace,employment-insertion contract,0.580198
3,tutoring in individual job search,paecpe ace,employment-insertion contract,0.57996
4,tutoring in individual job search,internship job,employment-insertion contract,0.579907
