In [4]:
import pandas as pd
import os
import yaml
from sklearn.metrics import confusion_matrix

# Load the dataset configuration and keep a handle on its categorical mapping.
data_cfg_path = '../alert_data/dataset_cfg.yaml'
with open(data_cfg_path, 'r') as cfg_file:
    data_cfg = yaml.safe_load(cfg_file)

# Entry stored under the 'categorical_dict' key of the configuration.
cat_dict = data_cfg['categorical_dict']

# Column layout of the per-environment results table.
RESULT_COLUMNS = ['training_seed',
                  'method',
                  'batch_size',
                  'batch_seed',
                  'team',
                  'distribution',
                  'distribution_seed',
                  'n_errors',
                  'tp',
                  'fp',
                  'fn',
                  'tn',
                  'tpr',
                  'fpr',
                  'fpr_disp']

RESULTS_ROOT = '../../FiFAR/deferral/results'
# Methods whose results sit one directory level deeper (one folder per training seed).
SEEDED_METHODS = ['OvA', 'Random', 'DeCCaF']
# Every environment is recorded with the same batch size and batch seed.
BATCH_SIZE = 4457
BATCH_SEED = 1


def _parse_environment(cc):
    """Split a result directory name into (team, distribution, distribution_seed).

    The part after '#' is split on '-': the first piece is the team and the
    second piece is either 'hom' or a tag whose second '_'-separated field is
    recorded as the distribution seed.
    NOTE(review): presumably names look like '<batch>#<team>-hom' or
    '<batch>#<team>-<tag>_<seed>' -- confirm against the results folder.
    """
    env = cc.split('#')[1]
    env_parts = env.split('-')
    team, dist_tag = env_parts[0], env_parts[1]
    if dist_tag == 'hom':
        return team, 'hom', 'NA'
    return team, 'var', dist_tag.split('_')[1]


def _false_positive_rate(pred, label):
    """Return FP / (FP + TN), treating 1 as the positive and 0 as the negative class."""
    false_pos = ((pred == 1) & (label == 0)).astype(int).sum()
    true_neg = ((pred == 0) & (label == 0)).astype(int).sum()
    return false_pos / (false_pos + true_neg)


def _evaluate_environment(env_dir, test):
    """Compute the metrics for one result directory.

    Reads '<env_dir>/results.parquet', takes its 'prediction' column and
    compares it with `test['fraud_bool']` on the same index.

    Returns a dict with keys n_errors, tp, fp, fn, tn, tpr, fpr, fpr_disp.
    """
    reviews = pd.read_parquet(f'{env_dir}/results.parquet')['prediction']
    sub_test = test.loc[reviews.index, :]
    labels = sub_test['fraud_bool']

    # Despite its name, 'n_errors' is the *fraction* of misclassified instances.
    n_errors = (reviews != labels).astype(int).mean()
    # labels=[0, 1] guarantees a 2x2 matrix even if only one class is present,
    # so the four-way unpacking cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_true=labels, y_pred=reviews, labels=[0, 1]).ravel()

    # Build the age masks from the aligned subset rather than the full test frame.
    age = sub_test['customer_age']
    old_mask = age >= 50
    yng_mask = age < 50
    fpr_old = _false_positive_rate(reviews.loc[old_mask], labels.loc[old_mask])
    fpr_yng = _false_positive_rate(reviews.loc[yng_mask], labels.loc[yng_mask])

    return {'n_errors': n_errors,
            'tp': tp,
            'fp': fp,
            'fn': fn,
            'tn': tn,
            'tpr': tp/(tp+fn),
            'fpr': fp/(fp+tn),
            # Ratio of the FPR for customer_age < 50 to the FPR for customer_age >= 50.
            'fpr_disp': fpr_yng/fpr_old}


alerts = pd.read_parquet('../../FiFAR/alert_data/processed_data/alerts.parquet')
test = alerts.loc[alerts['month'] == 7]

# Collect one record per environment and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and row-wise appending is quadratic.
env_records = []
for method in sorted(os.listdir(RESULTS_ROOT)):
    method_dir = f'{RESULTS_ROOT}/{method}'
    if method in SEEDED_METHODS:
        runs = [(seed, f'{method_dir}/{seed}') for seed in sorted(os.listdir(method_dir))]
    else:
        # Unseeded methods keep their environments directly under the method folder.
        runs = [('NA', method_dir)]

    for training_seed, run_dir in runs:
        for cc in sorted(os.listdir(run_dir)):
            team, distribution, distribution_seed = _parse_environment(cc)
            record = {'training_seed': training_seed,
                      'method': method,
                      'batch_size': BATCH_SIZE,
                      'batch_seed': BATCH_SEED,
                      'team': team,
                      'distribution': distribution,
                      'distribution_seed': distribution_seed}
            record.update(_evaluate_environment(f'{run_dir}/{cc}', test))
            env_records.append(record)

test_env_df = pd.DataFrame(env_records, columns=RESULT_COLUMNS)

In [5]:
# Cost-weighted loss per environment: every false negative counts 1 and every
# false positive counts 0.057.
weighted_false_positives = 0.057*test_env_df['fp']
test_env_df['loss'] = (test_env_df['fn'] + weighted_false_positives).astype(float)

In [8]:
import numpy as np

# Mean, count and standard deviation of the loss per (method, team).
# 'loss' is selected *before* aggregating: averaging the whole frame would also
# try to average the string columns, which raises in pandas >= 2.0.
temp = test_env_df.groupby(['method','team'])['loss'].agg(['mean','count','std'])

# Long-format mean loss (columns: method, team, loss), rounded to one decimal.
a = temp['mean'].round(1).rename('loss').reset_index()
# Long-format 95% normal-approximation CI half-width; the value column is named 0.
b = (temp['std']*1.96/np.sqrt(temp['count'])).round(2).reset_index()

# Wide views (rows: team, columns: method) so the table is filled by team label
# rather than by row position.
mean_by_method = temp['mean'].round(1).unstack('method')
ci_by_method = (temp['std']*1.96/np.sqrt(temp['count'])).round(2).unstack('method')


def _mean_pm_ci(method):
    """Return a per-team Series of LaTeX strings '$<mean> \\pm <ci>$' for `method`."""
    mean_str = mean_by_method[method].astype(str)
    ci_str = ci_by_method[method].astype(str)
    return '$' + mean_str + r" \pm " + ci_str + '$'


table = pd.DataFrame(index = a['team'].unique())
table['Full Rejection'] = mean_by_method['Full_Rej']
table['Only Classifier h'] = mean_by_method['Only_Classifier']
table['Random Deferral'] = _mean_pm_ci('Random')
table['OvA'] = _mean_pm_ci('OvA')
table['DeCCaF'] = _mean_pm_ci('DeCCaF')
table

Unnamed: 0,Full Rejection,Only Classifier h,Random Deferral,OvA,DeCCaF
team_1,213.5,204.0,$169.6 \pm 2.53$,$151.7 \pm 1.99$,$138.1 \pm 5.14$
team_2,213.5,204.0,$157.8 \pm 1.39$,$142.0 \pm 1.61$,$145.3 \pm 4.28$
team_3,213.5,204.0,$151.2 \pm 1.84$,$131.3 \pm 2.19$,$126.2 \pm 4.52$
team_4,213.5,204.0,$163.1 \pm 1.61$,$145.8 \pm 3.51$,$141.6 \pm 5.32$
team_5,213.5,204.0,$163.3 \pm 1.58$,$141.2 \pm 2.57$,$132.0 \pm 1.93$


In [9]:
# Relative reduction of the team-averaged loss of OvA with respect to Random deferral.
random_mean_loss = a.loc[a['method'] == 'Random']['loss'].mean()
ova_mean_loss = a.loc[a['method'] == 'OvA']['loss'].mean()
(random_mean_loss - ova_mean_loss)/random_mean_loss

0.11552795031055897

In [11]:
# Relative reduction of the team-averaged loss of DeCCaF with respect to Random deferral.
random_mean_loss = a.loc[a['method'] == 'Random']['loss'].mean()
deccaf_mean_loss = a.loc[a['method'] == 'DeCCaF']['loss'].mean()
(random_mean_loss - deccaf_mean_loss)/random_mean_loss

0.15130434782608704

In [14]:


import pickle
from sklearn.metrics import roc_auc_score
from sklearn.calibration import calibration_curve
import numpy as np
# SECURITY NOTE: pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open('../../FiFAR/deferral/l2d_predictions/ova.pkl', 'rb') as infile:
    ova = pickle.load(infile)
with open('../../FiFAR/deferral/l2d_predictions/deccaf.pkl', 'rb') as infile:
    deccaf = pickle.load(infile)

data = pd.read_parquet('../../FiFAR/alert_data/processed_data/alerts.parquet')
val = data.loc[data['month'] == 6]

# Cost assigned to a label-0 instance; label-1 instances get cost 1.
l = 0.057
# Mean per-instance cost on the month-6 split, and each class' share of it.
e_c = val['fraud_bool'].replace([0,1], [l,1]).mean()
reb_1 = (val['fraud_bool'].mean()/e_c)
reb_0 = (1-val['fraud_bool']).mean()*l/e_c

test = data.loc[data['month'] == 7]
# Keep every label-0 instance and resample label-1 instances (with replacement)
# so that the class counts follow the reb_1 / reb_0 ratio.
nmin = len(test.loc[test['fraud_bool'] == 0])
nmax = int(nmin*reb_1/reb_0)
preds = pd.read_parquet('../../FiFAR/synthetic_experts/expert_predictions.parquet').loc[data.index]
ece = pd.DataFrame()
a = preds.loc[test.index]
b = test['fraud_bool']
oversampled = pd.concat([test.loc[test['fraud_bool'] == 0], test.loc[test['fraud_bool'] == 1].sample(replace=True, n = nmax, random_state=42)]).index


def _auc_and_ece(outcomes, scores):
    """Return (ROC AUC, mean absolute calibration gap) of `scores` against `outcomes`.

    The calibration gap is the unweighted mean of |prob_true - prob_pred| over
    the bins returned by calibration_curve with 10 quantile bins.
    """
    auc_value = roc_auc_score(y_true = outcomes, y_score = scores)
    prob_true, prob_pred = calibration_curve(y_true = outcomes, y_prob = scores, strategy='quantile', n_bins = 10)
    return auc_value, np.mean(np.abs(prob_true - prob_pred))


# One record per (seed, expert); the frame is built once instead of being
# enlarged cell by cell with auc.loc[i, col] = value.
auc_records = []
for seed in ova:
    batch, team = seed.split('#')
    for expert in ova[seed].columns.drop('classifier_h'):
        # 1 where the expert's prediction matches the label, 0 otherwise.
        outcomes = (a.loc[oversampled,expert] == b.loc[oversampled]).astype(int)
        auc_ova, ece_ova = _auc_and_ece(outcomes, ova[seed].loc[oversampled,expert])
        auc_deccaf, ece_deccaf = _auc_and_ece(outcomes, deccaf[seed].loc[oversampled,expert])
        auc_records.append({'batch': batch,
                            'team': team,
                            'expert': expert,
                            'auc_ova': auc_ova,
                            'ece_ova': ece_ova,
                            'auc_deccaf': auc_deccaf,
                            'ece_deccaf': ece_deccaf})

auc = pd.DataFrame(auc_records,
                   columns = ['batch','team','expert','auc_ova','ece_ova','auc_deccaf','ece_deccaf'])
# Keep the 1-based row labels of the original row-by-row construction.
auc.index = pd.RangeIndex(1, len(auc) + 1)

In [15]:
# Relative gain in mean AUC of DeCCaF over OvA.
# The columns are selected before averaging: auc also holds string columns
# (batch, team, expert), and DataFrame.mean() over them raises in pandas >= 2.0.
mean_auc_ova = auc['auc_ova'].mean()
mean_auc_deccaf = auc['auc_deccaf'].mean()
(mean_auc_deccaf - mean_auc_ova)/mean_auc_ova

  """Entry point for launching an IPython kernel.


0.03990758522753857

In [16]:
# Relative reduction in mean ECE of DeCCaF with respect to OvA.
# The columns are selected before averaging: auc also holds string columns
# (batch, team, expert), and DataFrame.mean() over them raises in pandas >= 2.0.
mean_ece_ova = auc['ece_ova'].mean()
mean_ece_deccaf = auc['ece_deccaf'].mean()
(mean_ece_ova - mean_ece_deccaf)/mean_ece_ova

  """Entry point for launching an IPython kernel.


0.1750215570868043

In [17]:
# Per-team summary, one column per metric, formatted as '$<mean>\pm<ci>$' where
# ci is the 95% normal-approximation half-width (1.96 * std / sqrt(count)).
# Each entry is (column in auc, scale factor, decimals): AUC is reported as is
# with 2 decimals, ECE is multiplied by 100 and reported with 1 decimal.
metric_formats = [('auc_ova', 1, 2),
                  ('ece_ova', 100, 1),
                  ('auc_deccaf', 1, 2),
                  ('ece_deccaf', 100, 1)]

table = pd.DataFrame()
for metric, scale, decimals in metric_formats:
    temp = auc.groupby(by = ['team'])[metric].agg(['mean','count','std'])
    mean = temp['mean']*scale
    ci = (temp['std']*scale*1.96/np.sqrt(temp['count'])).round(decimals)
    # Raw string: "\p" is not a valid escape sequence in a normal string literal.
    table[metric] = '$' + mean.round(decimals).astype(str) + r"\pm" + ci.astype(str) + '$'

In [19]:
# Display the per-team AUC / ECE summary table built in the previous cell.
table

Unnamed: 0_level_0,auc_ova,ece_ova,auc_deccaf,ece_deccaf
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
team_1,$0.61\pm0.01$,$6.0\pm0.5$,$0.64\pm0.02$,$4.6\pm0.5$
team_2,$0.58\pm0.01$,$6.1\pm0.6$,$0.6\pm0.01$,$4.9\pm0.4$
team_3,$0.6\pm0.02$,$5.5\pm0.5$,$0.62\pm0.02$,$5.3\pm0.6$
team_4,$0.59\pm0.01$,$5.3\pm0.6$,$0.6\pm0.01$,$3.9\pm0.4$
team_5,$0.59\pm0.01$,$5.6\pm0.7$,$0.61\pm0.01$,$4.9\pm0.6$


In [20]:
# Alerts of month 7 and the expert predictions for the same rows.
test = alerts.loc[alerts['month'] == 7]
exp_pred = pd.read_parquet('../../FiFAR/synthetic_experts/expert_predictions.parquet').loc[test.index]


# Selected classifier and its stored properties.
# SECURITY NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("../../FiFAR/classifier_h/selected_model/best_model.pickle", 'rb') as model_file:
    classifier_h = pickle.load(model_file)
with open("../../FiFAR/classifier_h/selected_model/model_properties.yaml", 'r') as properties_file:
    classifier_h_properties = yaml.safe_load(properties_file)

def sig(x):
    """Logistic sigmoid 1 / (1 + exp(-x)), applied through numpy's exp."""
    denominator = 1 + np.exp(-x)
    return 1/denominator

def output(data, model, init_score):
    """Return sig(raw model score + init_score) for `data`.

    `model` must expose predict(data, raw_score=True); `init_score` is added to
    the raw scores before the sigmoid is applied.
    """
    raw_scores = model.predict(data, raw_score=True)
    return sig(raw_scores + init_score)

# Model inputs: the test rows without the label, the stored score and the month.
X_test = test.drop(columns = ['fraud_bool','model_score','month'])

# Classifier scores for every test row, indexed like the test set.
init_score = classifier_h_properties['init_score']
a = pd.Series(output(X_test, classifier_h, init_score), index = X_test.index)

# `b` (labels) and `oversampled` (resampled index) come from the earlier
# calibration cell.
outcomes = b.loc[oversampled]
scores_oversampled = a.loc[oversampled]
prob_true, prob_pred = calibration_curve(y_true = outcomes, y_prob = scores_oversampled, strategy='quantile', n_bins = 10)

# ROC AUC, then the mean absolute calibration gap scaled by 100.
classifier_auc = roc_auc_score(y_true = outcomes, y_score = scores_oversampled)
classifier_ece_pct = np.mean(np.abs(prob_true - prob_pred))*100

print(classifier_auc)
print(classifier_ece_pct)

0.7072718707150493
4.843789647446122
