In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show
from interpret.perf import ROC
from interpret.data import ClassHistogram
from sklearn.metrics import confusion_matrix,roc_curve, auc

In [5]:
# Load the raw Santander training table from its path relative to this notebook.
df_train_csv = '../../../data/santander/raw/train.csv'
df_train = pd.read_csv(filepath_or_buffer=df_train_csv)

# Show the table dimensions followed by a preview of the first rows.
display(df_train.shape, df_train.head())

(200000, 202)

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [6]:
# Features are every column from position 2 onwards (the first two columns
# are the row identifier and the label); the label is the `target` column.
X = df_train.iloc[:, 2:]
y = df_train['target']

# Hold out 33% of the rows for validation, stratified on the label so both
# splits keep the same class balance; the seed makes the split repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

split_report = "Size of dataset: Train {} Validation {} ".format(x_train.shape, x_valid.shape)
print(split_report)

Size of dataset: Train (134000, 200) Validation (66000, 200) 


In [7]:
# Explainable Boosting Machine: build with a fixed seed, then train on the
# training split. The fit call stays last so the fitted estimator is echoed.
ebm = ExplainableBoostingClassifier(random_state=42)
ebm.fit(X=x_train, y=y_train)

ExplainableBoostingClassifier(feature_names=['var_0', 'var_1', 'var_2', 'var_3',
                                             'var_4', 'var_5', 'var_6', 'var_7',
                                             'var_8', 'var_9', 'var_10',
                                             'var_11', 'var_12', 'var_13',
                                             'var_14', 'var_15', 'var_16',
                                             'var_17', 'var_18', 'var_19',
                                             'var_20', 'var_21', 'var_22',
                                             'var_23', 'var_24', 'var_25',
                                             'var_26', 'var_27', 'var_28',
                                             'var_29', ...],
                              feature_types=['continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                   

In [8]:
# Build a ROC performance explanation from the model's class probabilities
# on the validation split, then render it.
roc_explainer = ROC(ebm.predict_proba)
ebm_perf = roc_explainer.explain_perf(x_valid, y_valid, name='EBM')
show(ebm_perf)

In [9]:
# Global explanation of the fitted EBM (model-wide view of its terms),
# rendered with interpret's `show`.
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

In [10]:
# Score the validation split: per-class probabilities and hard labels, each
# wrapped in a DataFrame with a fresh 0..n-1 index.
valid_proba = ebm.predict_proba(x_valid)
valid_labels = ebm.predict(x_valid)
y_pred_prob_valid = pd.DataFrame(valid_proba)
y_pred_labels_valid = pd.DataFrame(valid_labels)

In [11]:
# Preview the validation probabilities: column 0 / column 1 are the two class probabilities.
y_pred_prob_valid.head()

Unnamed: 0,0,1
0,0.923276,0.076724
1,0.526616,0.473384
2,0.720436,0.279564
3,0.762979,0.237021
4,0.997693,0.002307


In [12]:
# Preview the true validation labels; the index keeps the original row positions from df_train.
y_valid.head()

189938    0
159231    1
137837    0
21401     1
105303    0
Name: target, dtype: int64

In [13]:
# Local (per-row) explanations for only the first five validation rows,
# rendered with interpret's `show`.
ebm_local = ebm.explain_local(x_valid[:5], y_valid[:5], name='EBM')
show(ebm_local)

In [14]:
# Class histogram of the training data, shown together with the global,
# performance and local explanations in one view with shared tables.
hist = ClassHistogram().explain_data(x_train, y_train, name = 'train_data')
dashboard_items = [hist, ebm_global, ebm_perf, ebm_local]
show(dashboard_items, share_tables=True)

In [15]:
# Local explanations for every row of the validation split (not just the
# five-row sample held in `ebm_local`).
ebm_local_tot = ebm.explain_local(x_valid, y_valid, name='EBM')

# Rendering the full-split explanation is left disabled.
# show(ebm_local_tot)

In [16]:

# Score ALL rows of X. X contains the rows the model was trained on, so the
# counts and confusion matrix below are not a held-out estimate.
all_proba = ebm.predict_proba(X)
y_pred_prob = pd.DataFrame(all_proba)
display(y_pred_prob.shape)

all_labels = ebm.predict(X)
y_pred_labels = pd.DataFrame(all_labels)
label_counts = y_pred_labels.value_counts()
display(label_counts)

# Rows are true classes, columns are predicted classes.
cm_all = confusion_matrix(y, y_pred_labels)
display(cm_all)

(200000, 2)

0    190541
1      9459
dtype: int64

array([[178056,   1846],
       [ 12485,   7613]])

In [17]:
# Persist the fitted EBM next to the notebook using the newest pickle protocol.
model_pickle = '01_5_model_ebm_raw_hj.pickle'
with open(model_pickle, mode='wb') as model_file:
    pickle.dump(ebm, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
# Preview the training table without its first column (the row identifier).
df_train.iloc[:,1:].head()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [19]:
# Assemble one table: true label, predicted label, class-1 probability, then
# the features. The pieces are joined column-wise, dropped to a plain array
# (so every column becomes numeric of one dtype) and re-labelled via `cols`.
cols = ['target','pred_label','pred_prob'] + X.columns.tolist()
result_parts = [df_train.target, y_pred_labels, y_pred_prob[1], X]
result_values = pd.concat(result_parts, axis=1, ignore_index=True).to_numpy()
df_result = pd.DataFrame(result_values, columns=cols)

display(df_result.shape)
display(df_result.head(3))

(200000, 203)

Unnamed: 0,target,pred_label,pred_prob,var_0,var_1,var_2,var_3,var_4,var_5,var_6,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,0.0,0.0,0.014744,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,0.0,0.0,0.255257,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,0.0,0.0,0.006072,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965


In [20]:
# Tag every row with its confusion-matrix cell by comparing the predicted
# label with the true label.
conditionlist = [
    (df_result['pred_label']==1) & (df_result['target']==0),
    (df_result['pred_label']==0) & (df_result['target']==1),
    (df_result['pred_label']==1) & (df_result['target']==1),
    (df_result['pred_label']==0) & (df_result['target']==0)]
choicelist = ['FP', 'FN', 'TP', 'TN']
# np.select falls back to an integer 0 when no condition matches. Mixing that
# integer default with string choices raises a TypeError on recent NumPy
# releases (no common dtype); older releases silently coerced it to '0'.
# Passing the string explicitly keeps the old result and works on both.
df_result['cf'] = np.select(conditionlist, choicelist, default='0')


In [21]:
# Draw a reproducible sample of 25 rows from each confusion-matrix group.
rows_by_cf = df_result.groupby("cf")
df_st = rows_by_cf.sample(n=25, random_state=42)

In [22]:
# Preview the per-group sample; the index keeps the original row positions from df_result.
df_st.head()

Unnamed: 0,target,pred_label,pred_prob,var_0,var_1,var_2,var_3,var_4,var_5,var_6,...,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,cf
46516,1.0,0.0,0.213779,11.6045,0.455,8.6219,9.7706,14.3848,-15.7498,5.2326,...,7.5604,1.4849,6.6708,19.8896,1.2857,1.3911,11.224,18.3293,1.7871,FN
111929,1.0,0.0,0.099202,14.3272,-4.0296,15.0844,5.8801,9.929,-10.2422,5.4827,...,5.2738,0.9733,9.3648,22.792,-1.114,2.7243,8.3986,17.493,7.4513,FN
46284,1.0,0.0,0.060618,14.7952,-6.522,7.9667,4.7216,11.8385,-9.4202,5.8161,...,3.5441,2.7836,0.6533,12.0741,-2.2748,7.0844,8.3978,19.4978,-11.1396,FN
65229,1.0,0.0,0.017597,6.1361,-0.0756,7.6543,9.235,11.8113,-2.95,7.0188,...,8.849,1.346,9.0151,16.3695,0.8877,-8.5804,8.0888,15.364,-14.3228,FN
45492,1.0,0.0,0.104694,9.9396,-0.1923,12.2804,9.4472,10.541,-0.0477,3.7386,...,10.8826,3.8029,8.9503,14.8593,-0.1883,-4.3883,8.8802,16.7774,10.1113,FN


In [23]:
# Persist both result tables: the full table and the per-group sample.
result_pickle = '01_5_result_ebm_raw_hj.pickle'
st_pickle = '01_5_result_ebm_raw_streamlit_hj.pickle'
for out_path, frame in ((result_pickle, df_result), (st_pickle, df_st)):
    with open(out_path, 'wb') as out_file:
        pickle.dump(frame, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
# Display the whole per-group sample table.
df_st

Unnamed: 0,target,pred_label,pred_prob,var_0,var_1,var_2,var_3,var_4,var_5,var_6,...,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,cf
46516,1.0,0.0,0.213779,11.6045,0.4550,8.6219,9.7706,14.3848,-15.7498,5.2326,...,7.5604,1.4849,6.6708,19.8896,1.2857,1.3911,11.2240,18.3293,1.7871,FN
111929,1.0,0.0,0.099202,14.3272,-4.0296,15.0844,5.8801,9.9290,-10.2422,5.4827,...,5.2738,0.9733,9.3648,22.7920,-1.1140,2.7243,8.3986,17.4930,7.4513,FN
46284,1.0,0.0,0.060618,14.7952,-6.5220,7.9667,4.7216,11.8385,-9.4202,5.8161,...,3.5441,2.7836,0.6533,12.0741,-2.2748,7.0844,8.3978,19.4978,-11.1396,FN
65229,1.0,0.0,0.017597,6.1361,-0.0756,7.6543,9.2350,11.8113,-2.9500,7.0188,...,8.8490,1.3460,9.0151,16.3695,0.8877,-8.5804,8.0888,15.3640,-14.3228,FN
45492,1.0,0.0,0.104694,9.9396,-0.1923,12.2804,9.4472,10.5410,-0.0477,3.7386,...,10.8826,3.8029,8.9503,14.8593,-0.1883,-4.3883,8.8802,16.7774,10.1113,FN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77302,1.0,1.0,0.967293,13.0081,1.5112,11.5367,9.0771,12.7793,-0.2997,5.9284,...,4.2910,0.6074,3.0147,22.0884,-2.9752,1.2473,9.2082,8.3517,12.8794,TP
156669,1.0,1.0,0.788143,10.4643,-5.0815,12.0416,7.7107,13.4536,0.5676,6.5808,...,9.6826,2.0727,-4.0449,13.1311,1.1691,9.1670,9.4978,10.0347,-4.3915,TP
164376,1.0,1.0,0.960573,16.1955,-1.0311,7.7649,7.2124,12.7211,-13.1849,7.4634,...,12.6262,1.1209,-1.0359,15.2306,-0.0194,0.6302,8.3338,16.6414,2.3697,TP
32750,1.0,1.0,0.621511,7.0743,8.5664,12.8659,5.7046,11.0383,-9.8413,5.3319,...,5.6017,-0.3301,0.3854,13.6557,0.8761,4.9894,9.2158,9.4817,-0.3554,TP


In [25]:
# Grab the raw data behind the five-row local explanation. `_internal_obj`
# is a private attribute of the explanation object, so its layout may
# change between interpret versions.
l = ebm_local._internal_obj

In [51]:
# Top-level keys of the explanation's internal dict.
display(l.keys())

dict_keys(['overall', 'specific', 'mli'])

In [79]:
# Keys stored for one explained row (entry 1 of 'specific').
l['specific'][1].keys()

dict_keys(['type', 'names', 'scores', 'values', 'extra', 'meta', 'perf'])

In [78]:
# Explanation type tag of entry 1.
l['specific'][1]['type']

'univariate'

In [80]:
# Number of term names recorded for entry 0.
len(l['specific'][0]['names'])

210

In [72]:
# First five term names of entry 0 (single-feature terms).
l['specific'][0]['names'][0:5]

['var_0', 'var_1', 'var_2', 'var_3', 'var_4']

In [83]:
# Term names at positions 200-209 of entry 0; these are the 'a x b' pairwise terms.
l['specific'][0]['names'][200:210]

['var_109 x var_171',
 'var_81 x var_164',
 'var_53 x var_76',
 'var_66 x var_139',
 'var_22 x var_107',
 'var_109 x var_173',
 'var_26 x var_53',
 'var_0 x var_12',
 'var_34 x var_118',
 'var_81 x var_184']

In [84]:
# Number of per-term scores for entry 0 (expected to match the number of names).
len(l['specific'][0]['scores'])

210

In [73]:
# First five per-term scores of entry 1.
l['specific'][1]['scores'][0:5]

[0.28687437718936126,
 0.017321100919479207,
 0.4160339739057263,
 0.0023474073997953003,
 -0.020296432707943356]

In [85]:
# Scores of entry 1 at positions 200-209 (the pairwise-term positions).
l['specific'][1]['scores'][200:210]

[0.0033340899760628895,
 0.007324794351638202,
 -0.009083837097018812,
 -0.008460798659350252,
 -0.01875443204025186,
 0.006844843034695767,
 -0.007571852102485991,
 0.0018634010599832761,
 0.032679809443992236,
 -0.0019439606778471862]

In [86]:
# Number of per-term input values stored for entry 1.
len(l['specific'][1]['values'])

210

In [74]:
# First five per-term input values of entry 1.
l['specific'][1]['values'][0:5]

[15.19, -1.857, 15.99, 7.777, 10.02]

In [87]:
# Values of entry 1 at positions 200-209; pairwise terms carry an empty string instead of a number.
l['specific'][1]['values'][200:210]

['', '', '', '', '', '', '', '', '', '']

In [75]:
# Extra contribution stored alongside the terms of entry 1 (the model intercept).
l['specific'][1]['extra']

{'names': ['Intercept'], 'scores': [-3.1939642307216656], 'values': [1]}

In [76]:
# Metadata of entry 1 (the class label names).
l['specific'][1]['meta']

{'label_names': [0, 1]}

In [77]:
# Per-row outcome of entry 1: actual vs. predicted class and their scores.
l['specific'][1]['perf']

{'is_classification': True,
 'actual': 1,
 'predicted': 0,
 'actual_score': 0.4863536431111178,
 'predicted_score': 0.5136463568888823}

In [99]:
# Row range of the full feature table X. Note that `l` only describes the
# five rows passed to explain_local, not all of X.
range(len(X))

range(0, 200000)

In [None]:
# Flatten the per-row local explanations into one long frame. Each explained
# row contributes one block of rows (one per model term); column 0 holds the
# term scores and column 1 the matching term values.
#
# Iterate over the entries that actually exist in l['specific']. The earlier
# version looped over range(len(X)) with a +1 offset, which skipped entry 0
# and ran past the end of the list, because `l` only describes the rows that
# were passed to explain_local.
# NOTE(review): to cover the whole validation split, bind `l` to
# ebm_local_tot._internal_obj before running this cell - confirm intent.
local_frames = []
for entry in l['specific']:
    scores = pd.Series(entry['scores'])
    values = pd.Series(entry['values'])
    local_frames.append(pd.DataFrame([scores, values]).T)

# Concatenate once at the end; growing a DataFrame inside the loop copies all
# accumulated rows on every iteration.
df_ebm_local = pd.concat(local_frames, axis=0) if local_frames else pd.DataFrame()

In [None]:
# Re-display the assembled result table. The previous first statement rebuilt
# the full concatenated array and threw it away (it was neither assigned nor
# the cell's last expression), so it has been removed.
display(df_result.shape)
display(df_result.head(3))