In [1]:
!pip install shap

Collecting shap
[?25l  Downloading https://files.pythonhosted.org/packages/a8/77/b504e43e21a2ba543a1ac4696718beb500cfa708af2fb57cb54ce299045c/shap-0.35.0.tar.gz (273kB)
[K     |█▏                              | 10kB 17.5MB/s eta 0:00:01[K     |██▍                             | 20kB 3.3MB/s eta 0:00:01[K     |███▋                            | 30kB 4.7MB/s eta 0:00:01[K     |████▉                           | 40kB 3.1MB/s eta 0:00:01[K     |██████                          | 51kB 3.8MB/s eta 0:00:01[K     |███████▏                        | 61kB 4.5MB/s eta 0:00:01[K     |████████▍                       | 71kB 5.1MB/s eta 0:00:01[K     |█████████▋                      | 81kB 5.7MB/s eta 0:00:01[K     |██████████▉                     | 92kB 6.4MB/s eta 0:00:01[K     |████████████                    | 102kB 4.9MB/s eta 0:00:01[K     |█████████████▏                  | 112kB 4.9MB/s eta 0:00:01[K     |██████████████▍                 | 122kB 4.9MB/s eta 0:00:01[K     |

In [0]:
import numpy as np                   # array, vector, matrix calculations
import pandas as pd                  # DataFrame handling
import shap                          # for consistent, signed variable importance measurements
import lightgbm as lgb                # gradient boosting machines (GBMs)

import matplotlib.pyplot as plt      # plotting
pd.options.display.max_columns = 999 # enable display of all columns in notebook

# enables display of plots in notebook
%matplotlib inline

np.random.seed(12345)                # set random seed for reproducibility

In [0]:
# import XLS file
path = 'https://github.com/firmai/random-assets-two/blob/master/xaib/default_of_credit_card_clients.xls?raw=true'
data = pd.read_excel(path,
                     skiprows=1) # skip the first row of the spreadsheet

# remove spaces from target column name 
data = data.rename(columns={'default payment next month': 'DEFAULT_NEXT_MONTH'}) 

In [4]:
# assign target and inputs for GBM
y = 'DEFAULT_NEXT_MONTH'
X = [name for name in data.columns if name not in [y, 'ID']]
print('y =', y)
print('X =', X)

y = DEFAULT_NEXT_MONTH
X = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']


In [0]:
best_params = {'bagging_fraction': 0.2,
 'bagging_seed': 1,
 'objective': 'binary',

 'verbose': -1} 

In [0]:
# execute split, set random state for reproducibility
train = data.sample(frac=0.7,random_state=1) # 70%/30% train/test split
test = data.drop(train.index).set_index("ID",drop=True)
train = train.set_index("ID",drop=True)

In [0]:
train_first = train.sample(frac=0.5,random_state=1)
train_second = train.drop(train_first.index)

In [9]:

from sklearn.metrics import roc_auc_score

# execute split, set random state for reproducibility

d_train_first = lgb.Dataset(train_first.drop([y],axis=1), label=train_first[y])
#d_train_second = lgb.Dataset(train_second.drop([y],axis=1), label=train_second[y])
model = lgb.train(best_params, d_train_first, verbose_eval=1000)
y_pred_second = model.predict(train_second.drop([y],axis=1))
print('ROC AUC {}'.format(roc_auc_score(train_second[y], y_pred_second)))

y_pred_test = model.predict(test.drop([y],axis=1))
print('ROC AUC {}'.format(roc_auc_score(test[y], y_pred_test)))


ROC AUC 0.7713549212252115
ROC AUC 0.7810723558640761


In [0]:
# Crete new dataframe to experience all tranformations
data_n = data.set_index("ID",drop=True).copy()

In [11]:
# Identify interactions
import math

input_row = len(train_first) 
sampled = min(1, (1/math.log(input_row, 3))*np.exp((2/math.log(input_row, 10000000)))/20)

# sampled = sampled *2

explainer = shap.TreeExplainer(model)

single = explainer.shap_values(train_first.drop([y],axis=1)[:int(input_row*sampled)].values)

inter = shap.TreeExplainer(model).shap_interaction_values(train_first.drop([y],axis=1)[:int(input_row*sampled*sampled)].values)
# Same as model

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


In [12]:
is_true = False
inter_portion = 0.5

shap_fram = pd.DataFrame(single[0][:,:], columns=X)
shap_fram = shap_fram.mean().sort_values().to_frame()
shap_new = shap_fram[0].abs().sort_values(ascending=is_true)
main_ft = shap_new[shap_new.cumsum().sub((shap_new.sum()*inter_portion)).le(0)]


# for is_true in [False, True]:
#   for inter_portion in list(np.linspace(0.1,1,10,endpoint=False)):

print(str(is_true) + " " + str(inter_portion))
# create new contributive features
def main_calc(new_df,main_ft):
  df_square = new_df[list(main_ft.index)]
  df_square = df_square - df_square.min()
  sqr_name = [str(fa)+"_POWER_2" for fa in df_square.columns]
  log_p_name = [str(fa)+"_LOG_p_one_abs" for fa in df_square.columns]
  rec_p_name = [str(fa)+"_RECIP_p_one" for fa in df_square.columns]
  sqrt_name = [str(fa)+"_SQRT_p_one" for fa in df_square.columns]

  df_sqr = pd.DataFrame(np.power(df_square.values, 2),columns=sqr_name, index=new_df.index)
  df_log = pd.DataFrame(np.log(df_square.add(1).abs().values),columns=log_p_name, index=new_df.index)
  df_rec = pd.DataFrame(np.reciprocal(df_square.add(1).values),columns=rec_p_name, index=new_df.index)
  df_sqrt = pd.DataFrame(np.sqrt(df_square.abs().add(1).values),columns=sqrt_name, index=new_df.index)

  dfs = [df_sqr, df_log, df_rec, df_sqrt]

  df_connect=  pd.concat(dfs, axis=1)

  return df_connect

main_c = main_calc(data_n,main_ft)

## Finding the top interactions

df_start = pd.DataFrame(np.abs(np.mean(inter ,axis=0)),columns=X, index=X)

#the matrix is symmetric so we need to extract upper triangle matrix without diagonal (k = 1)
sol = (df_start.where(np.triu(np.ones(df_start.shape), k=1).astype(np.bool))
                .stack().sort_values(ascending=is_true))
                
dab = sol[sol.cumsum().sub((sol.sum()*inter_portion)).le(0)]

list_one = [da[0] for da in dab.index]
list_two = [da[1] for da in dab.index]


## Creating new formulaic interactions

def inter_cal(list_one, list_two,new_df):

  new_df = new_df - new_df.min() ## Don't want dividion zero
  mult = [str(ra)+"_X_"+str(ba) for ra, ba in zip(list_one, list_two)]
  div = [str(ra)+"_DIV_"+str(ba) for ra, ba in zip(list_one, list_two)]
  print("len one " + str(len(list_one)) )
  print("len two " + str(len(list_two)) )
  inter_mult = pd.DataFrame(new_df[list_one].values*new_df[list_two].values, columns=mult, index=new_df.index)
  div_p_one = pd.DataFrame(new_df[list_one].values/new_df[list_two].add(1).values, columns=div, index=new_df.index)

  df_one = pd.concat((inter_mult,div_p_one), axis=1)

  return df_one


inter_md = inter_cal(list_one, list_two,data_n)

## Creating sum interactions

def inter_sum(df, sol):
  ungroup = sol.to_frame().reset_index()
  group = ungroup.groupby("level_0").mean().sort_values(by=0,ascending=is_true)
  gr = group[0]
  gr = gr[gr.cumsum().sub((gr.sum()*inter_portion)).le(0)]
  list_cols = []
  for gra in gr.index:
    summer = ungroup[(ungroup["level_0"]==gra)|(ungroup["level_1"]==gra)].sort_values(0,ascending=is_true).set_index(['level_0', 'level_1'])
    suum = summer[0]
    suum = suum[suum.cumsum().sub((suum.sum()*inter_portion)).le(0)]
    list_one = [da[0] for da in suum.index]
    list_two = [da[1] for da in suum.index]
    list_one.extend(list_two)
    lit = list(set(list_one))
    name = "_ADD_".join(lit)
    df[name] = df[lit].sum(axis=1)
    list_cols.append(name)
  return df[list_cols]

inter_s = inter_sum(data_n, sol)


pdList = [data_n, main_c, inter_md, inter_s ] #inter_md, main_c, inter_s
new_df = pd.concat(pdList, axis=1)

## First One
# execute split, set random state for reproducibility
train = new_df.sample(frac=0.7,random_state=1) # 70%/30% train/test split
test = new_df.drop(train.index)

train_first = train.sample(frac=0.5,random_state=1)
train_second = train.drop(train_first.index)

d_train_first = lgb.Dataset(train_first.drop([y],axis=1), label=train_first[y])
#d_train_second = lgb.Dataset(train_second.drop([y],axis=1), label=train_second[y])

model = lgb.train(best_params, d_train_first, verbose_eval=1000)
y_pred_second = model.predict(train_second.drop([y],axis=1))
print('ROC AUC {}'.format(roc_auc_score(train_second[y], y_pred_second)))

y_pred_test = model.predict(test.drop([y],axis=1))
print('ROC AUC {}'.format(roc_auc_score(test[y], y_pred_test)))



False 0.5
len one 29
len two 29
ROC AUC 0.7726681725199329
ROC AUC 0.7778544047490384


In [14]:
explainer = shap.TreeExplainer(model)
single = explainer.shap_values(train_first.drop([y],axis=1)[:int(input_row*sampled)].values)
shap_fram = pd.DataFrame(single[0][:,:], columns=train_first.drop([y],axis=1).columns)
shap_fram = shap_fram.mean().sort_values().to_frame()
shap_new = shap_fram[0].abs().sort_values(ascending=is_true)


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


Unnamed: 0_level_0,PAY_0,LIMIT_BAL_X_PAY_AMT1,PAY_AMT3_X_PAY_AMT4,BILL_AMT2,PAY_3_X_BILL_AMT1,BILL_AMT1_ADD_BILL_AMT4_ADD_BILL_AMT3,BILL_AMT1_ADD_BILL_AMT4_ADD_BILL_AMT3,BILL_AMT1_DIV_PAY_AMT1,BILL_AMT4_DIV_PAY_AMT5,BILL_AMT1_DIV_PAY_AMT2,BILL_AMT1_X_BILL_AMT4,BILL_AMT3_DIV_BILL_AMT4,BILL_AMT1,PAY_6,LIMIT_BAL_DIV_BILL_AMT5,PAY_5
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,2,0,0,3102,169493,4602,4602,169493.000000,170000.000000,245.642029,28813810000,0.929130,3913,-2,0.122948,-2
2,-1,0,1000000,1725,336524,8636,8636,168262.000000,173272.000000,168.093906,29155093264,0.923087,2682,2,1.297323,0
3,0,121440000,1000000,14027,389638,57129,57129,128.254773,184.146853,129.792805,35911181089,0.926714,29239,0,0.830884,0
4,0,80000000,1320000,48233,425140,124595,124595,106.231884,185.340187,105.232673,42155606980,1.041550,46990,0,0.362667,0
5,-1,80000000,90000000,5670,174197,65392,65392,87.054973,276.724638,4.748841,33261175180,1.011302,8617,0,0.398085,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29996,0,1785000000,15244141,192815,709056,485317,485317,41.704270,51.590482,17.725514,91469642112,1.417139,188948,0,1.865473,0
29997,-1,257180000,1160742,1828,167263,14164,14164,91.002720,178979.000000,47.423589,29936564477,0.898234,1683,0,1.618029,0
29998,4,0,92400000,3356,676580,27201,27201,169145.000000,95.391304,169145.000000,32286059310,0.838343,3565,0,0.196238,0
29999,1,6013000000,2268828,78379,327870,127433,127433,1.908418,4.206061,48.074780,36520455690,1.048448,-1645,-1,0.751154,0


In [22]:
main_ft.head(20)

PAY_0                                                 0.031307
LIMIT_BAL_X_PAY_AMT1                                  0.018804
PAY_AMT3_X_PAY_AMT4                                   0.017929
BILL_AMT2                                             0.017352
PAY_3_X_BILL_AMT1                                     0.015120
BILL_AMT1_ADD_BILL_AMT4_ADD_BILL_AMT3                 0.014086
BILL_AMT1_DIV_PAY_AMT1                                0.013410
BILL_AMT4_DIV_PAY_AMT5                                0.012612
BILL_AMT1_DIV_PAY_AMT2                                0.012019
BILL_AMT1_X_BILL_AMT4                                 0.011474
BILL_AMT3_DIV_BILL_AMT4                               0.010926
BILL_AMT1                                             0.010903
PAY_6                                                 0.010115
LIMIT_BAL_DIV_BILL_AMT5                               0.009578
PAY_5                                                 0.009193
PAY_0_DIV_PAY_2                                       0

In [26]:
shap_new[shap_new.cumsum().sub((shap_new.sum()*0.8)).le(0)]

PAY_0                                                 0.031307
LIMIT_BAL_X_PAY_AMT1                                  0.018804
PAY_AMT3_X_PAY_AMT4                                   0.017929
BILL_AMT2                                             0.017352
PAY_3_X_BILL_AMT1                                     0.015120
BILL_AMT1_ADD_BILL_AMT4_ADD_BILL_AMT3                 0.014086
BILL_AMT1_DIV_PAY_AMT1                                0.013410
BILL_AMT4_DIV_PAY_AMT5                                0.012612
BILL_AMT1_DIV_PAY_AMT2                                0.012019
BILL_AMT1_X_BILL_AMT4                                 0.011474
BILL_AMT3_DIV_BILL_AMT4                               0.010926
BILL_AMT1                                             0.010903
PAY_6                                                 0.010115
LIMIT_BAL_DIV_BILL_AMT5                               0.009578
PAY_5                                                 0.009193
PAY_0_DIV_PAY_2                                       0

In [67]:
# input_row = len(train_second) 
# sampled = min(1, (1/math.log(input_row, 3))*np.exp((2/math.log(input_row, 10000000)))/20)


for bar in [.99, .95, .90, .80,.7,.6]:

  main_ft = shap_new[shap_new.cumsum().sub((shap_new.sum()*bar)).le(0)]
  print(len(list(main_ft.index)))
  new_df_2 = new_df[list(main_ft.index)].copy()
  new_df_2[y] = new_df[y]

  train = new_df_2.sample(frac=0.7,random_state=1) # 70%/30% train/test split
  test = new_df_2.drop(train.index)

  train_first = train.sample(frac=0.5,random_state=1)
  train_second = train.drop(train_first.index)

  #d_train_first = lgb.Dataset(train_first.drop([y],axis=1), label=train_first[y])
  #model = lgb.train(best_params, d_train_first, verbose_eval=1000)

  d_train_first = lgb.Dataset(train_first.drop([y],axis=1), label=train_first[y])
  model = lgb.train(best_params, d_train_first, verbose_eval=1000)

  y_pred_second = model.predict(train_second.drop([y],axis=1))
  print('ROC AUC {}'.format(roc_auc_score(train_second[y], y_pred_second)))

  y_pred_test = model.predict(test.drop([y],axis=1))
  print('ROC AUC {}'.format(roc_auc_score(test[y], y_pred_test)))




77


KeyError: ignored

In [37]:
neigh.predict_proba(test[X])[:, 0]

array([0.33333333, 0.33333333, 1.        , ..., 1.        , 0.66666667,
       0.66666667])

In [34]:
neigh.predict_proba(test[X])

array([[0.33333333, 0.66666667],
       [0.33333333, 0.66666667],
       [1.        , 0.        ],
       ...,
       [1.        , 0.        ],
       [0.66666667, 0.33333333],
       [0.66666667, 0.33333333]])

In [96]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

train = data.sample(frac=0.7,random_state=1) # 70%/30% train/test split
test = data.drop(train.index).set_index("ID",drop=True)
train = train.set_index("ID",drop=True)

neigh = RandomForestClassifier()
neigh.fit(train.drop([y],axis=1), train[y])

print('ROC AUC {}'.format(roc_auc_score(test[y], neigh.predict_proba(test.drop([y],axis=1))[:, 1])))


ROC AUC 0.7686945780583906


In [0]:
data_n = data_n.T.drop_duplicates().T

In [0]:
pdList = [data_n, main_c, inter_md, inter_s ] #inter_md, main_c, inter_s
new_df = pd.concat(pdList, axis=1)


In [0]:
new_df = new_df.T.drop_duplicates().T

In [95]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

train = new_df.sample(frac=0.7,random_state=1) # 70%/30% train/test split
test = new_df.drop(train.index)

neigh = RandomForestClassifier()
neigh.fit(train.drop([y],axis=1), train[y])

print('ROC AUC {}'.format(roc_auc_score(test[y], neigh.predict_proba(test.drop([y],axis=1))[:, 1])))



ROC AUC 0.7630901550072805


In [75]:
train[y]

Unnamed: 0_level_0,DEFAULT_NEXT_MONTH,DEFAULT_NEXT_MONTH
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
10748,0,0
12574,1,1
29677,0,0
8857,1,1
21099,0,0
...,...,...
27956,0,0
27108,0,0
26,0,0
14778,0,0


In [74]:
from sklearn.neighbors import KNeighborsClassifier

train = new_df.sample(frac=0.7,random_state=1) # 70%/30% train/test split
test = new_df.drop(train.index)

neigh = LogisticRegression()
neigh.fit(train.drop([y],axis=1), train[y])

print('ROC AUC {}'.format(roc_auc_score(test[y], neigh.predict_proba(test.drop([y],axis=1))[:, 1])))



ValueError: ignored

In [0]:
gr.index

Index(['BILL_AMT5', 'BILL_AMT4', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT1'], dtype='object', name='level_0')

In [0]:
target = y

In [0]:
inter_mult.head()

Unnamed: 0_level_0,BILL_AMT5_X_BILL_AMT6,BILL_AMT4_X_BILL_AMT5,BILL_AMT3_X_BILL_AMT4,BILL_AMT4_X_PAY_AMT5,BILL_AMT2_X_BILL_AMT3,BILL_AMT1_X_BILL_AMT6,BILL_AMT5_X_PAY_AMT6,BILL_AMT1_X_BILL_AMT2,BILL_AMT5_X_PAY_AMT1,BILL_AMT2_X_PAY_AMT1,BILL_AMT3_X_BILL_AMT5,BILL_AMT2_X_PAY_AMT3,PAY_AMT1_X_PAY_AMT3,PAY_4_X_BILL_AMT1,BILL_AMT5_X_PAY_AMT4,BILL_AMT3_X_BILL_AMT6,BILL_AMT3_X_PAY_AMT3,BILL_AMT2_X_BILL_AMT5,BILL_AMT4_X_PAY_AMT3,PAY_AMT2_X_PAY_AMT5,PAY_AMT3_X_PAY_AMT4,PAY_AMT3_X_PAY_AMT6,LIMIT_BAL_X_BILL_AMT1,PAY_AMT2_X_PAY_AMT3,PAY_0_X_PAY_3,BILL_AMT2_X_BILL_AMT4,BILL_AMT5_DIV_BILL_AMT6,BILL_AMT4_DIV_BILL_AMT5,BILL_AMT3_DIV_BILL_AMT4,BILL_AMT4_DIV_PAY_AMT5,BILL_AMT2_DIV_BILL_AMT3,BILL_AMT1_DIV_BILL_AMT6,BILL_AMT5_DIV_PAY_AMT6,BILL_AMT1_DIV_BILL_AMT2,BILL_AMT5_DIV_PAY_AMT1,BILL_AMT2_DIV_PAY_AMT1,BILL_AMT3_DIV_BILL_AMT5,BILL_AMT2_DIV_PAY_AMT3,PAY_AMT1_DIV_PAY_AMT3,PAY_4_DIV_BILL_AMT1,BILL_AMT5_DIV_PAY_AMT4,BILL_AMT3_DIV_BILL_AMT6,BILL_AMT3_DIV_PAY_AMT3,BILL_AMT2_DIV_BILL_AMT5,BILL_AMT4_DIV_PAY_AMT3,PAY_AMT2_DIV_PAY_AMT5,PAY_AMT3_DIV_PAY_AMT4,PAY_AMT3_DIV_PAY_AMT6,LIMIT_BAL_DIV_BILL_AMT1,PAY_AMT2_DIV_PAY_AMT3,PAY_0_DIV_PAY_3,BILL_AMT2_DIV_BILL_AMT4
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1
10748,5263112000.0,12590580000.0,20169730000.0,750847500.0,31845000000.0,11993250000.0,714642600.0,45298210000.0,1380849000.0,2832307000.0,15525550000.0,1859663000.0,129179500.0,-49790.953141,777661000.0,8431350000.0,1452428000.0,19878640000.0,1177859000.0,69533050.0,72750820.0,66855400.0,107050700000.0,109076900.0,0.001831,25824970000.0,1.841392,1.299128,1.233107,21.78093,1.28038,4.196025,13.55964,1.110964,7.018103,14.395,1.601964,21.923083,1.522965,3e-06,12.460963,2.949844,17.122319,2.051124,13.885509,2.017201,1.165848,1.26864,2.127194,1.285981,1.18868,1.578846
12574,1681763000.0,1890017000.0,2106095000.0,228444000.0,2290539000.0,2075490000.0,219861400.0,2536771000.0,239603800.0,281819800.0,1947422000.0,395213200.0,46068040.0,-63042.59076,204817400.0,1874032000.0,374426000.0,2055538000.0,363388900.0,29527660.0,39379730.0,42272210.0,9141954000.0,46970030.0,2.191545,2223020000.0,1.039161,1.081477,1.030372,8.94597,1.055516,1.282439,7.947429,1.049247,7.292706,8.57758,1.114323,6.116832,0.713119,-4e-06,8.531057,1.157961,5.795108,1.176186,5.624288,1.156489,1.64041,1.528186,3.434579,0.727079,0.009337,1.087574
29677,3495235000.0,3755832000.0,8375812000.0,4372572000.0,4635287000.0,4330021000.0,278879100.0,2574954000.0,240532400.0,283871600.0,3927608000.0,366987000.0,42468640.0,-11539.235903,260167600.0,7794661000.0,693462500.0,2078528000.0,663133500.0,2545531000.0,45935470.0,49239190.0,11292070000.0,386049000.0,1.18064,4432560000.0,0.50389,2.132521,1.045735,1.83174,0.529215,0.624234,6.314461,1.049699,7.320966,8.640031,2.230053,6.683497,0.77355,1.5e-05,6.76853,1.123702,12.629088,1.180176,12.076753,1.066372,1.195191,1.115012,4.177655,7.030644,0.064946,0.553418
8857,6431053000.0,6782168000.0,7367865000.0,545292100.0,7779446000.0,7048416000.0,552051400.0,7848470000.0,759073200.0,839180400.0,7036828000.0,605897700.0,64225490.0,245047.946525,394316300.0,6986428000.0,595388900.0,7161031000.0,573842000.0,48167470.0,33363260.0,46709290.0,21805030000.0,50689380.0,5.633307,7497910000.0,1.007214,1.047043,1.037548,13.020932,1.01765,1.103902,11.731847,0.991375,8.532564,9.43302,1.086357,13.064387,1.384963,4.3e-05,16.423883,1.094194,12.837798,1.105532,12.373207,1.150323,1.389818,0.99277,2.802404,1.093101,0.780016,1.055861
21099,3180829000.0,3704580000.0,4502759000.0,453262700.0,4556503000.0,4108747000.0,407655600.0,4842404000.0,321862600.0,382614300.0,3833020000.0,481289500.0,41322340.0,-16100.135379,275133500.0,3866162000.0,492102300.0,3748798000.0,475612600.0,67635050.0,35323020.0,52336870.0,31713780000.0,70970070.0,-0.166074,4403821000.0,0.991428,1.174726,1.03467,9.599974,0.978028,1.280644,7.734922,1.08662,9.796317,11.645341,1.215453,9.258114,0.795006,1.1e-05,11.459798,1.205034,9.466108,1.188747,9.148916,1.432615,1.471444,0.993168,6.027034,1.365303,2.390466,1.011936


In [0]:
inter_mult = inter_cal(list_one, list_two, train.drop(columns=[target]))

len one 26
len two 26


In [0]:
inter_mult

Unnamed: 0_level_0,BILL_AMT5_X_BILL_AMT6,BILL_AMT4_X_BILL_AMT5,BILL_AMT3_X_BILL_AMT4,BILL_AMT4_X_PAY_AMT5,BILL_AMT2_X_BILL_AMT3,BILL_AMT1_X_BILL_AMT6,BILL_AMT5_X_PAY_AMT6,BILL_AMT1_X_BILL_AMT2,BILL_AMT5_X_PAY_AMT1,BILL_AMT2_X_PAY_AMT1,BILL_AMT3_X_BILL_AMT5,BILL_AMT2_X_PAY_AMT3,PAY_AMT1_X_PAY_AMT3,PAY_4_X_BILL_AMT1,BILL_AMT5_X_PAY_AMT4,BILL_AMT3_X_BILL_AMT6,BILL_AMT3_X_PAY_AMT3,BILL_AMT2_X_BILL_AMT5,BILL_AMT4_X_PAY_AMT3,PAY_AMT2_X_PAY_AMT5,PAY_AMT3_X_PAY_AMT4,PAY_AMT3_X_PAY_AMT6,LIMIT_BAL_X_BILL_AMT1,PAY_AMT2_X_PAY_AMT3,PAY_0_X_PAY_3,BILL_AMT2_X_BILL_AMT4,BILL_AMT5_DIV_BILL_AMT6,BILL_AMT4_DIV_BILL_AMT5,BILL_AMT3_DIV_BILL_AMT4,BILL_AMT4_DIV_PAY_AMT5,BILL_AMT2_DIV_BILL_AMT3,BILL_AMT1_DIV_BILL_AMT6,BILL_AMT5_DIV_PAY_AMT6,BILL_AMT1_DIV_BILL_AMT2,BILL_AMT5_DIV_PAY_AMT1,BILL_AMT2_DIV_PAY_AMT1,BILL_AMT3_DIV_BILL_AMT5,BILL_AMT2_DIV_PAY_AMT3,PAY_AMT1_DIV_PAY_AMT3,PAY_4_DIV_BILL_AMT1,BILL_AMT5_DIV_PAY_AMT4,BILL_AMT3_DIV_BILL_AMT6,BILL_AMT3_DIV_PAY_AMT3,BILL_AMT2_DIV_BILL_AMT5,BILL_AMT4_DIV_PAY_AMT3,PAY_AMT2_DIV_PAY_AMT5,PAY_AMT3_DIV_PAY_AMT4,PAY_AMT3_DIV_PAY_AMT6,LIMIT_BAL_DIV_BILL_AMT1,PAY_AMT2_DIV_PAY_AMT3,PAY_0_DIV_PAY_3,BILL_AMT2_DIV_BILL_AMT4
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1
10748,818323977,4874987567,9312669875,84373000,16820818875,2446969836,115558000,26329934484,479276805,1264133115,6377357125,609588000,33180000,0,173337000,1563241125,441500000,8805346263,337492000,6000000,12000000,8000000,53559320000,24000000,0,12858192081,4.079356,1.460263,1.308176,84.289710,1.380717,12.198037,28.875562,1.133696,6.964802,18.370058,1.910280,38.089978,2.073482,0.000006,19.253582,7.792714,27.587103,2.637556,21.088228,5.995005,1.333222,1.999500,1.794268,1.499875,1.00,1.806220
12574,1058340,1923220,-1264120,307580,267784,29760,0,-11456,0,0,-851224,-1012424,0,-32,0,-695640,-2115344,-407404,4779320,0,0,0,320000,0,2,-605020,1.223416,1.484636,-0.441750,9.240437,0.477912,0.035446,1139.000000,-0.092437,1139.000000,-357.000000,-0.655838,-0.126193,0.000353,0.000000,1139.000000,-0.802363,-0.264051,-0.313433,0.597738,0.005464,2829.000000,2829.000000,303.060606,0.000353,-0.00,-0.211118
29677,57183100,59767500,2126665575,2022302325,0,18914410,1801800,0,0,0,60134100,0,0,0,1690000,2034706659,101765400,0,101145000,2034706659,2860000,3049200,21500000,101765400,1,0,0.029576,35.338970,1.006134,1.045194,0.000022,0.009798,0.937996,431.000000,1301.000000,1.000000,35.555726,0.000454,0.000454,0.002320,1.000000,1.051605,21.016811,0.000769,20.888687,1.051605,1.691776,1.586878,116.011601,21.016811,,0.000022
8857,1616808312,1622422368,1634035548,65196800,1581904248,1488205943,63705600,1445729752,147319200,145957600,1596661416,63116800,5920000,109947,0,1628381307,64161600,1570661568,65196800,2560000,0,2560000,2931920000,2560000,6,1607427104,0.980521,1.023407,0.984122,25.452217,0.983717,0.902532,24.870081,0.929048,10.758444,10.659011,1.007158,24.640225,2.311680,0.000109,39817.000000,0.987539,25.048095,0.990758,25.452217,1.000000,1601.000000,1.000000,2.182838,1.000000,0.75,0.968097
21099,268643070,347719520,469701952,44896000,360478672,363838797,30980000,361426212,0,0,324112760,34456000,0,0,0,362884932,41848000,266861720,44896000,8000000,0,4000000,5664330000,8000000,0,386734144,0.893162,1.449164,0.932113,11.218891,0.823369,1.209640,7.741629,1.217714,15491.000000,17229.000000,1.350784,8.610195,0.000500,0.000048,15491.000000,1.206469,10.457271,1.112194,11.218891,1.999500,2001.000000,1.000000,12.869447,1.999500,2.00,0.767473
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27956,585312,1270320,1226080,1106000,583552,564928,562800,583552,604608,565504,623904,1188160,1188160,-776,0,564928,1226080,604608,2496400,560000,0,1106000,108640000,1264000,-2,1188160,1.104252,1.963975,0.491461,2.255350,0.969112,1.065844,1.148359,1.031873,1.069057,1.000000,0.965217,0.476281,0.476281,0.000000,805.000000,1.065844,0.491461,0.935404,1.000000,1.142653,1581.000000,2.255350,180.181467,0.506641,inf,0.476281
27108,1689198066,5280920979,15233998822,615535000,15595955872,4898268798,85794000,15677246512,190548474,559834144,5308332162,631672384,22263304,0,107242500,4872869988,620214952,5406394704,617012284,26000000,12530000,10024000,16170830000,26062400,0,15515421424,1.089362,2.869784,1.005191,24.616677,1.018473,3.158841,21.438281,0.986980,9.655188,28.366644,2.884680,25.141233,0.886296,0.000008,17.152339,3.142462,24.685218,2.937969,24.557750,1.039992,2.004398,2.505247,1.045091,1.037502,1.00,1.023760
26,896242134,864056643,1043478241,30762954,1506121630,1430790520,29739513,1990992200,58852617,82491130,1074530067,41851810,1974973,0,42715128,1082347058,36059023,1247150490,28995967,1514412,1433432,997997,2381000000,1427426,0,1211110270,0.992778,0.971103,1.243579,27.251176,1.160643,1.584884,29.889780,1.138959,15.111449,21.180851,1.207643,41.727545,1.970060,0.000021,20.816469,1.198922,35.952096,1.401643,28.910180,1.342427,0.699232,1.004008,1.049978,1.424152,1.00,1.443351
14778,102272769,56764269,31505769,0,110082156,171688401,34788720,332952924,30339000,58836000,56764269,0,0,0,45508500,56764269,0,198336156,0,0,0,0,1527930000,0,0,110082156,1.000000,0.555072,1.000000,5614.000000,3.493587,1.678663,2.939262,0.865650,3.370210,6.535488,0.555072,19613.000000,3001.000000,0.000059,2.247056,0.555072,5614.000000,1.939193,5614.000000,3001.000000,0.000222,0.000291,5.301037,3001.000000,1.00,3.493587


In [0]:
Second

In [0]:
 ungroup

Unnamed: 0,level_0,level_1,0
0,BILL_AMT5,BILL_AMT6,0.042100
1,BILL_AMT4,BILL_AMT5,0.040378
2,BILL_AMT3,BILL_AMT4,0.038016
3,BILL_AMT4,PAY_AMT5,0.035577
4,BILL_AMT2,BILL_AMT3,0.033015
...,...,...,...
248,LIMIT_BAL,SEX,0.000009
249,MARRIAGE,PAY_4,0.000008
250,SEX,PAY_AMT3,0.000007
251,SEX,BILL_AMT6,0.000006


In [0]:
inter.shape

(2000, 23, 23)

In [0]:
names = ["x","y","z"]
index = pd.MultiIndex.from_product([range(s)for s in inter.shape], names=names)
df = pd.DataFrame({'A': inter.flatten()}, index=index)['A']

In [0]:
df.shape

(1058000,)

In [0]:
df = df.unstack(level='x').swaplevel().sort_index()
df.columns = X
df.index.names = ['DATE', 'i']

ValueError: ignored

In [0]:
df.head()

x  y  z
0  0  0    0.722416
      1   -0.003709
      2   -0.011782
      3    0.005092
      4    0.010104
Name: A, dtype: float64

In [0]:
pan = pd.Panel(inter)
df = pan.swapaxes(0, 2).to_frame()
df.index = df.index.droplevel('minor')
df.index.name = 'Date'
df.index = df.index+1

TypeError: ignored

In [0]:
index = pd.MultiIndex.from_product([range(s)for s in A.shape], names=names)
df = pd.DataFrame({'A': A.flatten()}, index=index)['A']

In [0]:
inter

In [0]:
interact

array([[[ 7.22416088e-01, -3.70911867e-03, -1.17816884e-02, ...,
          5.45808674e-02, -1.61260361e-01,  0.00000000e+00],
        [-3.70911867e-03, -1.28327058e-03,  2.88120791e-03, ...,
          3.59514646e-03, -2.27297740e-03,  0.00000000e+00],
        [-1.17816884e-02,  2.88120791e-03, -5.94752639e-02, ...,
          5.10869690e-03, -2.38137691e-02,  0.00000000e+00],
        ...,
        [ 5.45808674e-02,  3.59514646e-03,  5.10869690e-03, ...,
          2.26046817e-01,  1.86530239e-02,  0.00000000e+00],
        [-1.61260361e-01, -2.27297740e-03, -2.38137691e-02, ...,
          1.86530239e-02,  1.02684490e-01,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00]],

       [[ 2.37102027e-01,  7.10941655e-03, -2.08144997e-02, ...,
         -7.74274108e-02,  5.69689024e-03,  0.00000000e+00],
        [ 7.10941655e-03,  3.34411610e-02, -7.23061904e-04, ...,
          6.69391521e-02,  3.91182175e

In [0]:
# the first 2,000 people in order to run quicker
# and then use 80% additive. 

shap_interaction_values = shap.TreeExplainer(model).shap_interaction_values(X.iloc[:2000,:])



In [0]:
# inter_mean = np.mean(inter,axis=0); inter_mean.shape
# inter_cross = pd.DataFrame(inter_mean, index=X, columns=X).abs()

# s = inter_cross.unstack()
# so = s.sort_values(kind="quicksort",ascending=False)

# def get_redundant_pairs(df):
#     '''Get diagonal and lower triangular pairs of correlation matrix'''
#     pairs_to_drop = set()
#     cols = df.columns
#     for i in range(0, df.shape[1]):
#         for j in range(0, i+1):
#             pairs_to_drop.add((cols[i], cols[j]))
#     return pairs_to_drop

# def get_top_abs_correlations(df):
#     au_corr = df.unstack()
#     labels_to_drop = get_redundant_pairs(df)
#     au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
#     return au_corr

# print("Top Absolute Correlations")
# get = get_top_abs_correlations(inter_cross)

# ungroup = get.to_frame().reset_index()
# ungroup = get.to_frame().reset_index()
# ungroup.groupby("level_0").mean().sort_values(by=0,ascending=False)

In [0]:
d_train = lgb.Dataset(train.drop([y],axis=1), label=train[y])
d_test = lgb.Dataset(test.drop([y],axis=1), label=test[y])
model = lgb.train(best_params, d_train, verbose_eval=1000)
y_pred_test = model.predict(test.drop([y],axis=1))
print('ROC AUC {}'.format(roc_auc_score(test[y], y_pred_test)))