In [None]:
import random
import numpy as np
import pandas as pd
from tensorflow import keras
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from tqdm.notebook import tqdm

import os,sys,inspect
# Make the repository root importable so the `explain` package can be found.
# Inside a notebook, inspect.getfile(inspect.currentframe()) yields the cell's
# pseudo-filename rather than the notebook's path; depending on the kernel it
# may point into a temporary directory, which would put the wrong folder on
# sys.path. The kernel's working directory is used instead.
# Assumes the kernel was started in the notebook's own directory (Jupyter's
# default) and that the repo root is two levels above it -- TODO confirm.
current_dir = os.getcwd()
parent_dir = os.path.dirname(os.path.dirname(current_dir))
sys.path.insert(0, parent_dir)

from explain.eval import local_analysis, local_roar
from explain.sparse_mlp import get_sparse_mlp

In [None]:
# Experiment configuration.
# `samples` is the size of the balanced subsample drawn from the dataset below.
samples = 300
# One shared seed for NumPy's global RNG and Python's `random` module.
seed = 3
for _seed_fn in (np.random.seed, random.seed):
    _seed_fn(seed)

# Import data

In [None]:
# Load the credit-card default dataset from a URL with pandas.
url = "https://raw.githubusercontent.com/meauxt/credit-card-default/master/credit_cards_dataset.csv"
names = ['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'Default']
df = pd.read_csv(url, names=names)
df = df.drop(['ID'], axis = 1)
names = names[1:]
print(df.shape)

# Convert every column to numeric. Anything non-numeric becomes NaN and the
# affected rows are dropped (presumably this is what removes the file's own
# header row, which is read as data because `names=` is passed -- TODO confirm).
for column in df:
    df[column] = pd.to_numeric(df[column],errors='coerce')
df = df.dropna()

# NOTE: all recoding below uses df.loc[mask, col] = value. The chained form
# df[col][mask] = value assigns into a temporary object; it raises
# SettingWithCopyWarning and silently does nothing under pandas Copy-on-Write.

# rescale sex (sex : male = 0 , female = 1)
df.loc[df['SEX'] == 1, 'SEX'] = 0
df.loc[df['SEX'] == 2, 'SEX'] = 1

# rescale marital status (married = 0, single = 1) after dropping 'others'
df.loc[df['MARRIAGE'] == 1, 'MARRIAGE'] = 0
df.loc[df['MARRIAGE'] == 2, 'MARRIAGE'] = 1
df = df[df['MARRIAGE'] != 3]

# remove unknown from education (codes 5 and 6); copy so later assignments
# operate on an independent frame rather than a slice of the previous one
df = df[~df['EDUCATION'].isin([5, 6])].copy()
print("Dataset:")
print(df.shape)

# Convert in log PAY_AMT
pay_atm_vars = ['PAY_AMT1','PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
for j in pay_atm_vars:
    # replace zeros by the smallest non-zero value so the log is defined
    nonzero_min = df.loc[df[j] != 0, j].min()
    df.loc[df[j] == 0, j] = nonzero_min
    # take log
    df[j] = np.log(df[j])

# balanced subsampling: samples // 2 rows per class, drawn without replacement.
# The draw order (defaults first, then non-defaults) is kept so the global
# NumPy RNG stream is consumed in the same sequence as before.
index_sample = np.random.choice(df[df['Default']==1].index.values, size=samples // 2, replace=False)
index_sample = np.hstack((np.random.choice(df[df['Default']==0].index.values, size=samples // 2, replace=False), index_sample))
np.random.shuffle(index_sample)
df = df.loc[index_sample]

arr_df = df.values
all_keys = df.keys()

# Scale all data to [0, 1]; the last column ('Default') is the outcome.
# NOTE(review): the scaler is fitted on the full subsample before the
# train/test split. Left unchanged so the features stay consistent with the
# attributions and model loaded from disk later in this notebook.
scaler = MinMaxScaler()
scaler.fit(arr_df)
arr_df = scaler.transform(arr_df)
outcome = np.asarray(arr_df[:, -1])
data = np.asarray(arr_df[:, :-1])
x_train, x_test, y_train, y_test = train_test_split(data, outcome, test_size=0.2, random_state=seed)
n_vars = x_test.shape[1]
n_out_of_sample = x_test.shape[0]
print("Training set: ", x_train.shape[0], " (", np.sum(y_train), ")") 
print("Testing set: ", n_out_of_sample, " (", np.sum(y_test), ")") 
print('n features: ', n_vars)

In [None]:
# Attributions: each .npy file holds two arrays saved back to back. The first
# is used with x_test and the second (suffix `_train`) with x_train further
# down, so the read order on each file handle must be preserved.
with open('a_zero.npy', 'rb') as f:
    a_zero, a_zero_train = np.load(f), np.load(f)
with open('a_average.npy', 'rb') as f:
    a_average, a_average_train = np.load(f), np.load(f)
with open('a_neutral_05.npy', 'rb') as f:
    a_neutral_05, a_neutral_05_train = np.load(f), np.load(f)
with open('a_maxdist.npy', 'rb') as f:
    a_maxdist, a_maxdist_train = np.load(f), np.load(f)
with open('a_pdata.npy', 'rb') as f:
    a_pdata, a_pdata_train = np.load(f), np.load(f)

# Baselines (reference points) matching the attribution files above.
with open('zero_reference.npy', 'rb') as f:
    zero_reference = np.load(f)
with open('average_reference.npy', 'rb') as f:
    average_reference = np.load(f)
with open('reference.npy', 'rb') as f:
    reference = np.load(f)
with open('maxdist_references.npy', 'rb') as f:
    maxdist_references, maxdist_references_train = np.load(f), np.load(f)

# Model configuration and trained model.
# NOTE(security): pickle.load executes arbitrary code from the file; only
# load a conf.pickle that was produced by a trusted run of this project.
with open('conf.pickle', 'rb') as f:
    conf = pickle.load(f)
model = keras.models.load_model("model")

# Build the sparse model used for the neutral baselines from the trained
# network's per-layer weights (index 0) and biases (index 1), plus the
# activation stored as the second element of every entry in `conf`.
layer_params = [layer.get_weights() for layer in model.layers]
ws = [params[0] for params in layer_params]
bs = [params[1] for params in layer_params]
ls = [activation for _, activation in conf]
model_sparse = get_sparse_mlp(ws, bs, ls, reference)

# Predictions and test-set metrics of the loaded model.
y_hat_train = model.predict(x_train)
y_hat_test = model.predict(x_test)
y_test_arr = np.array(y_test)
roc_auc = roc_auc_score(y_test_arr, y_hat_test)
ave_pre = average_precision_score(y_test_arr, y_hat_test)

print('Best model: ')
print(conf)
print('Test set results: ')
print("ROC AUC: ", roc_auc)
print("AVG PRE: ", ave_pre)

# Local Analysis via Information Content

In [None]:
# Number of features, taken from the second axis of the attribution matrix.
n_vars = a_zero.shape[1]
# Column labels '0' .. str(n_vars), i.e. n_vars + 1 labels, used for the
# result tables written by the local-analysis cells below.
cols = list(np.arange(n_vars + 1).astype(str))

## Zero

In [None]:
# Local analysis for attributions computed against the zero baseline;
# only the second returned value is kept and written to CSV.
_, abs_log_odds_a_zero = local_analysis(
    model, x_test, a_zero, zero_reference,
    asc=False,
)
df_a_zero_abs = pd.DataFrame(data=abs_log_odds_a_zero, columns=cols)
df_a_zero_abs.to_csv('./results/result_abs_zero.csv')

## Average

In [None]:
# Local analysis for attributions computed against the average baseline;
# only the second returned value is kept and written to CSV.
_, abs_log_odds_a_average = local_analysis(
    model, x_test, a_average, average_reference,
    asc=False,
)
df_a_average_abs = pd.DataFrame(data=abs_log_odds_a_average, columns=cols)
df_a_average_abs.to_csv('./results/result_abs_ave.csv')

## Neutrality 0.5

In [None]:
# Local analysis for the neutral (0.5) baselines, evaluated on the sparse
# model. `reference` stacks several baselines; the count passed per feature
# is its length integer-divided by the number of features.
n_base_x_feat = len(reference) // n_vars
_, abs_log_odds_a_neutral = local_analysis(
    model_sparse, x_test, a_neutral_05, reference,
    asc=False,
    n_base_x_feat=n_base_x_feat,
)
df_a_neutral_abs = pd.DataFrame(data=abs_log_odds_a_neutral, columns=cols)
df_a_neutral_abs.to_csv('./results/result_abs_neutral.csv')

## MaxDist

In [None]:
# Local analysis for attributions computed against the maximum-distance
# references; only the second returned value is kept and written to CSV.
_, abs_log_odds_a_maxdist = local_analysis(
    model, x_test, a_maxdist, maxdist_references,
    asc=False,
)
df_a_maxdist_abs = pd.DataFrame(data=abs_log_odds_a_maxdist, columns=cols)
df_a_maxdist_abs.to_csv('./results/result_abs_maxdist.csv')

## pdata

In [None]:
# Local analysis for the p_data attributions.
# NOTE(review): the average reference is passed as the baseline here --
# confirm this is the intended reference for the p_data attributions.
_, abs_log_odds_a_pdata = local_analysis(
    model, x_test, a_pdata, average_reference,
    asc=False,
)
df_a_pdata_abs = pd.DataFrame(data=abs_log_odds_a_pdata, columns=cols)
df_a_pdata_abs.to_csv('./results/result_abs_pdata.csv')

# Local Analysis via ROAR

In [None]:
# ROAR setup. For every attribution method this prepares:
#   * absolute attributions for the test and the train set (feature ranking
#     ignores the sign), and
#   * a performance curve of length n_vars whose entry 0 is the average
#     precision of the unperturbed model; entries 1.. are filled by the ROAR
#     loop in the next cell.
# n_train is forwarded to local_roar (presumably the number of retrainings
# per step -- TODO confirm against explain.eval).
n_train = 30
# Zero
abs_a_zero = np.abs(a_zero)
abs_a_zero_train = np.abs(a_zero_train)
delta_performance_a_zero = np.zeros(n_vars)
delta_performance_a_zero[0] = ave_pre
# Average
abs_a_average = np.abs(a_average)
abs_a_average_train = np.abs(a_average_train)
delta_performance_a_average = np.zeros(n_vars)
delta_performance_a_average[0] = ave_pre
# Neutral
abs_a_neutral = np.abs(a_neutral_05)
abs_a_neutral_train = np.abs(a_neutral_05_train)
delta_performance_a_neutral = np.zeros(n_vars)
delta_performance_a_neutral[0] = ave_pre
# Maximum Distance
abs_a_maxdist = np.abs(a_maxdist)
abs_a_maxdist_train = np.abs(a_maxdist_train)
delta_performance_a_maxdist = np.zeros(n_vars)
delta_performance_a_maxdist[0] = ave_pre
# P_data
# Fixed: the test-set attributions were previously taken from a_pdata_train,
# so the p_data ROAR curve ranked test samples with train-set attributions.
abs_a_pdata = np.abs(a_pdata)
abs_a_pdata_train = np.abs(a_pdata_train)
delta_performance_a_pdata = np.zeros(n_vars)
delta_performance_a_pdata[0] = ave_pre
# random - uniform importances with the same shapes as the test/train
# attributions, used as the control ranking
random_imp = np.random.random(abs_a_pdata.shape)
random_imp_train = np.random.random(abs_a_pdata_train.shape)
delta_performance_a_random = np.zeros(n_vars)
delta_performance_a_random[0] = ave_pre

In [None]:
# ROAR with the average reference as the replacement value.
# Each entry pairs a method's (train attributions, test attributions) with the
# performance curve it fills. The list order is the order in which local_roar
# is invoked within every step j.
roar_runs = [
    (abs_a_zero_train, abs_a_zero, delta_performance_a_zero),
    (abs_a_average_train, abs_a_average, delta_performance_a_average),
    (abs_a_neutral_train, abs_a_neutral, delta_performance_a_neutral),
    (abs_a_maxdist_train, abs_a_maxdist, delta_performance_a_maxdist),
    (abs_a_pdata_train, abs_a_pdata, delta_performance_a_pdata),
    (random_imp_train, random_imp, delta_performance_a_random),
]

for j in tqdm(range(n_vars - 1)):
    for attr_train, attr_test, delta_performance in roar_runs:
        # Entry 0 already holds the baseline score, hence the j + 1 offset.
        delta_performance[j + 1] = local_roar(
            x_train, x_test,
            y_train, y_test,
            attr_train, attr_test,
            conf, j,
            n_train=n_train,
            replace_with_train=[average_reference],
            replace_with_test=[average_reference],
        )

# One column per attribution method, one row per ROAR step.
results_df = pd.DataFrame({
    'zero _perf': delta_performance_a_zero,        # 1: zero baseline
    'ave _perf': delta_performance_a_average,      # 2: average baseline
    'neutral _perf': delta_performance_a_neutral,  # 3: neutral baseline
    'max dist _perf': delta_performance_a_maxdist, # 4: maximum distance
    'pdata _perf': delta_performance_a_pdata,      # 5: p_data
    'random _perf': delta_performance_a_random,    # 6: random control
})

results_df.to_csv('./results/results_ROAR.csv')