## Explanations of RPS-LJE and Influence Function on German Credit Risk Analysis with XGBoost
This notebook produces the results reported in Table 2 and Table 11 (appendix).

In [1]:
import os

import numpy as np
import pandas as pd
import torch

In [2]:
# Load the pre-processed German credit splits from CSV. Every file is read
# with its first column as the DataFrame index (index_col=0).
path = "../data"
# "_res" train split -- presumably the resampled training set; TODO confirm.
X_train_clean_res = pd.read_csv('{}/X_train_clean_res.csv'.format(path), index_col=0)
y_train_clean_res = pd.read_csv('{}/Y_train_clean_res.csv'.format(path), index_col=0)
X_test_clean = pd.read_csv('{}/X_test_clean.csv'.format(path), index_col=0)
y_test_clean = pd.read_csv('{}/Y_test_clean.csv'.format(path), index_col=0)
X_train_clean = pd.read_csv('{}/X_train_clean.csv'.format(path), index_col=0)
y_train_clean = pd.read_csv('{}/Y_train_clean.csv'.format(path), index_col=0)

# Raw dataset and a translated copy; the translated copy is the one looked up
# by record id when building the explanation tables further down.
data = pd.read_csv('{}/german_data.csv'.format(path), index_col=0)
data_translated = pd.read_csv('{}/german_data_translated.csv'.format(path), index_col=0)

In [3]:
# Directory holding the saved base model outputs and the pre-computed weights.
# NOTE: this rebinds `path`, which previously pointed at the data directory.
path = '../saved_models/base'

# Open the influence-function archive once and read both arrays from it
# (it was previously loaded from disk twice).
# NOTE(security): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on archives from a trusted source.
influence_archive = np.load('{}/calculated_weights/influence_weight_matrix.npz'.format(path), allow_pickle=True)
weight_matrix_influence = influence_archive['weight_matrix'].squeeze()
# The key spelling 'jaccobian_test' must match what was written to the archive.
grad_test = influence_archive['jaccobian_test']
weight_matrix_ours = np.load('{}/calculated_weights/ours_weight_matrix_with_lr_0.0001.npz'.format(path), allow_pickle=True)['weight_matrix'].squeeze()

# Saved model outputs: intermediate features (converted to torch tensors),
# labels and predictions for the train and test sets.
file = np.load('{}/model/saved_outputs.npz'.format(path))
intermediate_train = torch.from_numpy(file['intermediate_train'])
intermediate_test = torch.from_numpy(file['intermediate_test'])
labels_train = file['labels_train']
labels_test = file['labels_test']
pred_train = file['pred_train']
pred_test = file['pred_test']

In [4]:
# Display the first rows of the translated dataset as a sanity check.
data_translated.head()


Unnamed: 0_level_0,existingchecking,duration,credithistory,purpose,creditamount,savings,employmentsince,installmentrate,statussex,otherdebtors,...,property,age,otherinstallmentplans,housing,existingcredits,job,peopleliable,telephone,foreignworker,classification
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,poor,6,critical account/ other credits existing (not...,radio/television,1169,unknown/ no savings account,more than 7 years,4,"male, single",none,...,real estate,67,none,own,2,skilled employee / official,1,yes,yes,1
1,little,48,existing credits paid back duly till now,radio/television,5951,little,1 to 4 years,2,"female, divorced/separated/married",none,...,real estate,22,none,own,1,skilled employee / official,1,none,yes,2
2,no checking account,12,critical account/ other credits existing (not...,education,2096,little,4 to 7 years,2,"male, single",none,...,real estate,49,none,own,1,unskilled - resident,2,none,yes,1
3,poor,42,existing credits paid back duly till now,furniture/equipment,7882,little,4 to 7 years,2,"male, single",guarantor,...,building society savings agreement/ life insu...,45,none,for free,1,skilled employee / official,2,none,yes,1
4,poor,24,delay in paying off in the past,car (new),4870,little,1 to 4 years,3,"male, single",none,...,unknown / no property,53,none,for free,2,skilled employee / official,2,none,yes,2


In [5]:
# Flat index arrays of the samples whose saved prediction differs from the
# label (non-zero absolute difference), for the train and the test outputs.
train_abs_error = np.abs(pred_train - labels_train)
test_abs_error = np.abs(pred_test - labels_test)
wrongly_predicted_train_ids = np.argwhere(train_abs_error > 0).flatten()
wrongly_predicted_test_ids = np.argwhere(test_abs_error > 0).flatten()

In [6]:
# Column names of the translated dataset, in file order; the final entry is
# the target column.
names = [
    'existingchecking',
    'duration',
    'credithistory',
    'purpose',
    'creditamount',
    'savings',
    'employmentsince',
    'installmentrate',
    'statussex',
    'otherdebtors',
    'residencesince',
    'property',
    'age',
    'otherinstallmentplans',
    'housing',
    'existingcredits',
    'job',
    'peopleliable',
    'telephone',
    'foreignworker',
    'classification',
]
def get_influence_order(test_point=None):
    """Return training indices sorted by ascending influence-function score.

    The score vector is the product of the test point's saved gradient
    (``grad_test[test_point, 0]``) with the transposed influence weight
    matrix; ``np.argsort`` of that vector is returned.

    Args:
        test_point: Position of the test sample in ``grad_test``.

    Returns:
        Array of indices that sorts the scores in ascending order.
    """
    scores = grad_test[test_point, 0] @ weight_matrix_influence.T
    return np.argsort(scores)

def get_ours_order(test_point=None):
    """Return training indices ordered by the representer-style score.

    Scores are ``weight_matrix_ours`` dotted with the test point's
    intermediate features. The direction of the ordering depends on the test
    label: descending when the label equals 1, ascending otherwise.

    Args:
        test_point: Position of the test sample in ``labels_test`` and
            ``intermediate_test``.

    Returns:
        Array of indices into the score vector in the order described above.
    """
    label = labels_test[test_point]
    scores = np.dot(weight_matrix_ours, intermediate_test[test_point, :])
    ascending = np.argsort(scores)
    if label == 1:
        # Label 1: largest score first.
        return np.flip(ascending, axis=0)
    return ascending

In [7]:
def get_data_by_cleaned_idx(X_cleaned, idx):
    """Look up the translated record behind one row of a cleaned frame.

    Args:
        X_cleaned: Cleaned feature DataFrame that carries an ``'id'`` column.
        idx: Positional row index into ``X_cleaned``.

    Returns:
        Tuple ``(record, record_id)``: ``record_id`` is the row's ``'id'``
        value, and ``record`` is the row of ``data_translated`` at that
        position (positional ``iloc`` lookup).
    """
    record_id = X_cleaned.iloc[idx, :]['id']
    return data_translated.iloc[record_id], record_id

def sort_by_feature_importance(df):
    """Return ``df`` restricted to, and reordered by, a fixed column list.

    The bookkeeping columns (method, type, id, classification) come first,
    followed by the feature columns in a fixed order. Columns of ``df`` that
    are not in the list are dropped; a missing listed column raises
    ``KeyError``.

    Args:
        df: DataFrame containing at least the listed columns.

    Returns:
        A DataFrame with exactly the listed columns, in that order.
    """
    ordered_columns = [
        'method', 'type', 'id', 'classification',
        'existingchecking', 'credithistory', 'savings',
        'otherdebtors', 'employmentsince', 'otherinstallmentplans',
        'housing', 'purpose', 'property', 'duration',
        'creditamount', 'statussex', 'existingcredits',
        'installmentrate', 'residencesince', 'age',
        'job', 'peopleliable', 'telephone',
        'foreignworker',
    ]
    return df[ordered_columns]

In [8]:
def experiment_with_test_data(test_pt, n_examples=3):
    """Tabulate the top-ranked training examples for one test point.

    For both rankings (``get_ours_order`` and ``get_influence_order``) the
    training points listed in ``wrongly_predicted_train_ids`` are dropped and
    the first ``n_examples`` remaining indices are looked up in
    ``X_train_clean_res``. Each table starts with the test point itself.

    Args:
        test_pt: Positional index of the test sample in ``X_test_clean``.
        n_examples: Number of training examples to list per method
            (default 3, the previously hard-coded value).

    Returns:
        Tuple ``(df_ours, df_IF, df_all)``. ``df_ours`` and ``df_IF`` hold the
        test point followed by that method's examples; ``df_all`` is
        ``df_ours`` followed by ``df_IF`` without its (duplicate) test-point
        row. Columns are ordered by ``sort_by_feature_importance``.

    Raises:
        IndexError: If fewer than ``n_examples`` candidates remain after
            filtering.
    """
    columns = ['method', 'type', 'id'] + names

    test_data, data_id = get_data_by_cleaned_idx(X_test_clean, test_pt)
    data_array_ours = [np.concatenate([[' ', 'Test point', data_id], test_data.values])]
    data_array_IF = [np.concatenate([[' ', 'Test point', data_id], test_data.values])]

    # Build the exclusion set once: `i in ndarray` scans the whole array for
    # every candidate, whereas set membership is O(1).
    misclassified_train = set(wrongly_predicted_train_ids.tolist())
    ours_idx_pos_in_res = [i for i in get_ours_order(test_pt)
                           if i not in misclassified_train]
    inf_idx_pos_in_res = [i for i in get_influence_order(test_pt)
                          if i not in misclassified_train]

    for i in range(n_examples):
        rank_label = 'Positive {}'.format(i + 1)

        our_pos_data, data_id = get_data_by_cleaned_idx(X_train_clean_res, ours_idx_pos_in_res[i])
        # NOTE(review): the label 'REP-LJE' differs from the "RPS-LJE" used in
        # the notebook title; kept unchanged because consumers of the CSV may
        # match on it -- confirm before renaming.
        data_array_ours.append(np.concatenate([['REP-LJE', rank_label, data_id],
                                               our_pos_data.values]))

        inf_pos_data, data_id = get_data_by_cleaned_idx(X_train_clean_res, inf_idx_pos_in_res[i])
        data_array_IF.append(np.concatenate([['Influence function', rank_label, data_id],
                                             inf_pos_data.values]))

    df_ours = sort_by_feature_importance(
        pd.DataFrame(data=data_array_ours, columns=columns))
    df_IF = sort_by_feature_importance(
        pd.DataFrame(data_array_IF, columns=columns))
    # Skip the first row of df_IF: it repeats the test point already in df_ours.
    df_all = pd.concat([df_ours, df_IF.iloc[1:, :]])
    return df_ours, df_IF, df_all


In [11]:
# Build the explanation table for each selected test point and stack them
# into a single CSV.
df_all_list = []
for i in [8, 94, 84, 56, 0, 32]:
    df_ours, df_IF, df_all = experiment_with_test_data(i)
    df_all_list.append(df_all)
df_all_stack = pd.concat(df_all_list)
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('results', exist_ok=True)
df_all_stack.to_csv('results/German_credit.csv')
