# 3.8.2 ELI5 Model Interpretations
#### for both monotonic and non-monotonic models

## 1) Load Required Libraries

In [1]:
import os
import csv
import numpy as np
import pandas as pd 
import xgboost as xgb
import eli5
from eli5 import show_weights, show_prediction
from sklearn.feature_extraction import DictVectorizer

Change some pandas settings

In [2]:
# Raise pandas' display limits so long/wide frames are shown in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

Set some path constants

In [3]:
# Both directories are resolved relative to the current working directory.
_cwd = os.getcwd()
saved_models_path = os.path.join(_cwd, 'models')  # serialized xgboost models
datasets_path = os.path.join(_cwd, 'data')        # CSV datasets

## 2) Load Data

### 2.1) Load One-hot encoded version

In [4]:
# Read the one-hot encoded train/test splits from the data directory.
train_csv_path = os.path.join(datasets_path, 'new.train.1hot.df.csv')
test_csv_path = os.path.join(datasets_path, 'new.test.1hot.df.csv')

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

### 2.2) Some Minor Data Prep

Drop useless columns

In [5]:
# Remove the "Unnamed: 0" column from both frames.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising KeyError.
train_df = train_df.drop(columns=["Unnamed: 0"], errors="ignore")
test_df = test_df.drop(columns=["Unnamed: 0"], errors="ignore")

Reorder the columns so that `default` (the label) comes first

In [6]:
# Target column order: the `default` label first, then the 43 feature columns.
# The same list is applied to both splits so their layouts stay identical.
ordered_columns = [
    'default',
    'loan_amnt', 'term_36MO', 'term_60MO', 'int_rate', 'installment',
    'grade_A', 'grade_B', 'grade_C', 'grade_D', 'grade_E', 'grade_F', 'grade_G',
    'home_ownership_MORTGAGE', 'home_ownership_OWN', 'home_ownership_RENT',
    'annual_inc',
    'verification_status_NOT_VERIFIED', 'verification_status_SOURCE_VERIFIED',
    'verification_status_VERIFIED',
    'purpose_CREDIT_CARD', 'purpose_DEBT_CONSOLIDATION', 'purpose_HOME_IMPROVEMENT',
    'purpose_MAJOR_PURCHASE', 'purpose_OTHERS', 'purpose_SMALL_BUSINESS',
    'dti', 'open_acc', 'pub_rec', 'total_acc',
    'initial_list_status_F', 'initial_list_status_W',
    'compensation_of_employees', 'gross_operating_surplus',
    'per_capita_real_gdp_by_state', 'quantity_indexes_for_real_gdp_by_state',
    'real_gdp_by_state', 'subsidies', 'taxes_on_production_and_imports',
    'regions_MIDWEST', 'regions_NORTHEAST', 'regions_SOUTH', 'regions_WEST',
    'month',
]
train_df = train_df[ordered_columns]
test_df = test_df[ordered_columns]

See summary stats to make sure data has no issues

In [7]:
# Summary statistics, transposed so each column of train_df becomes a row.
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
loan_amnt,204640.0,13522.286699,8128.199436,500.0,7200.0,12000.0,18200.0,35000.0
term_36MO,204640.0,0.77737,0.416013,0.0,1.0,1.0,1.0,1.0
term_60MO,204640.0,0.22263,0.416013,0.0,0.0,0.0,0.0,1.0
int_rate,204640.0,13.780552,4.387821,5.32,10.74,13.61,16.55,28.99
installment,204640.0,416.810278,244.639684,15.69,238.06,364.38,546.21,1424.57
grade_A,204640.0,0.165618,0.371738,0.0,0.0,0.0,0.0,1.0
grade_B,204640.0,0.297601,0.457204,0.0,0.0,0.0,1.0,1.0
grade_C,204640.0,0.258556,0.437842,0.0,0.0,0.0,1.0,1.0
grade_D,204640.0,0.161249,0.367761,0.0,0.0,0.0,0.0,1.0
grade_E,204640.0,0.077302,0.26707,0.0,0.0,0.0,0.0,1.0


Check for missing values

In [8]:
# Per-column flag: True if the column contains at least one missing value.
train_df.isnull().any()

default                                   False
loan_amnt                                 False
term_36MO                                 False
term_60MO                                 False
int_rate                                  False
installment                               False
grade_A                                   False
grade_B                                   False
grade_C                                   False
grade_D                                   False
grade_E                                   False
grade_F                                   False
grade_G                                   False
home_ownership_MORTGAGE                   False
home_ownership_OWN                        False
home_ownership_RENT                       False
annual_inc                                False
verification_status_NOT_VERIFIED          False
verification_status_SOURCE_VERIFIED       False
verification_status_VERIFIED              False
purpose_CREDIT_CARD                     

### 2.3) Convert Datasets to Numpy Arrays
#### (they are all numeric so this should work fine)

In [9]:
# `train` / `test` are aliases (not copies) of train_df / test_df, so the
# integer cast of the `default` label below is visible through both names.
train = train_df
test = test_df
train["default"] = train["default"].astype(int)
test["default"] = test["default"].astype(int)

In [10]:
# Convert both frames to 2-D NumPy arrays.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# `.values` returns the same array and works on old and new pandas alike.
# Reading from train_df / test_df (the frames `train` / `test` alias in the
# previous cell) lets this cell be re-run without failing on an ndarray.
train = train_df.values
test = test_df.values

  """Entry point for launching an IPython kernel.
  


Make sure they are all the right shape

In [11]:
# (rows, columns) of the training array; 44 columns = label + 43 features.
train.shape

(204640, 44)

In [12]:
# (rows, columns) of the test array; the column count should match train.
test.shape

(51159, 44)

### 2.4) Generate Y/X Sets for all versions

In [50]:
# The 43 predictor columns (everything except the `default` label), listed
# once and applied to both splits.
feature_columns = [
    'loan_amnt', 'term_36MO', 'term_60MO', 'int_rate', 'installment',
    'grade_A', 'grade_B', 'grade_C', 'grade_D', 'grade_E', 'grade_F', 'grade_G',
    'home_ownership_MORTGAGE', 'home_ownership_OWN', 'home_ownership_RENT',
    'annual_inc',
    'verification_status_NOT_VERIFIED', 'verification_status_SOURCE_VERIFIED',
    'verification_status_VERIFIED',
    'purpose_CREDIT_CARD', 'purpose_DEBT_CONSOLIDATION', 'purpose_HOME_IMPROVEMENT',
    'purpose_MAJOR_PURCHASE', 'purpose_OTHERS', 'purpose_SMALL_BUSINESS',
    'dti', 'open_acc', 'pub_rec', 'total_acc',
    'initial_list_status_F', 'initial_list_status_W',
    'compensation_of_employees', 'gross_operating_surplus',
    'per_capita_real_gdp_by_state', 'quantity_indexes_for_real_gdp_by_state',
    'real_gdp_by_state', 'subsidies', 'taxes_on_production_and_imports',
    'regions_MIDWEST', 'regions_NORTHEAST', 'regions_SOUTH', 'regions_WEST',
    'month',
]
label_column = ['default']

# DataFrame-based splits: column names are kept on the features, and the
# labels stay single-column DataFrames.
x_train = train_df[feature_columns]
y_train = train_df[label_column]
x_test = test_df[feature_columns]
y_test = test_df[label_column]

In [20]:
# Positional NumPy slices of the same data: column 0 is the `default` label,
# columns 1-43 are the features.
# These use their own *_arr names so that a top-to-bottom run does not
# overwrite the DataFrame-based x_train / y_train / x_test / y_test; the
# DMatrix objects and the `x_test.iloc[...]` calls further down need the
# DataFrame versions (column names and .iloc are lost on plain arrays).
x_train_arr = train[:, 1:44]
y_train_arr = train[:, 0:1]
x_test_arr = test[:, 1:44]
y_test_arr = test[:, 0:1]

### 2.5) Create DMatrixes

In [51]:
# Wrap each split in an xgboost DMatrix: features as data, `default` as label.
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test, label=y_test)

## 3) Load Models

### 3.1) Monotonic 

First tried loading model...

In [25]:
# Create an empty Booster and load the saved monotonic model file into it.
mono_model_file = os.path.join(saved_models_path, 'new.xgb.mono.opt.costopt.mdl')
mono_mdl = xgb.Booster()
mono_mdl.load_model(mono_model_file)

In [26]:
# Name the loaded booster's features after the 43 predictor columns
# (positions 1-43 of train_df; position 0 is the `default` label).
# Converted to a plain list of str rather than a pandas Index, which is the
# safest form to hand to xgboost's feature_names setter across versions.
mono_mdl.feature_names = list(train_df.columns[1:44])

...but the model would not load correctly because of discrepancies between the R and Python saved-model formats, so I retrained it with exactly the same parameters and data:

In [52]:
# Hyper-parameters for the monotonic gradient-boosted tree model.
mono_params = {
    "booster": "gbtree",
    "max_depth": 5,
    "min_child_weight": 2,
    "eta": 0.170257653004955,
    "gamma": 0.393479593470693,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "objective": "binary:logistic",
    "eval_metric": "aucpr",
    # One constraint sign per feature column (43 entries in total).
    "monotone_constraints": "(1, -1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, -1, -1)",
}

# Retrain on the training DMatrix for 153 boosting rounds.
mono_mdl2 = xgb.train(mono_params, dtrain, num_boost_round=153)

### 3.2) Non-Monotonic 

First tried loading model...

In [27]:
# Create an empty Booster and load the saved non-monotonic model file into it.
nonmono_model_file = os.path.join(saved_models_path, 'new.xgb.opt.costopt.mdl')
nonmono_mdl = xgb.Booster()
nonmono_mdl.load_model(nonmono_model_file)

In [28]:
# Name the loaded booster's features after the 43 predictor columns
# (positions 1-43 of train_df; position 0 is the `default` label).
# Converted to a plain list of str rather than a pandas Index, which is the
# safest form to hand to xgboost's feature_names setter across versions.
nonmono_mdl.feature_names = list(train_df.columns[1:44])

...but the model would not load correctly because of discrepancies between the R and Python saved-model formats, so I retrained it with exactly the same parameters and data:

In [53]:
# Hyper-parameters for the unconstrained (non-monotonic) gradient-boosted
# tree model; no monotone_constraints entry is set.
nonmono_params = {
    "booster": "gbtree",
    "max_depth": 5,
    "min_child_weight": 2,
    "eta": 0.0691802625378494,
    "gamma": 0.193399890288106,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "objective": "binary:logistic",
    "eval_metric": "aucpr",
}

# Retrain on the training DMatrix for 256 boosting rounds.
nonmono_mdl2 = xgb.train(nonmono_params, dtrain, num_boost_round=256)

## 4) Explain Weights

### 4.1) Monotonic 

In [55]:
# Global explanation of the retrained monotonic model: per-feature weights.
mono_weights = eli5.explain_weights_xgboost(mono_mdl2)
mono_weights

Weight,Feature
0.3786,grade_A
0.1192,grade_B
0.1166,int_rate
0.1059,term_60MO
0.0716,term_36MO
0.0217,annual_inc
0.0204,purpose_SMALL_BUSINESS
0.0161,home_ownership_RENT
0.0142,dti
0.0101,regions_WEST


### 4.2) Non-Monotonic 

In [56]:
# Global explanation of the retrained non-monotonic model: per-feature weights.
nonmono_weights = eli5.explain_weights_xgboost(nonmono_mdl2)
nonmono_weights

Weight,Feature
0.3129,grade_A
0.1208,grade_B
0.0766,int_rate
0.0737,term_60MO
0.0600,term_36MO
0.0490,grade_C
0.0186,home_ownership_RENT
0.0182,regions_WEST
0.0171,annual_inc
0.0155,grade_F


## 5) Explain Inference Time Predictions

### 5.1) Monotonic

In [60]:
# Explain the monotonic model's prediction for the test row at positional
# index 1, listing each feature's value next to its contribution.
test_row = x_test.iloc[1]
show_prediction(mono_mdl2, test_row, show_feature_values=True)

Contribution?,Feature,Value
0.59,purpose_SMALL_BUSINESS,1.0
0.11,month,3.0
0.075,open_acc,16.0
0.061,quantity_indexes_for_real_gdp_by_state,335.0
0.041,grade_B,0.0
0.04,grade_A,0.0
0.032,initial_list_status_F,1.0
0.032,dti,18.58
0.03,real_gdp_by_state,310.0
0.027,purpose_CREDIT_CARD,0.0


### 5.2) Non-Monotonic

In [61]:
# Explain the non-monotonic model's prediction for the test row at positional
# index 1, listing each feature's value next to its contribution.
test_row = x_test.iloc[1]
show_prediction(nonmono_mdl2, test_row, show_feature_values=True)

Contribution?,Feature,Value
0.572,purpose_SMALL_BUSINESS,1.0
0.215,taxes_on_production_and_imports,4.0
0.185,gross_operating_surplus,198.0
0.154,month,3.0
0.09,per_capita_real_gdp_by_state,445.0
0.08,verification_status_NOT_VERIFIED,1.0
0.076,open_acc,16.0
0.046,dti,18.58
0.043,grade_A,0.0
0.038,grade_B,0.0
