In [3]:
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import shap


In [4]:
# Read the training and holdout splits from CSV files in the parent directory.
Xtrain, Xtest = (
    pd.read_csv(csv_path) for csv_path in ('../train.csv', '../holdout.csv')
)

In [5]:
# Re-label the holdout rows so their index starts right after the largest
# training index; the two frames then share no index labels.
first_holdout_label = max(Xtrain.index) + 1
Xtest.index = first_holdout_label + np.arange(len(Xtest))

In [6]:
# Inspect the holdout frame after re-indexing (pandas renders a truncated preview).
Xtest

Unnamed: 0,age,work_class,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
17099,19,Private,High School grad,Never-Married,Sales,Own-child,White,Female,0.0,0.0,7.0,United-States,<=50K
17100,49,Private,Masters,Married,White-Collar,Husband,White,Male,15024.0,0.0,80.0,United-States,>50K
17101,50,Private,High School grad,Married,Blue-Collar,Husband,White,Male,3103.0,0.0,40.0,United-States,>50K
17102,39,Local-gov,Masters,Never-Married,Professional,Not-in-family,White,Female,0.0,0.0,50.0,United-States,<=50K
17103,26,Private,High School grad,Married,Admin,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30666,32,Private,Bachelors,Never-Married,Sales,Own-child,White,Male,13550.0,0.0,35.0,United-States,>50K
30667,35,Private,Bachelors,Married,Sales,Husband,White,Male,7298.0,0.0,48.0,United-States,>50K
30668,42,Private,High School grad,Married,Admin,Wife,White,Female,0.0,0.0,36.0,United-States,>50K
30669,41,Private,Bachelors,Separated,Admin,Unmarried,White,Female,0.0,0.0,33.0,United-States,<=50K


In [7]:
# Names of every training column stored with the `object` dtype. The target
# column `income` is also object-typed, so it appears in this list as well.
cat_feat = [
    column for column, column_dtype in Xtrain.dtypes.items()
    if column_dtype == 'object'
]
cat_feat

['work_class',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country',
 'income']

In [8]:
# Fit a 100-tree CatBoost classifier that predicts `income` from all other
# training columns, treating the object-typed columns as categorical features.
#
# The target is filtered out of `cat_feat` by rebinding the name rather than
# with `cat_feat.remove('income')`: `list.remove` raises ValueError when the
# item is already gone, which made this cell fail whenever it was re-run.
# The filter leaves `cat_feat` in the same final state and is safe to repeat.
cat_feat = [column for column in cat_feat if column != 'income']
clf = CatBoostClassifier(cat_features=cat_feat,
                         n_estimators=100,
                         verbose=False).fit(Xtrain.drop(['income'],axis=1),
                                            Xtrain['income'])

In [9]:
# Score the fitted classifier on the holdout split and print the per-class
# precision / recall / F1 summary.
holdout_predictions = clf.predict(Xtest.drop(columns=['income']))
print(classification_report(Xtest['income'], holdout_predictions))

              precision    recall  f1-score   support

       <=50K       0.89      0.94      0.91     10232
        >50K       0.78      0.64      0.70      3340

    accuracy                           0.87     13572
   macro avg       0.83      0.79      0.81     13572
weighted avg       0.86      0.87      0.86     13572



In [12]:
# Column labels of the model inputs: every holdout column except the target.
# (`shap` is already imported in the notebook's first cell, so the duplicate
# mid-notebook import that used to sit here has been dropped.)
feature_names=Xtest.drop(['income'],axis=1).columns
feature_names

Index(['age', 'work_class', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
       'hours_per_week', 'native_country'],
      dtype='object')

In [13]:
# Build a SHAP tree explainer for the fitted classifier and compute SHAP
# values for the holdout rows (target column excluded).
holdout_features = Xtest.drop(columns=['income'])
explainer = shap.TreeExplainer(clf, feature_names=holdout_features.columns)
shap_values = explainer.shap_values(holdout_features)

In [13]:
# Inspect the raw SHAP values computed for the holdout rows.
shap_values

array([[ 0.62023317, -0.025023  , -0.26420791, ..., -0.03816325,
        -0.02147674,  0.01325644],
       [ 0.71760503,  0.00803673,  0.77530535, ..., -0.04542148,
         0.02218287,  0.01325145],
       [ 0.83769247,  0.26514419,  1.68334031, ..., -0.04507616,
         0.745832  ,  0.01150284],
       ...,
       [ 0.62043321, -0.02287208, -0.1585457 , ..., -0.06468847,
        -0.14177433,  0.01674966],
       [ 0.63780991,  0.02119963,  0.77456633, ..., -0.04418947,
        -0.56468339,  0.00984906],
       [-1.84626109, -0.1321492 ,  0.79860245, ...,  2.17667834,
         0.00635044,  0.01077901]])

In [17]:
# Load SHAP's JavaScript support for notebook plots, then draw the force plot
# for the holdout row at position 50, labelled with the model's input columns.
shap.initjs()
row_position = 50
shap.force_plot(
    explainer.expected_value,
    shap_values[row_position],
    feature_names=Xtest.drop(columns=['income']).columns,
)

In [20]:
# Force plot over the first 500 holdout rows, labelled with the model's
# input columns.
shap.force_plot(
    explainer.expected_value,
    shap_values[:500],
    feature_names=Xtest.drop(columns=['income']).columns,
)