In [None]:
import os
# Run the notebook from the project root, taken from the PROJECT_ROOT
# environment variable (a KeyError here means the variable is not set).
os.chdir(os.environ['PROJECT_ROOT'])

# Enable IPython's autoreload extension in mode 2 so edited modules are
# reloaded without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pathlib
import glob

## ELI5
https://eli5.readthedocs.io/en/latest/

**Example from: https://eli5.readthedocs.io/en/latest/tutorials/xgboost-titanic.html**

In [None]:
# Location of the Titanic CSV used by the ELI5 example. Set the TITANIC_CSV
# environment variable to point at a different copy; when it is unset the
# original local path is used, so existing setups keep working unchanged.
titanic_file = os.environ.get(
    'TITANIC_CSV',
    'D:/Install/miniconda/envs/logml/lib/data/titanic.csv',
)

In [None]:
import csv

# Read every CSV row into a dict keyed by the header line.
# newline='' is how the csv module documentation says a file must be opened
# before it is handed to a reader, so the reader itself handles line endings
# (including newlines embedded in quoted fields).
with open(titanic_file, 'rt', newline='') as f:
    data = list(csv.DictReader(f))
# Show the first record to check the column names and raw string values.
data[:1]

In [None]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Feature dicts: every CSV record with its 'Survived' label removed.
_all_xs = [
    {name: value for name, value in record.items() if name != 'Survived'}
    for record in data
]
# Labels: the 'Survived' column parsed as integers.
_all_ys = np.array([int(record['Survived']) for record in data])

# Shuffle features and labels together, then hold out 25% for validation.
# Both steps use a fixed random_state so the split is repeatable.
all_xs, all_ys = shuffle(_all_xs, _all_ys, random_state=0)
train_xs, valid_xs, train_ys, valid_ys = train_test_split(
    all_xs,
    all_ys,
    test_size=0.25,
    random_state=0,
)
print('{} items total, {:.1%} true'.format(len(all_xs), np.mean(all_ys)))

In [None]:
# Convert the numeric columns from CSV strings to numbers, in place.
# NOTE(review): later cells fit on train_xs / valid_xs without converting them
# separately, so the notebook presumably relies on those lists holding the
# same dict objects as all_xs — verify if the split logic changes.
#
# The loop is written so the cell can be re-run safely: a row whose 'Age' was
# already removed (or already converted) no longer raises KeyError.
for x in all_xs:
    age = x.get('Age')
    if age in (None, ''):
        # Missing age: drop the key so the row simply has no 'Age' feature.
        x.pop('Age', None)
    else:
        x['Age'] = float(age)
    x['Fare'] = float(x['Fare'])
    x['SibSp'] = int(x['SibSp'])
    x['Parch'] = int(x['Parch'])

In [None]:
from lightgbm import LGBMClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# LightGBM classifier with its default hyperparameters.
clf = LGBMClassifier()
# Vectorizer that turns the list of feature dicts into a feature matrix.
vec = DictVectorizer()
# Chain the two steps; clf and vec are kept as separate names because the
# ELI5 cells below pass them individually.
pipeline = make_pipeline(vec, clf)

def evaluate(_clf):
    """Print the 10-fold cross-validated accuracy of ``_clf``, then fit it.

    Scores are computed on the notebook-level ``all_xs`` / ``all_ys`` and
    reported as mean ± two standard deviations. Afterwards ``_clf`` is fitted
    on ``train_xs`` / ``train_ys``. Nothing is returned.
    """
    fold_scores = cross_val_score(_clf, all_xs, all_ys, scoring='accuracy', cv=10)
    mean_accuracy = np.mean(fold_scores)
    spread = 2 * np.std(fold_scores)
    print('Accuracy: {:.3f} ± {:.3f}'.format(mean_accuracy, spread))
    # Fit on the training split so that the parts of the original pipeline
    # passed in end up fitted for the explanation cells that follow.
    _clf.fit(train_xs, train_ys)

# Report cross-validated accuracy and leave `pipeline` fitted on the training split.
evaluate(pipeline)

**Global explainability (feature importance)**

In [None]:
from eli5 import show_weights
# Feature-importance table for the fitted classifier using the 'gain'
# importance type; vec is passed so eli5 can work with the DictVectorizer.
show_weights(clf, vec=vec, importance_type='gain')

In [None]:
# Same table with the 'weight' importance type, for comparison with 'gain'.
show_weights(clf, vec=vec, importance_type='weight')

**Local explainability**

In [None]:
from eli5 import show_prediction
# Explain the classifier's prediction for the validation row at index 1,
# displaying each feature's value next to its contribution.
show_prediction(clf, valid_xs[1], vec=vec, show_feature_values=True)

In [None]:
# Same explanation for a second validation row (index 110).
show_prediction(clf, valid_xs[110], vec=vec, show_feature_values=True)

## SHAP
https://github.com/slundberg/shap

In [None]:
import shap
from lightgbm import LGBMRegressor

# load JS visualization code to notebook
shap.initjs()

# train a LightGBM regressor (default hyperparameters) on the Boston dataset
# bundled with shap
# NOTE(review): newer shap releases may no longer ship datasets.boston() —
# confirm this cell still runs with the installed shap version.
X, y = shap.datasets.boston()
model = LGBMRegressor().fit(X, y)

# explain the model's predictions using SHAP
# (per the SHAP README, the same syntax also works for XGBoost, CatBoost,
# scikit-learn and spark models)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

**Local explainability**

In [None]:
# Force plot explaining a single prediction.
# shap_values is indexed by position, so the matching feature row is also
# taken by position (iloc) rather than by index label; the two stay aligned
# even if X's index is not the default 0..n-1 range.
row_idx = 107
shap.force_plot(
    explainer.expected_value,
    shap_values[row_idx],
    X.iloc[row_idx, :],
    feature_names=X.columns,
)

**Global explainability**

In [None]:
# Summary plot of the SHAP values computed above for every row of X.
shap.summary_plot(shap_values, X)

In [None]:
# LightGBM's own 'split' feature importance, paired with the column names and
# listed from the largest value to the smallest, for comparison with the SHAP view.
split_counts = model.booster_.feature_importance(importance_type='split')
ranked = sorted(zip(split_counts, X.columns), key=lambda pair: -pair[0])
print(ranked)

In [None]:
import seaborn as sns

# Target against the LSTAT feature. x and y are passed by keyword because
# seaborn (0.12+) no longer accepts them as positional arguments.
sns.scatterplot(x=X['LSTAT'], y=y)

In [None]:
import seaborn as sns

# Target against the RM feature. x and y are passed by keyword because
# seaborn (0.12+) no longer accepts them as positional arguments.
sns.scatterplot(x=X['RM'], y=y)