# Explain model Dalex

# Break-down Plots

In [1]:
# Preprocess data
import pandas as pd
import numpy as np
# Visual data
import matplotlib.pyplot as plt
import seaborn as sns
# ML
import sklearn as sk
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import pipeline
from sklearn import compose
from sklearn import metrics
# Explainn model
import dalex as dx

In [2]:
import dalex as dx
titanic = dx.datasets.load_titanic()
X = titanic.drop(columns='survived')
y = titanic.survived

In [3]:
titanic

Unnamed: 0,gender,age,class,embarked,fare,sibsp,parch,survived
0,male,42.0,3rd,Southampton,7.11,0,0,0
1,male,13.0,3rd,Southampton,20.05,0,2,0
2,male,16.0,3rd,Southampton,20.05,1,1,0
3,female,39.0,3rd,Southampton,20.05,1,1,1
4,female,16.0,3rd,Southampton,7.13,0,0,1
...,...,...,...,...,...,...,...,...
2202,male,41.0,deck crew,Belfast,0.00,0,0,1
2203,male,40.0,victualling crew,Southampton,0.00,0,0,1
2204,male,32.0,engineering crew,Southampton,0.00,0,0,0
2205,male,20.0,restaurant staff,Southampton,0.00,0,0,0


In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

preprocess = make_column_transformer(
    (StandardScaler(), ['age', 'fare', 'parch', 'sibsp']),
    (OneHotEncoder(), ['gender', 'class', 'embarked']))

In [5]:
from sklearn.linear_model import LogisticRegression

titanic_lr = make_pipeline(
    preprocess,
    LogisticRegression(penalty = 'l2'))
titanic_lr.fit(X, y)

from sklearn.ensemble import RandomForestClassifier

titanic_rf = make_pipeline(
    preprocess,
    RandomForestClassifier(max_depth = 3, n_estimators = 500))
titanic_rf.fit(X, y)

from sklearn.ensemble import GradientBoostingClassifier

titanic_gbc = make_pipeline(
    preprocess,
    GradientBoostingClassifier(n_estimators = 100))
titanic_gbc.fit(X, y)

from sklearn.svm import SVC

titanic_svm = make_pipeline(
    preprocess,
    SVC(probability = True))
titanic_svm.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['age', 'fare', 'parch',
                                                   'sibsp']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['gender', 'class',
                                                   'embarked'])])),
                ('svc', SVC(probability=True))])

In [6]:
henry = pd.DataFrame({'gender'   : ['male'],
                       'age'     : [47],
                       'class'   : ['1st'],
                       'embarked': ['Cherbourg'],
                       'fare'    : [25],
                       'sibsp'   : [0],
                       'parch'   : [0]},
                      index = ['Henry'])
titanic_lr.predict_proba(henry)
# array([[0.56798421 0.43201579]])
titanic_rf.predict_proba(henry)
# array([[0.69917845 0.30082155]])
titanic_gbc.predict_proba(henry)
# array([[0.78542886 0.21457114]])
titanic_svm.predict(henry)
# array([[0.81725832 0.18274168]])

array([0], dtype=int64)

In [7]:
titanic_rf_exp = dx.Explainer(titanic_rf, X, y, 
                  label = "Titanic RF Pipeline")

Preparation of a new explainer is initiated

  -> data              : 2207 rows 7 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 2207 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : Titanic RF Pipeline
  -> predict function  : <function yhat_proba_default at 0x000001AD89EFB0D0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.164, mean = 0.322, max = 0.882
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.829, mean = -0.000222, max = 0.833
  -> model_info        : package sklearn

A new explainer has been created!


In [8]:
bd_henry = titanic_rf_exp.predict_parts(henry, 
             type = 'break_down')
bd_henry.result

Unnamed: 0,variable_name,variable_value,variable,cumulative,contribution,sign,position,label
0,intercept,1,intercept,0.322379,0.322379,1.0,8,Titanic RF Pipeline
1,class,1st,class = 1st,0.38979,0.067412,1.0,7,Titanic RF Pipeline
2,embarked,Cherbourg,embarked = Cherbourg,0.417352,0.027562,1.0,6,Titanic RF Pipeline
3,fare,25.0,fare = 25.0,0.429627,0.012275,1.0,5,Titanic RF Pipeline
4,sibsp,0.0,sibsp = 0.0,0.429737,0.00011,1.0,4,Titanic RF Pipeline
5,parch,0.0,parch = 0.0,0.425002,-0.004735,-1.0,3,Titanic RF Pipeline
6,age,47.0,age = 47.0,0.422557,-0.002446,-1.0,2,Titanic RF Pipeline
7,gender,male,gender = male,0.314105,-0.108451,-1.0,1,Titanic RF Pipeline
8,,,prediction,0.314105,0.314105,1.0,0,Titanic RF Pipeline


In [9]:
def configure_plotly_browser_state():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))

import IPython # Config IPython run in vevery cell
IPython.get_ipython().events.register('pre_run_cell', configure_plotly_browser_state)

In [None]:
bd_henry.plot()