In [1]:
import pandas as pd
import numpy as np
import time
import warnings
import shap
from statwolfml import Builder
from ACME import ACME

In [2]:
import warnings
warnings.filterwarnings("ignore")

## Full Boston dataset

In [3]:
from sklearn import datasets

# Load the Boston housing data as a Bunch exposing `data`, `target` and
# `feature_names`.
try:
    boston = datasets.load_boston()
except (ImportError, AttributeError):
    # `load_boston` was removed in scikit-learn 1.2 (accessing it raises
    # ImportError). Rebuild the same arrays from the original CMU source using
    # the parsing recipe given in scikit-learn's own deprecation notice.
    # NOTE: this fallback needs network access.
    from sklearn.utils import Bunch

    raw_boston = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    # Each record spans two physical rows in the raw file: 11 values on the
    # first row, then 2 more features followed by the target on the second.
    boston = Bunch(
        data=np.hstack([raw_boston.values[::2, :], raw_boston.values[1::2, :2]]),
        target=raw_boston.values[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
    )

X = boston.data
y = boston.target

# Single frame holding the features plus the regression target, which is the
# layout the model training and explainer cells below consume.
dataframe = pd.DataFrame(X, columns=boston.feature_names)
dataframe['target'] = y

In [4]:
# Column labels of every predictor, i.e. all columns except the target.
features = dataframe.columns.drop('target')

## Train different models

In [5]:
# Identifiers handed to statwolfml's Builder, one per model to train.
models_name = [
    'linear_regression',
    'random_forest_regressor',
    'cat_boost_regressor',
    'sgd_regressor',
    'svr',
]

# Build each model on the feature columns with 'target' as label, train it on
# the full frame, and keep the trained object keyed by its identifier.
models = {}
for model_name in models_name:
    models[model_name] = (
        Builder(model_name)
        .feature_names(features)
        .labels(['target'])
        .build()
        .train(dataframe)
    )

### Compare ACME and SHAP results

### BUILDING 

In [6]:
# `time_elapsed` collects one duration per explainer/model pair.
# `time_start` is initialised here too, but every timing cell below rebinds it
# to a timestamp before reading it.
time_start, time_elapsed = {}, {}

#### Linear Regression

In [7]:
# Time how long ACME takes to fit on the linear regression model.
# time.clock() was removed in Python 3.8; time.perf_counter() is the supported
# high-resolution clock for measuring elapsed time.
time_start = time.perf_counter()
acme_lr = ACME(models['linear_regression'].base_model()['model'], 'target').fit(dataframe)
time_elapsed['ACME_LR'] = time.perf_counter() - time_start

In [8]:
# Time Kernel SHAP on the linear regression model, using the full feature
# frame both as background data and as the samples to explain.
# time.clock() was removed in Python 3.8; time.perf_counter() is the supported
# high-resolution clock for measuring elapsed time.
time_start = time.perf_counter()
lr_inputs = dataframe.drop(columns='target')
shap_lr = shap.KernelExplainer(models['linear_regression'].base_model()['model'].predict, lr_inputs)
shap_lr_values = shap_lr.shap_values(lr_inputs)
time_elapsed['SHAP_LR'] = time.perf_counter() - time_start

Using 506 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


HBox(children=(FloatProgress(value=0.0, max=506.0), HTML(value='')))




#### Random Forest

In [10]:
# Time how long ACME takes to fit on the random forest model.
# time.clock() was removed in Python 3.8; time.perf_counter() is the supported
# high-resolution clock for measuring elapsed time.
time_start = time.perf_counter()
acme_rf = ACME(models['random_forest_regressor'].base_model()['model'], 'target').fit(dataframe)
time_elapsed['ACME_RF'] = time.perf_counter() - time_start

In [11]:
# Time Kernel SHAP on the random forest model, using the full feature frame
# both as background data and as the samples to explain.
# time.clock() was removed in Python 3.8; time.perf_counter() is the supported
# high-resolution clock for measuring elapsed time.
time_start = time.perf_counter()
rf_inputs = dataframe.drop(columns='target')
shap_rf = shap.KernelExplainer(models['random_forest_regressor'].base_model()['model'].predict, rf_inputs)
shap_rf_values = shap_rf.shap_values(rf_inputs)
time_elapsed['SHAP_RF'] = time.perf_counter() - time_start

Using 506 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


HBox(children=(FloatProgress(value=0.0, max=506.0), HTML(value='')))




#### CatBoost Regressor

In [12]:
# Time how long ACME takes to fit on the CatBoost model.
# time.clock() was removed in Python 3.8; time.perf_counter() is the supported
# high-resolution clock for measuring elapsed time.
time_start = time.perf_counter()
acme_ct = ACME(models['cat_boost_regressor'].base_model()['model'], 'target').fit(dataframe)
time_elapsed['ACME_CT'] = time.perf_counter() - time_start

In [None]:
# Time Kernel SHAP on the CatBoost model, using the full feature frame both
# as background data and as the samples to explain.
# time.clock() was removed in Python 3.8; time.perf_counter() is the supported
# high-resolution clock for measuring elapsed time.
time_start = time.perf_counter()
ct_inputs = dataframe.drop(columns='target')
shap_ct = shap.KernelExplainer(models['cat_boost_regressor'].base_model()['model'].predict, ct_inputs)
shap_ct_values = shap_ct.shap_values(ct_inputs)
time_elapsed['SHAP_CT'] = time.perf_counter() - time_start

Using 506 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


HBox(children=(FloatProgress(value=0.0, max=506.0), HTML(value='')))

#### SVR

In [None]:
# Time how long ACME takes to fit on the SVR model.
# time.clock() was removed in Python 3.8; time.perf_counter() is the supported
# high-resolution clock for measuring elapsed time.
time_start = time.perf_counter()
acme_svr = ACME(models['svr'].base_model()['model'], 'target').fit(dataframe)
time_elapsed['ACME_SVR'] = time.perf_counter() - time_start

In [None]:
# Time Kernel SHAP on the SVR model, using the full feature frame both as
# background data and as the samples to explain.
# time.clock() was removed in Python 3.8; time.perf_counter() is the supported
# high-resolution clock for measuring elapsed time.
time_start = time.perf_counter()
svr_inputs = dataframe.drop(columns='target')
shap_svr = shap.KernelExplainer(models['svr'].base_model()['model'].predict, svr_inputs)
shap_svr_values = shap_svr.shap_values(svr_inputs)
time_elapsed['SHAP_SVR'] = time.perf_counter() - time_start

### VISUAL COMPARISON

In [None]:
# Display the durations recorded by the timing cells above, keyed by
# '<EXPLAINER>_<MODEL>' (e.g. 'ACME_LR', 'SHAP_LR').
time_elapsed

### Linear Regression


In [None]:
# SHAP summary plot for the linear regression model, drawn over the same
# feature frame the SHAP values were computed on.
shap.summary_plot(
    shap_lr_values,
    dataframe.drop(columns='target'),
    plot_size=(10, 7),
)

### CatBoost

In [None]:
# SHAP summary plot for the CatBoost model, drawn over the same feature frame
# the SHAP values were computed on.
shap.summary_plot(
    shap_ct_values,
    dataframe.drop(columns='target'),
    plot_size=(10, 7),
)

### Random Forest

In [None]:
# SHAP summary plot for the random forest model, drawn over the same feature
# frame the SHAP values were computed on.
shap.summary_plot(
    shap_rf_values,
    dataframe.drop(columns='target'),
    plot_size=(10, 7),
)

### SVR

In [None]:
# SHAP summary plot for the SVR model, drawn over the same feature frame the
# SHAP values were computed on.
shap.summary_plot(
    shap_svr_values,
    dataframe.drop(columns='target'),
    plot_size=(10, 7),
)