In [40]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
FROM_SCRATCH = False

9.1.1 Preparing the data.
We’re using the wine-quality dataset, a numeric tabular dataset containing features that refer to the chemical composition of wines and quality ratings. To make this a simple classification task, we bucket all wines with ratings greater
than five as good, and the rest we label bad. We also normalize all the features.

In [41]:
store = pd.read_csv('input/store.csv')
train = pd.read_csv('input/train.csv',parse_dates=[2])
test = pd.read_csv('input/test.csv',parse_dates=[3])
# fillna in store with 0 has better result than median()
# Aufbereiten der daten
store.fillna(0, inplace=True)
# fill missing values in test with 1
# Aufbereiten der Daten
test.fillna(value = 1, inplace = True)
# merge data with store
# Alles in eine Tabelle
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')


Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.


In [42]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
1,1,4,2015-07-30,5020,546,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
2,1,3,2015-07-29,4782,523,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
3,1,2,2015-07-28,5011,560,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
4,1,1,2015-07-27,6102,612,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0


In [43]:
train['Class'] = 3

classes = [0, 1, 2, 3, 4, 5, 6]

for Id in train['Store'].unique():
    store_df = train[train['Store'] == Id]
    for DayOfWeek in  store_df['DayOfWeek'].unique():
        if DayOfWeek != 7:
            store_df_day = store_df[store_df['DayOfWeek']== DayOfWeek]
            quantile = np.arange(1, 7) / 7
            store_quan = np.arange(0, 8)
            store_quan[0] = -1
            for index, x in enumerate(quantile):
                store_quan[index+1] = store_df_day['Sales'].quantile(x)
            store_quan[7] = store_df_day['Sales'].max()
            # befüllen der Werte Klasse für bestimmten Tag und bestimmten Store
            # for store in store_df_day:
            store_df_day['Class'] = pd.cut(store_df_day['Sales'], store_quan, labels=classes)
            train.loc[(train['Store'] == Id) & (train['DayOfWeek'] == DayOfWeek), 'Class'] = store_df_day['Class'].to_numpy()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/

In [44]:
train.isna().sum()

Store                        0
DayOfWeek                    0
Date                         0
Sales                        0
Customers                    0
Open                         0
Promo                        0
StateHoliday                 0
SchoolHoliday                0
StoreType                    0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
PromoInterval                0
Class                        0
dtype: int64

In [45]:
# process train and test
# aufbereiten der daten, neue Spalten und manche werden entfernt
def process(data, isTest = False):
    # label encode some features
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    # buchstaben zu zahlen
    data.StoreType.replace(mappings, inplace=True)
    data.Assortment.replace(mappings, inplace=True)
    data.StateHoliday.replace(mappings, inplace=True)

    # extract some features from date column
    data['Month'] = data.Date.dt.month
    data['Year'] = data.Date.dt.year
    data['Day'] = data.Date.dt.day
    data['WeekOfYear'] = data.Date.dt.weekofyear

    # calculate competiter open time in months
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \
        (data.Month - data.CompetitionOpenSinceMonth)
    data['CompetitionOpen'] = data['CompetitionOpen'].apply(lambda x: x if x > 0 else 0)

    # calculate promo2 open time in months
    data['PromoOpen'] = 12 * (data.Year - data.Promo2SinceYear) + \
        (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    data['PromoOpen'] = data['PromoOpen'].apply(lambda x: x if x > 0 else 0)

    # Indicate whether the month is in promo interval
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
             7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    data['month_str'] = data.Month.map(month2str)

    def check(row):
        if isinstance(row['PromoInterval'],str) and row['month_str'] in row['PromoInterval']:
            return 1
        else:
            return 0

    data['IsPromoMonth'] =  data.apply(lambda row: check(row),axis=1)

    # select the features we need
    features = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month', 'Day',
       'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']
    if not isTest:
        features.append('Sales')
        features.append('Class')

    data = data[features]
    return data

train = train.sort_values(['Date'],ascending = False)
train = process(train)
test = process(test,isTest = True)

Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead.
Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead.


In [46]:
test.head()

Unnamed: 0,Store,DayOfWeek,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,IsPromoMonth
0,1,4,1,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,2015,9,17,38,84.0,24189.5,0
1,1,3,1,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,2015,9,16,38,84.0,24189.5,0
2,1,2,1,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,2015,9,15,38,84.0,24189.5,0
3,1,1,1,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,2015,9,14,38,84.0,24189.5,0
4,1,7,0,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,2015,9,13,37,84.0,24189.25,0


In [47]:
train.head()

Unnamed: 0,Store,DayOfWeek,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,...,Promo2SinceYear,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,IsPromoMonth,Sales,Class
0,1,5,1,0,1,3,1,1270.0,9.0,2008.0,...,0.0,2015,7,31,31,82.0,24187.75,0,5263,5
679364,747,5,1,0,1,3,3,45740.0,8.0,2008.0,...,0.0,2015,7,31,31,83.0,24187.75,0,10708,6
702362,772,5,1,0,1,4,3,1850.0,0.0,0.0,...,0.0,2015,7,31,31,24187.0,24187.75,0,5224,6
683890,752,5,1,0,1,1,1,970.0,3.0,2013.0,...,2013.0,2015,7,31,31,28.0,24.0,0,7763,6
17714,20,5,1,0,0,4,1,2340.0,5.0,2009.0,...,2014.0,2015,7,31,31,74.0,9.75,1,9593,6


In [48]:
featuresR  = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month', 'Day',
       'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']
train_data = train[featuresR].to_numpy()
test_data = test[featuresR].to_numpy()
labels_train = train[['Class']].to_numpy()

X_trainR, X_testR, y_trainR, y_testR = train_test_split(train_data, labels_train, random_state=0)
X_trainR, X_testR = X_trainR.astype('float32'), X_testR.astype('float32')
y_train_labR, y_test_labR = y_trainR[:, 0], y_testR[:, 0]
y_trainR, y_testR = y_trainR[:, 1:].astype('float32'), y_testR[:, 1:].astype('float32')
scalerR = StandardScaler()
scalerR.fit(X_trainR)
category_map = {1: ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], 2:["PromoNo", "PromoYes"], 3: ["NoStateHoliday", "PublicHoliday", "EasterHoliday", "ChristmasHoliday"],
                4:["SchoolHolidayNo", "SchoolHolidayYes"], 5: ["StoreTypeA", "StoreTypeB", "StoreTypeC", "StoreTypeD", "StoreTypeE"], 6:["Basic", "Extra", "Extended"], 8:["None","Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], 10: ["NoPromo2", "Promo2"], 14: ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], 19: ["NoPromoMonth", "PromoMonth"] }

In [49]:

column_values = set(train['CompetitionOpenSinceMonth'])
print(column_values)

{0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}


Select good wine instance
We partition the dataset into good and bad portions and select an instance of interest. I’ve chosen it to be a good quality
wine.
Note that bad wines are class 1 and correspond to the second model output being high, whereas good wines are class
0 and correspond to the first model output being high.

In [50]:
# bad_days = np.array([a for a, b in zip(X_trainR, y_trainR) if b[1] == 1])
# good_days = np.array([a for a, b in zip(X_trainR, y_trainR) if b[1] == 0])
xR = np.array([[747,5,1,0,1,3,3,45740.0,8.0,2008.0,1,0.0,0.0,2015,7,31,31,83.0,24187.75,0]])

9.1.2 Training models
Creating an Autoencoder
For some of the explainers, we need an autoencoder to check whether example instances are close to the training data
distribution or not.

Random Forest Model
We need a tree-based model to get results for the tree SHAP explainer. Hence we train a random forest on the winequality dataset.

XGBoost Model

In [51]:
# define eval metrics
# Mittleres Abweichungsquadrat
from sklearn.metrics import mean_squared_error
def rmspe(y, yhat):
    return np.sqrt(np.mean((yhat/y-1) ** 2))
# expm1 ist umkehr von log1p
def rmspe_xg(yhat, y):
    yhat = yhat
    y = y
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y,yhat)
def rmse(ytest, y):
    return np.sqrt(mean_squared_error(ytest, y))

In [52]:
y_train_xgb = y_train_labR
y_test_xgb = y_test_labR

In [53]:
from sklearn.metrics import accuracy_score
import xgboost as xgb

def make_xgb_modelR():
    params = {"objective": "multi:softmax", # for linear regression
              "booster" : "gbtree",   # use tree based models
              "eta": 0.05,   # learning rate
              "max_depth": 10,    # maximum depth of a tree
              "subsample": 0.9,    # Subsample ratio of the training instances
              "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree
              "n_estimators": 1000,
              "silent": 1,   # silent mode
              "seed": 10,   # Random number seed
              "num_class": 7
              #"gpu_id": 0,
              #"tree_method": "gpu_hist"
              }
    # anzahl trainingsrunden
    num_boost_round = 10

    dtrain = xgb.DMatrix(X_trainR, y_train_xgb)
    dtest = xgb.DMatrix(X_testR, y_test_xgb)
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    # train the xgboost model
    model = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
      early_stopping_rounds= 10, verbose_eval=True)
    y_predR = np.rint(model.predict(xgb.DMatrix(X_testR)))
    print(y_predR)
    print('accuracy_score:', accuracy_score(y_predR, y_test_xgb))
    # print('f1_score:', f1_score(y_predR, y_test_xgb))

    return model

In [54]:
"""
import xgboost as xgb

def make_xgb_modelR():
    params = {"objective": "reg:linear", # for linear regression
              "booster" : "gbtree",   # use tree based models
              "eta": 0.05,   # learning rate
              "max_depth": 15,    # maximum depth of a tree
              "subsample": 0.9,    # Subsample ratio of the training instances
              "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree
              "n_estimators": 100,
              "silent": 1,   # silent mode
              "seed": 10,   # Random number seed
              #"num_class": 7
              #"gpu_id": 0,
              #"tree_method": "gpu_hist"
              }
    # anzahl trainingsrunden
    num_boost_round = 100

    dtrain = xgb.DMatrix(X_trainR, y_train_xgb)
    dtest = xgb.DMatrix(X_testR, y_test_xgb)
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    # train the xgboost model
    model = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
      early_stopping_rounds= 100, verbose_eval=True)
    y_predR = np.rint(model.predict(xgb.DMatrix(X_testR)))
    print(y_predR)
    print('accuracy_score:', accuracy_score(y_predR, y_test_xgb))
    # print('f1_score:', f1_score(y_predR, y_test_xgb))

    return model
"""

'\nimport xgboost as xgb\n\ndef make_xgb_modelR():\n    params = {"objective": "reg:linear", # for linear regression\n              "booster" : "gbtree",   # use tree based models\n              "eta": 0.05,   # learning rate\n              "max_depth": 15,    # maximum depth of a tree\n              "subsample": 0.9,    # Subsample ratio of the training instances\n              "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree\n              "n_estimators": 100,\n              "silent": 1,   # silent mode\n              "seed": 10,   # Random number seed\n              #"num_class": 7\n              #"gpu_id": 0,\n              #"tree_method": "gpu_hist"\n              }\n    # anzahl trainingsrunden\n    num_boost_round = 100\n\n    dtrain = xgb.DMatrix(X_trainR, y_train_xgb)\n    dtest = xgb.DMatrix(X_testR, y_test_xgb)\n    watchlist = [(dtrain, \'train\'), (dtest, \'eval\')]\n    # train the xgboost model\n    model = xgb.train(params, dtrain, num

Tensorflow Model
Finally, we also train a TensorFlow model.


In [55]:
# Modell wird schon passend gespeichert @Bene :)
import os.path

if FROM_SCRATCH or not os.path.isfile('modelXGBR.json'):
    modelXGBR = make_xgb_modelR()
    modelXGBR.save_model('modelXGBR.json')
else:
    modelXGBR = xgb.Booster()
    modelXGBR.load_model('modelXGBR.json')

In [56]:
#modelXGBR = make_xgb_modelR()
#modelXGBR.save_model("modelXGBR.json")

In [57]:
y_pred = np.rint(modelXGBR.predict(xgb.DMatrix(X_testR)))
print(y_test_xgb)
print(y_pred)
error = rmse(y_test_xgb, y_pred)
print('RMSE: {:.4f}'.format(error))

[2 2 1 ... 2 3 3]
[2. 3. 1. ... 1. 4. 3.]
RMSE: 1.1924


Load/Make models
We save and load the same models each time to ensure stable results. If they don’t exist we create new ones. If you
want to generate new models on each notebook run, then set FROM_SCRATCH=True.

In [58]:
"""
import shap

dtrain = xgb.DMatrix(X_trainR, y_train_xgb)
modelXGBR.set_param({"predictor": "gpu_predictor"})
shap_values = modelXGBR.predict(dtrain, pred_contribs=True)
shap_interaction_values = modelXGBR.predict(dtrain, pred_interactions=True)

modelXGBR.set_param({"predictor": "gpu_predictor"})
explainer = shap.TreeExplainer(modelXGBR)
shap_values = explainer.shap_values(X_trainR)
shap.summary_plot(shap_values, X_trainR,max_display= 10, title = 'SHAP', plot_type= 'bar')
"""

'\nimport shap\n\ndtrain = xgb.DMatrix(X_trainR, y_train_xgb)\nmodelXGBR.set_param({"predictor": "gpu_predictor"})\nshap_values = modelXGBR.predict(dtrain, pred_contribs=True)\nshap_interaction_values = modelXGBR.predict(dtrain, pred_interactions=True)\n\nmodelXGBR.set_param({"predictor": "gpu_predictor"})\nexplainer = shap.TreeExplainer(modelXGBR)\nshap_values = explainer.shap_values(X_trainR)\nshap.summary_plot(shap_values, X_trainR,max_display= 10, title = \'SHAP\', plot_type= \'bar\')\n'

9.1.3 Util functions
These are utility functions for exploring results. The first shows two instances of the data side by side and compares
the difference. We’ll use this to see how the counterfactuals differ from their original instances. The second function
plots the importance of each feature. This will be useful for visualizing the attribution methods

In [59]:
"""
def compare_instances(x, cf):

    #Show the difference in values between two instances.

    x = x.astype('float64')
    cf = cf.astype('float64')
    for f, v1, v2 in zip(features, x[0], cf[0]):
        print(f'{f:<25} instance: {round(v1, 3):^10} counter factual: {round(v2, 3):^10} difference: {round(v2, 7):^5}')

def plot_importance(feat_imp, feat_names, class_idx, **kwargs):

    #Create a horizontal barchart of feature effects, sorted by their magnitude.

    df = pd.DataFrame(data=feat_imp, columns=feat_names).sort_values(by=0, axis='columns')
    feat_imp, feat_names = df.values[0], df.columns
    fig, ax = plt.subplots(figsize=(10, 5))
    y_pos = np.arange(len(feat_imp))
    ax.barh(y_pos, feat_imp)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(feat_names, fontsize=15)
    ax.invert_yaxis()
    ax.set_xlabel(f'Feature effects for class {class_idx}', fontsize=15)
    return ax, fig
"""

"\ndef compare_instances(x, cf):\n\n    #Show the difference in values between two instances.\n\n    x = x.astype('float64')\n    cf = cf.astype('float64')\n    for f, v1, v2 in zip(features, x[0], cf[0]):\n        print(f'{f:<25} instance: {round(v1, 3):^10} counter factual: {round(v2, 3):^10} difference: {round(v2, 7):^5}')\n\ndef plot_importance(feat_imp, feat_names, class_idx, **kwargs):\n\n    #Create a horizontal barchart of feature effects, sorted by their magnitude.\n\n    df = pd.DataFrame(data=feat_imp, columns=feat_names).sort_values(by=0, axis='columns')\n    feat_imp, feat_names = df.values[0], df.columns\n    fig, ax = plt.subplots(figsize=(10, 5))\n    y_pos = np.arange(len(feat_imp))\n    ax.barh(y_pos, feat_imp)\n    ax.set_yticks(y_pos)\n    ax.set_yticklabels(feat_names, fontsize=15)\n    ax.invert_yaxis()\n    ax.set_xlabel(f'Feature effects for class {class_idx}', fontsize=15)\n    return ax, fig\n"

9.1.5 Local Necessary Features
Anchors
Anchors tell us what features need to stay the same for a specific instance for the model to give the same classification.
In the case of a trained image classification model, an anchor for a given instance would be a minimal subset of the
image that the model uses to make its decision.
Here we apply Anchors to the tensor flow model trained on the wine-quality dataset.

In [60]:
""""
from alibi.explainers import AnchorTabular
predict_fnR = lambda x: modelR.predict(scalerR.transform(x))
explainerR = AnchorTabular(predict_fnR, featuresR, categorical_names=category_map)
explainerR.fit(X_trainR, disc_perc=(25, 50, 75))
resultR = explainerR.explain(xR, threshold=0.95)
"""

'"\nfrom alibi.explainers import AnchorTabular\npredict_fnR = lambda x: modelR.predict(scalerR.transform(x))\nexplainerR = AnchorTabular(predict_fnR, featuresR, categorical_names=category_map)\nexplainerR.fit(X_trainR, disc_perc=(25, 50, 75))\nresultR = explainerR.explain(xR, threshold=0.95)\n'

In [61]:
category_map = {1: ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], 2:["PromoNo", "PromoYes"], 3: ["NoStateHoliday", "PublicHoliday", "EasterHoliday", "ChristmasHoliday"],
                4:["SchoolHolidayNo", "SchoolHolidayYes"], 5: ["StoreTypeA", "StoreTypeB", "StoreTypeC", "StoreTypeD"], 6:[ "?", "Basic", "Extra", "Extended"], 8:["None","Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], 10: ["NoPromo2", "Promo2"], 14: ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], 19: ["NoPromoMonth", "PromoMonth"] }

In [62]:
category_map = { 1: ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], 2:["PromoNo", "PromoYes"], 3: ["NoStateHoliday", "PublicHoliday", "EasterHoliday", "ChristmasHoliday"],
                4:["SchoolHolidayNo", "SchoolHolidayYes"], 5: ["?", "StoreTypeA", "StoreTypeB", "StoreTypeC", "StoreTypeD"], 6:[ "?", "Basic", "Extra", "Extended"], 8:["None","Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], 10: ["NoPromo2", "Promo2"], 14: ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], 19: ["NoPromoMonth", "PromoMonth"]}

In [63]:
from alibi.explainers import AnchorTabular
predict_xgb = lambda x: np.rint(modelXGBR.predict(xgb.DMatrix(x)))
explainerXGB = AnchorTabular(predict_xgb, featuresR, categorical_names=category_map)
explainerXGB.fit(X_trainR, disc_perc=(25, 50, 75))
resultXGB = explainerXGB.explain(xR, threshold=0.95)

In [64]:
"""
print('Anchor =', resultR.data['anchor'])
print('Precision = ', resultR.data['precision'])
print('Coverage = ', resultR.data['coverage'])
"""

"\nprint('Anchor =', resultR.data['anchor'])\nprint('Precision = ', resultR.data['precision'])\nprint('Coverage = ', resultR.data['coverage'])\n"

In [65]:
print('Anchor =', resultXGB.data['anchor'])
print('Precision = ', resultXGB.data['precision'])
print('Coverage = ', resultXGB.data['coverage'])

Anchor = ['Day > 23.00', 'Promo = PromoYes', 'Month = Aug', 'DayOfWeek = Saturday', 'CompetitionOpenSinceMonth = Aug']
Precision =  0.9768518518518519
Coverage =  4.063410171108892e-05


In [66]:
"""
idx  = 11
print(explainerR.predictor(X_testR[idx].reshape(1, -1))[0])
explanation = explainerR.explain(X_testR[idx], threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)
print('Coverage: %.2f' % explanation.coverage)
"""

"\nidx  = 11\nprint(explainerR.predictor(X_testR[idx].reshape(1, -1))[0])\nexplanation = explainerR.explain(X_testR[idx], threshold=0.95)\nprint('Anchor: %s' % (' AND '.join(explanation.anchor)))\nprint('Precision: %.2f' % explanation.precision)\nprint('Coverage: %.2f' % explanation.coverage)\n"

In [67]:
print('Anchor =', resultXGB.data['anchor'])
print('Precision = ', resultXGB.data['precision'])
print('Coverage = ', resultXGB.data['coverage'])

Anchor = ['Day > 23.00', 'Promo = PromoYes', 'Month = Aug', 'DayOfWeek = Saturday', 'CompetitionOpenSinceMonth = Aug']
Precision =  0.9768518518518519
Coverage =  4.063410171108892e-05


In [68]:
"""
idx  = 11
print(explainerR.predictor(X_testR[idx].reshape(1, -1))[0])
explanation = explainerR.explain(X_testR[idx], threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)
print('Coverage: %.2f' % explanation.coverage)
"""

"\nidx  = 11\nprint(explainerR.predictor(X_testR[idx].reshape(1, -1))[0])\nexplanation = explainerR.explain(X_testR[idx], threshold=0.95)\nprint('Anchor: %s' % (' AND '.join(explanation.anchor)))\nprint('Precision: %.2f' % explanation.precision)\nprint('Coverage: %.2f' % explanation.coverage)\n"

In [69]:
idx = 11
print(explainerXGB.predictor(X_testR[idx].reshape(1, -1))[0])
explanation = explainerXGB.explain(X_testR[idx], threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)
print('Coverage: %.2f' % explanation.coverage)

2.0
Anchor: Promo = PromoNo AND DayOfWeek = Wednesday AND Day <= 16.00 AND Year > 2014.00 AND CompetitionOpen > 24163.00 AND WeekOfYear <= 35.00 AND PromoOpen > 24171.75 AND Month = Mar AND SchoolHoliday = SchoolHolidayNo AND StateHoliday = NoStateHoliday AND StoreType = StoreTypeD
Precision: 0.97
Coverage: 0.00
