In [52]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
FROM_SCRATCH = True

9.1.1 Preparing the data.
We’re using the wine-quality dataset, a numeric tabular dataset containing features that refer to the chemical composition of wines and quality ratings. To make this a simple classification task, we bucket all wines with ratings greater
than five as good, and the rest we label bad. We also normalize all the features.

In [53]:
store = pd.read_csv('input/store.csv')
train = pd.read_csv('input/train.csv',parse_dates=[2])
test = pd.read_csv('input/test.csv',parse_dates=[3])
# fillna in store with 0 has better result than median()
# Aufbereiten der daten
store.fillna(0, inplace=True)
# fill missing values in test with 1
# Aufbereiten der Daten
test.fillna(value = 1, inplace = True)
# merge data with store
# Alles in eine Tabelle
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')


Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.


In [54]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
1,1,4,2015-07-30,5020,546,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
2,1,3,2015-07-29,4782,523,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
3,1,2,2015-07-28,5011,560,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
4,1,1,2015-07-27,6102,612,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0


In [55]:
nQ = 20
train['Class'] = 0
classes = np.arange(0,nQ+1)

#  Quantile für alle Tage
quantile = np.arange(1, nQ) / nQ
store_quan = np.arange(0, nQ+2)
store_quan[0] = -1
store_quan[1] = 1
for index, x in enumerate(quantile):
    store_quan[index+2] = train['Sales'].quantile(x)
store_quan[nQ+1] = train['Sales'].max()

"""
#  Quantil für verkaufsoffene Sonntage
store_df_day = train.loc[(train['DayOfWeek']== 7) & (train['Open'] == 1)]
quantile = np.arange(1, nQ) / nQ
store_quan_Sun = np.arange(0, nQ+2)
store_quan_Sun[0] = -1
store_quan_Sun[1] = 1
for index, x in enumerate(quantile):
    store_quan_Sun[index+2] = store_df_day['Sales'].quantile(x)
store_quan_Sun[nQ+1] = store_df_day['Sales'].max()

# Spezifische Quantile für jeden Tag und jeden Store
store_quan = np.zeros((train['Store'].max(), 7, nQ+2))
for Id in train['Store'].unique():
    store_df = train[train['Store'] == Id]
    for DayOfWeek in  store_df['DayOfWeek'].unique():
        if DayOfWeek != 7:
            store_df_day = store_df.loc[(store_df['DayOfWeek']== DayOfWeek) & (store_df['Open'] == 1)]
            quantile = np.arange(1, nQ) / nQ
            store_quan[Id-1][DayOfWeek-1][0] = -1
            store_quan[Id-1][DayOfWeek-1][1] = 1
            for index, x in enumerate(quantile):
                store_quan[Id-1][DayOfWeek-1][index+2] = store_df_day['Sales'].quantile(x)
            store_quan[Id-1][DayOfWeek-1][nQ+1] = store_df_day['Sales'].max()
        else:
            store_quan[Id-1][DayOfWeek-1] = store_quan_Sun
"""

"\n#  Quantil für verkaufsoffene Sonntage\nstore_df_day = train.loc[(train['DayOfWeek']== 7) & (train['Open'] == 1)]\nquantile = np.arange(1, nQ) / nQ\nstore_quan_Sun = np.arange(0, nQ+2)\nstore_quan_Sun[0] = -1\nstore_quan_Sun[1] = 1\nfor index, x in enumerate(quantile):\n    store_quan_Sun[index+2] = store_df_day['Sales'].quantile(x)\nstore_quan_Sun[nQ+1] = store_df_day['Sales'].max()\n\n# Spezifische Quantile für jeden Tag und jeden Store\nstore_quan = np.zeros((train['Store'].max(), 7, nQ+2))\nfor Id in train['Store'].unique():\n    store_df = train[train['Store'] == Id]\n    for DayOfWeek in  store_df['DayOfWeek'].unique():\n        if DayOfWeek != 7:\n            store_df_day = store_df.loc[(store_df['DayOfWeek']== DayOfWeek) & (store_df['Open'] == 1)]\n            quantile = np.arange(1, nQ) / nQ\n            store_quan[Id-1][DayOfWeek-1][0] = -1\n            store_quan[Id-1][DayOfWeek-1][1] = 1\n            for index, x in enumerate(quantile):\n                store_quan[Id-1][

In [56]:
print(store_quan)

[   -1     1     0     0     0  2937  3727  4233  4649  5028  5387  5744
  6106  6486  6897  7352  7856  8461  9226 10288 12137 41551]


In [57]:
# process train and test
# aufbereiten der daten, neue Spalten und manche werden entfernt
def process(data, isTest = False):
    # label encode some features
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    # buchstaben zu zahlen
    data.StoreType.replace(mappings, inplace=True)
    data.Assortment.replace(mappings, inplace=True)
    data.StateHoliday.replace(mappings, inplace=True)

    # extract some features from date column
    data['Month'] = data.Date.dt.month
    data['Year'] = data.Date.dt.year
    data['Day'] = data.Date.dt.day
    data['WeekOfYear'] = data.Date.dt.weekofyear

    # calculate competiter open time in months
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \
        (data.Month - data.CompetitionOpenSinceMonth)
    data['CompetitionOpen'] = data['CompetitionOpen'].apply(lambda x: x if x > 0 else 0)

    # calculate promo2 open time in months
    data['PromoOpen'] = 12 * (data.Year - data.Promo2SinceYear) + \
        (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    data['PromoOpen'] = data['PromoOpen'].apply(lambda x: x if x > 0 else 0)

    # Indicate whether the month is in promo interval
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
             7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    data['month_str'] = data.Month.map(month2str)

    def check(row):
        if isinstance(row['PromoInterval'],str) and row['month_str'] in row['PromoInterval']:
            return 1
        else:
            return 0

    data['IsPromoMonth'] =  data.apply(lambda row: check(row),axis=1)

    # select the features we need
    features = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month', 'Day',
       'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']
    if not isTest:
        features.append('Sales')

    data = data[features]
    return data

train = train.sort_values(['Date'],ascending = False)
train = process(train)
test = process(test,isTest = True)

Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead.
Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead.


In [58]:
test.head()

Unnamed: 0,Store,DayOfWeek,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,IsPromoMonth
0,1,4,1,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,2015,9,17,38,84.0,24189.5,0
1,1,3,1,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,2015,9,16,38,84.0,24189.5,0
2,1,2,1,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,2015,9,15,38,84.0,24189.5,0
3,1,1,1,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,2015,9,14,38,84.0,24189.5,0
4,1,7,0,0,0,3,1,1270.0,9.0,2008.0,0,0.0,0.0,2015,9,13,37,84.0,24189.25,0


In [59]:
train.head()

Unnamed: 0,Store,DayOfWeek,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,...,Promo2SinceWeek,Promo2SinceYear,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,IsPromoMonth,Sales
0,1,5,1,0,1,3,1,1270.0,9.0,2008.0,...,0.0,0.0,2015,7,31,31,82.0,24187.75,0,5263
679364,747,5,1,0,1,3,3,45740.0,8.0,2008.0,...,0.0,0.0,2015,7,31,31,83.0,24187.75,0,10708
702362,772,5,1,0,1,4,3,1850.0,0.0,0.0,...,0.0,0.0,2015,7,31,31,24187.0,24187.75,0,5224
683890,752,5,1,0,1,1,1,970.0,3.0,2013.0,...,31.0,2013.0,2015,7,31,31,28.0,24.0,0,7763
17714,20,5,1,0,0,4,1,2340.0,5.0,2009.0,...,40.0,2014.0,2015,7,31,31,74.0,9.75,1,9593


In [60]:
featuresR  = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month', 'Day',
       'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']
train_data = train[featuresR].to_numpy()
test_data = test[featuresR].to_numpy()
labels_train = train[['Sales']].to_numpy()

X_trainR, X_testR, y_trainR, y_testR = train_test_split(train_data, labels_train, random_state=0)
X_trainR, X_testR = X_trainR.astype('float32'), X_testR.astype('float32')
y_train_labR, y_test_labR = y_trainR[:, 0], y_testR[:, 0]
y_trainR, y_testR = y_trainR[:, 1:].astype('float32'), y_testR[:, 1:].astype('float32')
scalerR = StandardScaler()
scalerR.fit(X_trainR)
category_map = {1: ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], 2:["PromoNo", "PromoYes"], 3: ["NoStateHoliday", "PublicHoliday", "EasterHoliday", "ChristmasHoliday"],
                4:["SchoolHolidayNo", "SchoolHolidayYes"], 5: ["StoreTypeA", "StoreTypeB", "StoreTypeC", "StoreTypeD", "StoreTypeE"], 6:["Basic", "Extra", "Extended"], 8:["None","Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], 10: ["NoPromo2", "Promo2"], 14: ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], 19: ["NoPromoMonth", "PromoMonth"] }

Select good wine instance
We partition the dataset into good and bad portions and select an instance of interest. I’ve chosen it to be a good quality
wine.
Note that bad wines are class 1 and correspond to the second model output being high, whereas good wines are class
0 and correspond to the first model output being high.

In [61]:
# bad_days = np.array([a for a, b in zip(X_trainR, y_trainR) if b[1] == 1])
# good_days = np.array([a for a, b in zip(X_trainR, y_trainR) if b[1] == 0])
xR = np.array([[747,5,1,0,1,3,3,45740.0,8.0,2008.0,1,0.0,0.0,2015,7,31,31,83.0,24187.75,0]])

9.1.2 Training models
Creating an Autoencoder
For some of the explainers, we need an autoencoder to check whether example instances are close to the training data
distribution or not.

Random Forest Model
We need a tree-based model to get results for the tree SHAP explainer. Hence we train a random forest on the winequality dataset.

XGBoost Model

In [62]:
# define eval metrics
# Mittleres Abweichungsquadrat
from sklearn.metrics import mean_squared_error
def rmspe(y, yhat):
    return np.sqrt(np.mean((yhat/y-1) ** 2))
# expm1 ist umkehr von log1p
def rmspe_xg(yhat, y):
    yhat = yhat
    y = y
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y,yhat)
def rmse(ytest, y):
    return np.sqrt(mean_squared_error(ytest, y))

In [63]:
y_train_xgb = y_train_labR
y_test_xgb = y_test_labR

In [64]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

def make_xgb_modelR():
    params = {"objective": "reg:linear", # for linear regression
              "booster" : "gbtree",   # use tree based models
              "eta": 0.03,   # learning rate
              "max_depth": 10,    # maximum depth of a tree
              "subsample": 0.9,    # Subsample ratio of the training instances
              "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree
              "silent": 1,   # silent mode
              "seed": 10,   # Random number seed
              "gpu_id": 0,
              "tree_method": "gpu_hist",
              # "eval_metric": "rmse"
              }
    # anzahl trainingsrunden
    num_boost_round = 5000

    dtrain = xgb.DMatrix(X_trainR, y_train_xgb)
    dtest = xgb.DMatrix(X_testR, y_test_xgb)
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    # train the xgboost model
    model = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
      early_stopping_rounds= 1000, verbose_eval=True)
    y_predR = model.predict(xgb.DMatrix(X_testR))
    print(y_predR)
    print('accuracy_score:', rmse(y_predR, y_test_xgb))
    # print('f1_score:', f1_score(y_predR, y_test_xgb))

    return model

Tensorflow Model
Finally, we also train a TensorFlow model.


In [65]:
# Modell wird schon passend gespeichert @Bene :)
import os.path

if FROM_SCRATCH or not os.path.isfile('modelXGBR.json'):
    modelXGBR = make_xgb_modelR()
    modelXGBR.save_model('modelXGBR.json')
else:
    modelXGBR = xgb.Booster()
    modelXGBR.load_model('modelXGBR.json')

Parameters: { "silent" } are not used.

[0]	train-rmse:6783.71780	eval-rmse:6776.07166
[1]	train-rmse:6602.53123	eval-rmse:6594.85779
[2]	train-rmse:6430.81356	eval-rmse:6423.14357
[3]	train-rmse:6261.15897	eval-rmse:6253.53184
[4]	train-rmse:6111.68577	eval-rmse:6104.08319
[5]	train-rmse:5955.24076	eval-rmse:5947.62357
[6]	train-rmse:5804.55756	eval-rmse:5796.90853
[7]	train-rmse:5655.51778	eval-rmse:5647.99697
[8]	train-rmse:5512.68120	eval-rmse:5505.09666
[9]	train-rmse:5376.48564	eval-rmse:5368.93140
[10]	train-rmse:5244.67746	eval-rmse:5237.26217
[11]	train-rmse:5116.35312	eval-rmse:5109.05027
[12]	train-rmse:4990.45318	eval-rmse:4983.26423
[13]	train-rmse:4880.38357	eval-rmse:4873.43848
[14]	train-rmse:4765.83664	eval-rmse:4758.94962
[15]	train-rmse:4655.93718	eval-rmse:4649.02198
[16]	train-rmse:4549.28470	eval-rmse:4542.44135
[17]	train-rmse:4446.80478	eval-rmse:4440.01510
[18]	train-rmse:4346.83566	eval-rmse:4340.25912
[19]	train-rmse:4251.32388	eval-rmse:4244.76147
[20]	train

In [66]:
#y_pred = np.rint(modelXGBR.predict(xgb.DMatrix(X_testR)))
y_pred = modelXGBR.predict(xgb.DMatrix(X_testR))
print(y_test_xgb)
print(y_pred)
error = rmse(y_test_xgb, y_pred)
print('RMSE: {:.4f}'.format(error))
error = rmspe(np.expm1(y_test_xgb), np.expm1(y_pred))
print('RMSPE: {:.4f}'.format(error))

[6331 4247 3536 ... 7728 8646    0]
[6135.7407   4420.1167   3781.1506   ... 7722.087    8793.394
  -47.438404]
RMSE: 613.1185
RMSPE: nan


overflow encountered in expm1
divide by zero encountered in true_divide
invalid value encountered in true_divide


Load/Make models
We save and load the same models each time to ensure stable results. If they don’t exist we create new ones. If you
want to generate new models on each notebook run, then set FROM_SCRATCH=True.

In [67]:
"""
import shap

dtrain = xgb.DMatrix(X_trainR, y_train_xgb)
modelXGBR.set_param({"predictor": "gpu_predictor"})
shap_values = modelXGBR.predict(dtrain, pred_contribs=True)
shap_interaction_values = modelXGBR.predict(dtrain, pred_interactions=True)

modelXGBR.set_param({"predictor": "gpu_predictor"})
explainer = shap.TreeExplainer(modelXGBR)
shap_values = explainer.shap_values(X_trainR)
shap.summary_plot(shap_values, X_trainR,max_display= 10, title = 'SHAP', plot_type= 'bar')
"""

'\nimport shap\n\ndtrain = xgb.DMatrix(X_trainR, y_train_xgb)\nmodelXGBR.set_param({"predictor": "gpu_predictor"})\nshap_values = modelXGBR.predict(dtrain, pred_contribs=True)\nshap_interaction_values = modelXGBR.predict(dtrain, pred_interactions=True)\n\nmodelXGBR.set_param({"predictor": "gpu_predictor"})\nexplainer = shap.TreeExplainer(modelXGBR)\nshap_values = explainer.shap_values(X_trainR)\nshap.summary_plot(shap_values, X_trainR,max_display= 10, title = \'SHAP\', plot_type= \'bar\')\n'

9.1.3 Util functions
These are utility functions for exploring results. The first shows two instances of the data side by side and compares
the difference. We’ll use this to see how the counterfactuals differ from their original instances. The second function
plots the importance of each feature. This will be useful for visualizing the attribution methods

In [68]:
"""
def compare_instances(x, cf):

    #Show the difference in values between two instances.

    x = x.astype('float64')
    cf = cf.astype('float64')
    for f, v1, v2 in zip(features, x[0], cf[0]):
        print(f'{f:<25} instance: {round(v1, 3):^10} counter factual: {round(v2, 3):^10} difference: {round(v2, 7):^5}')

def plot_importance(feat_imp, feat_names, class_idx, **kwargs):

    #Create a horizontal barchart of feature effects, sorted by their magnitude.

    df = pd.DataFrame(data=feat_imp, columns=feat_names).sort_values(by=0, axis='columns')
    feat_imp, feat_names = df.values[0], df.columns
    fig, ax = plt.subplots(figsize=(10, 5))
    y_pos = np.arange(len(feat_imp))
    ax.barh(y_pos, feat_imp)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(feat_names, fontsize=15)
    ax.invert_yaxis()
    ax.set_xlabel(f'Feature effects for class {class_idx}', fontsize=15)
    return ax, fig
"""

"\ndef compare_instances(x, cf):\n\n    #Show the difference in values between two instances.\n\n    x = x.astype('float64')\n    cf = cf.astype('float64')\n    for f, v1, v2 in zip(features, x[0], cf[0]):\n        print(f'{f:<25} instance: {round(v1, 3):^10} counter factual: {round(v2, 3):^10} difference: {round(v2, 7):^5}')\n\ndef plot_importance(feat_imp, feat_names, class_idx, **kwargs):\n\n    #Create a horizontal barchart of feature effects, sorted by their magnitude.\n\n    df = pd.DataFrame(data=feat_imp, columns=feat_names).sort_values(by=0, axis='columns')\n    feat_imp, feat_names = df.values[0], df.columns\n    fig, ax = plt.subplots(figsize=(10, 5))\n    y_pos = np.arange(len(feat_imp))\n    ax.barh(y_pos, feat_imp)\n    ax.set_yticks(y_pos)\n    ax.set_yticklabels(feat_names, fontsize=15)\n    ax.invert_yaxis()\n    ax.set_xlabel(f'Feature effects for class {class_idx}', fontsize=15)\n    return ax, fig\n"

9.1.5 Local Necessary Features
Anchors
Anchors tell us what features need to stay the same for a specific instance for the model to give the same classification.
In the case of a trained image classification model, an anchor for a given instance would be a minimal subset of the
image that the model uses to make its decision.
Here we apply Anchors to the tensor flow model trained on the wine-quality dataset.

In [69]:
""""
from alibi.explainers import AnchorTabular
predict_fnR = lambda x: modelR.predict(scalerR.transform(x))
explainerR = AnchorTabular(predict_fnR, featuresR, categorical_names=category_map)
explainerR.fit(X_trainR, disc_perc=(25, 50, 75))
resultR = explainerR.explain(xR, threshold=0.95)
"""

'"\nfrom alibi.explainers import AnchorTabular\npredict_fnR = lambda x: modelR.predict(scalerR.transform(x))\nexplainerR = AnchorTabular(predict_fnR, featuresR, categorical_names=category_map)\nexplainerR.fit(X_trainR, disc_perc=(25, 50, 75))\nresultR = explainerR.explain(xR, threshold=0.95)\n'

In [70]:
category_map = { 1: ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], 2:["PromoNo", "PromoYes"], 3: ["NoStateHoliday", "PublicHoliday", "EasterHoliday", "ChristmasHoliday"],
                4:["SchoolHolidayNo", "SchoolHolidayYes"], 5: ["?", "StoreTypeA", "StoreTypeB", "StoreTypeC", "StoreTypeD"], 6:[ "?", "Basic", "Extra", "Extended"], 8:["None","Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], 10: ["NoPromo2", "Promo2"], 14: ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], 19: ["NoPromoMonth", "PromoMonth"]}

In [71]:
category_map = { 0:["PromoNo", "PromoYes"], 3: ["NoStateHoliday", "PublicHoliday", "EasterHoliday", "ChristmasHoliday"],
                4:["SchoolHolidayNo", "SchoolHolidayYes"], 5: ["?", "StoreTypeA", "StoreTypeB", "StoreTypeC", "StoreTypeD"], 6:[ "?", "Basic", "Extra", "Extended"], 8:["None","Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], 10: ["NoPromo2", "Promo2"], 14: ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], 19: ["NoPromoMonth", "PromoMonth"]}
relevant_map = [False, False, True, True, True, True, True, True, False, False, True, False, False, False, False, False, False, True, True, True]

In [72]:
X_trainR

array([[8.320000e+02, 5.000000e+00, 1.000000e+00, ..., 2.416800e+04,
        4.900000e+01, 0.000000e+00],
       [4.070000e+02, 1.000000e+00, 0.000000e+00, ..., 1.180000e+02,
        2.900000e+01, 1.000000e+00],
       [3.980000e+02, 7.000000e+00, 0.000000e+00, ..., 2.417300e+04,
        2.875000e+01, 0.000000e+00],
       ...,
       [5.850000e+02, 5.000000e+00, 1.000000e+00, ..., 2.000000e+00,
        2.417375e+04, 0.000000e+00],
       [7.420000e+02, 6.000000e+00, 0.000000e+00, ..., 2.417800e+04,
        2.417850e+04, 0.000000e+00],
       [1.043000e+03, 2.000000e+00, 0.000000e+00, ..., 8.200000e+01,
        2.415725e+04, 0.000000e+00]], dtype=float32)

In [73]:
"""
def bin(input_array, data):
    output_array = np.zeros(input_array.shape)
    for i in range(len(input_array)):
        Id = data[i][0]
        Day = data[i][1]
        print(Day)
        print(Id)
        print(input_array[i])
        print(store_quan[np.rint(Id)][np.rint(Day)])
        output_array[i] = np.searchsorted(store_quan[np.rint(Id)][np.rint(Day)], input_array[i])
    return output_array
"""

'\ndef bin(input_array, data):\n    output_array = np.zeros(input_array.shape)\n    for i in range(len(input_array)):\n        Id = data[i][0]\n        Day = data[i][1]\n        print(Day)\n        print(Id)\n        print(input_array[i])\n        print(store_quan[np.rint(Id)][np.rint(Day)])\n        output_array[i] = np.searchsorted(store_quan[np.rint(Id)][np.rint(Day)], input_array[i])\n    return output_array\n'

In [74]:
"""
# Array mit 21 boolean-Werten, die angibt, ob eine Spalte behalten werden soll oder nicht
column_filter = np.array([False, False, True, True, True, True, True, True, False, False, True, False, False, False, False, False, False, True, True, True])

# Die Spalten auswählen, die behalten werden sollen
X_trainR = X_trainR[:, column_filter]

featuresR = [ 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance', 'Promo2',
       'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']

"""

array([[1.000000e+00, 0.000000e+00, 0.000000e+00, ..., 2.416800e+04,
        4.900000e+01, 0.000000e+00],
       [0.000000e+00, 0.000000e+00, 1.000000e+00, ..., 1.180000e+02,
        2.900000e+01, 1.000000e+00],
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 2.417300e+04,
        2.875000e+01, 0.000000e+00],
       ...,
       [1.000000e+00, 0.000000e+00, 0.000000e+00, ..., 2.000000e+00,
        2.417375e+04, 0.000000e+00],
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 2.417800e+04,
        2.417850e+04, 0.000000e+00],
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 8.200000e+01,
        2.415725e+04, 0.000000e+00]], dtype=float32)

In [76]:
def bin(input_array):
    output_array = np.zeros(input_array.shape)
    for i in range(len(input_array)):
        output_array[i] = np.searchsorted(store_quan, input_array[i])
    return output_array

In [78]:
from alibi.explainers import AnchorTabular
predict_xgb = lambda x: bin(modelXGBR.predict(xgb.DMatrix(x)))
explainerXGB = AnchorTabular(predict_xgb, featuresR)
explainerXGB.fit(X_trainR, disc_perc=(25, 50, 75))
resultXGB = explainerXGB.explain(xR, threshold=0.95)



Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.




In [82]:
print('Anchor =', resultXGB.data['anchor'])
print('Precision = ', resultXGB.data['precision'])
print('Coverage = ', resultXGB.data['coverage'])

Anchor = ['IsPromoMonth > 0.00', 'Promo > 1.00', 'CompetitionOpen > 24163.00', 'CompetitionDistance <= 700.00', 'StateHoliday > 0.00', 'Assortment <= 1.00', 'PromoOpen <= 25.00', 'Promo2 > 1.00', 'StoreType <= 1.00', 'SchoolHoliday > 0.00']
Precision =  0.0
Coverage =  2.8837104440127616e-05


In [84]:
idx = 11
print(explainerXGB.predictor(X_testR[idx].reshape(1, -1))[0])
explanation = explainerXGB.explain(X_testR[idx], relevant_features=relevant_map, threshold=0.8)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)
print('Coverage: %.2f' % explanation.coverage)

11.0


Could not find an anchor satisfying the 0.8 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.


Anchor: CompetitionOpen <= 24163.00 AND PromoOpen <= 24171.75 AND Promo > 0.00 AND CompetitionDistance <= 6880.00 AND IsPromoMonth <= 0.00 AND Assortment <= 1.00 AND StoreType <= 4.00
Precision: 0.04
Coverage: 0.08
