Anchor explanations for income prediction

In [2]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from alibi.explainers import AnchorTabular
from alibi.datasets import fetch_adult

Load adult dataset

In [None]:
adult = fetch_adult()
adult.keys()

In [None]:
data = adult.data
target = adult.target
feature_names = adult.feature_names
category_map = adult.category_map
print(adult)

In [None]:
from alibi.utils import gen_category_map

In [None]:
np.random.seed(0)
data_perm = np.random.permutation(np.c_[data, target])
data = data_perm[:,:-1]
target = data_perm[:,-1]

In [None]:
idx = 30000
X_train,Y_train = data[:idx,:], target[:idx]
X_test, Y_test = data[idx+1:,:], target[idx+1:]

In [None]:
ordinal_features = [x for x in range(len(feature_names)) if x not in list(category_map.keys())]
ordinal_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                      ('scaler', StandardScaler())])

In [None]:
categorical_features = list(category_map.keys())
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                          ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [None]:
preprocessor = ColumnTransformer(transformers=[('num', ordinal_transformer, ordinal_features),
                                               ('cat', categorical_transformer, categorical_features)])
preprocessor.fit(X_train)
print(X_train)

XGBBoost model

In [None]:
# Anwendung des Gelernten auf den Test Datensatz
# jetzt wird der ganze Datensatz verwendet
import xgboost as xgb

dtrain = xgb.DMatrix(preprocessor.transform(X_train), Y_train)
# print(X_test)
# print(preprocessor.transform(X_test))

dtest = xgb.DMatrix(preprocessor.transform(X_test))
# specify parameters via map
params = {"objective": "reg:linear", # for linear regression
          "booster" : "gbtree",   # use tree based models
          "eta": 0.03,   # learning rate
          "max_depth": 10,    # maximum depth of a tree
          "subsample": 0.9,    # Subsample ratio of the training instances
          "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree
          "silent": 1,   # silent mode
          "seed": 10   # Random number seed
          }
num_round = 300
model = xgb.train(params, dtrain, num_round)
# make prediction
preds = model.predict(dtest)
print([round(value) for value in preds])

In [14]:
predict_fn = lambda x: clf.predict(preprocessor.transform(x))
print(predict_fn(X_train))
print(predict_fn(X_test))
print('Train accuracy: ', accuracy_score(Y_train, predict_fn(X_train)))
print('Test accuracy: ', accuracy_score(Y_test, predict_fn(X_test)))

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 1 0]
Train accuracy:  0.9655333333333334
Test accuracy:  0.855859375


Predictor mit XGBoost

In [15]:
predict_fn_xgb = lambda x: (np.rint(model.predict(xgb.DMatrix(preprocessor.transform(x))))).astype(int)
print('Train accuracy: ', accuracy_score(Y_train, predict_fn_xgb(X_train)))
print('Test accuracy: ', accuracy_score(Y_test, predict_fn_xgb(X_test)))

Train accuracy:  0.9135
Test accuracy:  0.87578125


In [139]:
explainer = AnchorTabular(predict_fn, feature_names)

In [140]:
explainer_xgb = AnchorTabular(predict_fn_xgb, feature_names)

In [141]:
explainer.fit(X_train, disc_perc=[25, 50, 75])

AnchorTabular(meta={
  'name': 'AnchorTabular',
  'type': ['blackbox'],
  'explanations': ['local'],
  'params': {'seed': None, 'disc_perc': [25, 50, 75]},
  'version': '0.8.0'}
)

In [142]:
explainer_xgb.fit(X_train, disc_perc=[25, 50, 75])

AnchorTabular(meta={
  'name': 'AnchorTabular',
  'type': ['blackbox'],
  'explanations': ['local'],
  'params': {'seed': None, 'disc_perc': [25, 50, 75]},
  'version': '0.8.0'}
)

In [143]:
idx = 0
class_names = adult.target_names
print(explainer.predictor(X_test[idx].reshape(1, -1))[0])
print('Prediction: ', class_names[explainer.predictor(X_test[idx].reshape(1, -1))[0]])

0
Prediction:  <=50K


In [144]:
idx = 0
class_names = adult.target_names
print(explainer_xgb.predictor(X_test[idx].reshape(1, -1))[0])
print('Prediction: ', class_names[explainer.predictor(X_test[idx].reshape(1, -1))[0]])

0
Prediction:  <=50K


In [145]:
explanation = explainer.explain(X_test[idx], threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)
print('Coverage: %.2f' % explanation.coverage)

Anchor: Marital Status > 0.00 AND Relationship > 1.00
Precision: 0.98
Coverage: 0.28


In [146]:
explanation_xgb = explainer_xgb.explain(X_test[idx], threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation_xgb.anchor)))
print('Precision: %.2f' % explanation_xgb.precision)
print('Coverage: %.2f' % explanation_xgb.coverage)

Anchor: Marital Status > 0.00 AND Relationship > 1.00
Precision: 0.99
Coverage: 0.28


In [147]:
idx = 6
class_names = adult.target_names
print('Prediction: ', class_names[explainer.predictor(X_test[idx].reshape(1, -1))[0]])

explanation = explainer.explain(X_test[idx], threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)
print('Coverage: %.2f' % explanation.coverage)

Prediction:  >50K


Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.


Anchor: Capital Loss > 0.00 AND Relationship <= 0.00 AND Age > 37.00 AND Marital Status <= 0.00 AND Workclass > 4.00 AND Sex > 0.00 AND Occupation <= 6.00
Precision: 0.74
Coverage: 0.00


In [149]:
idx = 6
class_names = adult.target_names
print(explainer_xgb.predictor(X_test[idx].reshape(1, -1))[0])
print('Prediction: ', class_names[explainer_xgb.predictor(X_test[idx].reshape(1, -1))[0]])

explanation_xgb = explainer_xgb.explain(X_test[idx], threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation_xgb.anchor)))
print('Precision: %.2f' % explanation_xgb.precision)
print('Coverage: %.2f' % explanation_xgb.coverage)

1
Prediction:  >50K


Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.


Anchor: Capital Loss > 0.00 AND Relationship <= 0.00 AND Marital Status <= 0.00 AND Age > 47.00 AND Workclass > 4.00 AND Race <= 4.00 AND Country <= 9.00 AND 0.00 < Sex <= 1.00
Precision: 0.76
Coverage: 0.00
