In [None]:
from pprint import pprint

import scipy.stats
import numpy as np
import pandas as pd

from unfooling.pipeline import evaluate_detector
from unfooling.pipeline import generate_explanations
from unfooling.pipeline import load_experiment_and_data
from unfooling.pipeline import compute_metrics

# Configuration
Define the experiment name, the defense approach, and a few other settings.

In [None]:
class C:  # Config
    """Notebook-wide settings, read as class attributes (e.g. ``C.detector_name``).

    The class itself (not an instance) is handed to the pipeline functions.
    """
    # Name of the experiment to run.
    experiment_name = 'COMPAS'
    # Name of the defense (detector) approach.
    detector_name = 'KNNCAD'
    # NOTE(review): presumably toggles probabilistic detector output -- confirm
    # against unfooling.pipeline.
    detect_proba = False
    # NOTE(review): presumably the held-out test fraction -- confirm against
    # unfooling.pipeline.
    test_size = 0.1
    # NOTE(review): presumably enables a debug mode in the pipeline -- confirm.
    debug = False

# Load Experiment
Load the experiment problem definition and its data.

In [None]:
# Build the experiment object from the config. Later cells read P.X_train,
# P.X_test, P.features and P.problem.biased_features from it.
P = load_experiment_and_data(C)

# Generate Explanations
For each explainer, generate explanations with and without the "fooling" adversarial attack(s).

In [None]:
# Explanations generated without a robustness model. Later cells index the
# result as explainer_data[explainer][task]['explanations'], where the task
# key may be None.
explainer_data = generate_explanations(C, P)

# Defense Hyperparameters

In [None]:
# Select the detector for the evaluation below (same value as the config
# default) together with the hyperparameters handed to evaluate_detector.
C.detector_name = 'KNNCAD'
hparams = {
    'distance_agg': 'max',
    'metric': 'minkowski',
    'epsilon': 0.1,
    'n_neighbors': 15,
    # NOTE(review): presumably the Minkowski order (p=1 would be Manhattan
    # distance under the scikit-learn convention) -- confirm in the detector.
    'p': 1,
    # NOTE(review): presumably "use all cores" as in scikit-learn -- confirm.
    'n_jobs': -1,
}
print(f'Using hparams for {C.detector_name}:')
pprint(hparams)

# Evaluation of the Defense
Here, the defense approach is evaluated on the explainers with and without the adversarial attack(s).

In [None]:
# Sample budget for the evaluation: ten samples per training row.
n_explainer_samples = 10 * len(P.X_train)
print('n_explainer_samples', n_explainer_samples)

# `results` feeds the metric tables below; `detectors` is reused later as the
# robustness model when regenerating explanations.
results, detectors = evaluate_detector(
    C, P, explainer_data, hparams,
    n_explainer_samples=n_explainer_samples,
)

# Detection Evaluation Metrics
The gathered results for attack detection are shown in the subsequent blocks.

In [None]:
# Substring abbreviations applied, in this order, to every metric name so the
# resulting table columns stay narrow.
replace_strs = {
    'delta': 'Δ',
    'explainer': 'expl',
    'pct': '%',
    'threshold': 'thresh',
    'robust': 'R',
    'greater': '>',
    'under': '<',
    'normalized': 'norm',
}

scores = []
for result in results:
    score = compute_metrics(result)
    # Iterate over a snapshot of the items because the mapping is re-keyed in
    # place while we walk it.
    for metric_name, _ in list(score.items()):
        short_name = metric_name
        for long_form, short_form in replace_strs.items():
            short_name = short_name.replace(long_form, short_form)
        score[short_name] = score.pop(metric_name)
    # Tag the row with the metadata the later groupby cells rely on.
    score.update(
        explainer=result.meta.explainer,
        innocuous_model=result.meta.innocuous_model,
    )
    scores.append(score)

# One row per result; displayed as the cell output.
score_df = pd.DataFrame(scores)
score_df

In [None]:
# Print the 'cdf_Δ_expl_test' metric for every (explainer, task) pair.
# The first row of each group is used.
#
# NOTE(review): DataFrame.groupby drops rows whose key is None/NaN by default,
# so any result whose innocuous_model is None is not listed here. Pass
# dropna=False to both groupby calls if those rows should be reported too.
for explainer, explainer_score_df in score_df.groupby('explainer'):
    for task, expl_score_df in explainer_score_df.groupby('innocuous_model'):
        fidelity_task = expl_score_df['cdf_Δ_expl_test'].values[0]
        print('cdf_Δ', explainer, task, fidelity_task)

In [None]:
# Fidelity of the (undefended) explanations: for every test sample, the
# Spearman correlation between the explained feature ranks and a reference
# ranking that puts the biased features on top, averaged over samples.
biased_features = P.problem.biased_features
n_feats = P.X_test.shape[1]

for explainer, tasks_data in explainer_data.items():
    for task, task_data in tasks_data.items():
        explanations = task_data['explanations']
        # Use the 'y_test_pred_f_biased' entry when present, else 'y_test_pred'.
        y_test_pred_f = task_data['y_test_pred_f_biased']
        if y_test_pred_f is None:
            y_test_pred_f = task_data['y_test_pred']

        score = 0
        for yi, contribs in zip(y_test_pred_f, explanations):
            # Drop a trailing "=<value>" from each explained feature name.
            weights = {name.rsplit('=', 1)[0]: w for name, w in contribs}
            # Feature names ordered by ascending contribution.
            keys_asc = sorted(weights, key=weights.get)

            f_ranks = []
            expl_ranks = []

            # Pass 1: biased features, never flipped.
            # NOTE(review): the next pass adds the biased features again with
            # the yi == 0 flip applied, so they are counted twice and, for
            # yi == 0, with contradictory ranks. This looks like leftover
            # code; confirm before relying on the absolute values.
            for feat in biased_features:
                f_ranks.append(n_feats - 1)
                expl_ranks.append(
                    keys_asc.index(feat) if feat in keys_asc else 0)

            # Passes 2 and 3: biased features (reference rank n_feats - 1)
            # followed by all remaining features (reference rank 0). Features
            # missing from the explanation get rank 0.
            wanted = [(feat, n_feats - 1) for feat in biased_features]
            wanted.extend(
                (feat, 0) for feat in {*P.features} - {*biased_features})
            for feat, rank_f in wanted:
                rank = keys_asc.index(feat) if feat in keys_asc else 0
                if yi == 0:
                    # Mirror both rankings when the prediction is class 0.
                    rank_f = n_feats - rank_f
                    rank = n_feats - rank
                f_ranks.append(rank_f)
                expl_ranks.append(rank)

            score += scipy.stats.spearmanr(expl_ranks, f_ranks)[0]
        score /= len(explanations)
        print('fidelity_g', explainer, task, score)

In [None]:
# "Precision" variant of the fidelity score: the mean rank that each
# explanation assigns to the biased features, averaged over samples and
# divided by the number of features.
biased_features = P.problem.biased_features
n_feats = P.X_test.shape[1]

for explainer, tasks_data in explainer_data.items():
    for task, task_data in tasks_data.items():
        explanations = task_data['explanations']
        # Use the 'y_test_pred_f_biased' entry when present, else 'y_test_pred'.
        y_test_pred_f = task_data['y_test_pred_f_biased']
        if y_test_pred_f is None:
            y_test_pred_f = task_data['y_test_pred']

        score = 0
        for yi, contribs in zip(y_test_pred_f, explanations):
            # Drop a trailing "=<value>" from each explained feature name.
            weights = {name.rsplit('=', 1)[0]: w for name, w in contribs}
            # Feature names ordered by ascending contribution.
            keys_asc = sorted(weights, key=weights.get)

            expl_ranks = []
            for feat in biased_features:
                # Features missing from the explanation get rank 0.
                rank = keys_asc.index(feat) if feat in keys_asc else 0
                # NOTE(review): the flipped rank spans 1..n_feats while the
                # unflipped one spans 0..n_feats-1, so the two classes are
                # normalised slightly differently -- confirm this is intended.
                expl_ranks.append(n_feats - rank if yi == 0 else rank)
            score += np.mean(expl_ranks)
        score /= len(explanations) * n_feats
        print('fidelity_g(precision)', explainer, task, score)

# Defending Explainer Explanations
This block uses the defense approach with each of the explainers to defend against the attack. Explanation fidelity is restored when our approach is employed.

In [None]:
# Regenerate the explanations, this time passing the detectors returned by
# evaluate_detector as the robustness model.
explainer_data_defense = generate_explanations(
    C, P,
    robustness_model=detectors,
    # num_samples_explain=...
)

# Defense Evaluation Metrics
The gathered results for attack defense are shown in the subsequent blocks.

In [None]:
# Infidelity of each task's explanations with respect to the explainer's
# `None`-task explanations (presumably the un-attacked baseline -- confirm):
# squared contribution differences summed over features and samples, divided
# by (number of samples * number of features).
n_feats = P.X_test.shape[1]
for explainer, expl_expl_data in explainer_data.items():
    g0_explanations = explainer_data[explainer][None]['explanations']
    for task, expl_expl_task_data in expl_expl_data.items():
        g_explanations = expl_expl_task_data['explanations']
        # zip() stops at the shorter sequence, which would silently skew the
        # average below; fail loudly instead, as the defense cell does.
        assert len(g0_explanations) == len(g_explanations)
        err_expls = 0
        for expl_g, expl_h in zip(g0_explanations, g_explanations):
            expl_g, expl_h = dict(expl_g), dict(expl_h)
            # A feature absent from one explanation contributes 0 there.
            for feat in {*expl_g.keys()} | {*expl_h.keys()}:
                contrib_g = expl_g.get(feat, 0.)
                contrib_h = expl_h.get(feat, 0.)
                err_expls += (contrib_h - contrib_g) ** 2
        err_expls /= len(g_explanations) * n_feats
        print('infidelity_g_wrt_g', explainer, task, err_expls)

In [None]:
# Infidelity of the defended explanations with respect to the undefended
# `None`-task explanations of the same explainer: squared contribution
# differences summed over features and samples, divided by
# (number of samples * number of features).
n_feats = P.X_test.shape[1]
for explainer, defended_tasks in explainer_data_defense.items():
    g_explanations = explainer_data[explainer][None]['explanations']
    for task, defended_task_data in defended_tasks.items():
        h_explanations = defended_task_data['explanations']
        # zip() below would otherwise truncate silently.
        assert len(g_explanations) == len(h_explanations)

        err_expls = 0
        for pairs_g, pairs_h in zip(g_explanations, h_explanations):
            expl_g = dict(pairs_g)
            expl_h = dict(pairs_h)
            # A feature absent from one explanation contributes 0 there.
            for feat in {*expl_g.keys()} | {*expl_h.keys()}:
                diff = expl_h.get(feat, 0.) - expl_g.get(feat, 0.)
                err_expls += diff ** 2
        err_expls /= len(g_explanations) * n_feats
        print('infidelity_CAD-DEFENSE_wrt_g', explainer, task, err_expls)

In [None]:
# Fidelity of the defended explanations: for every test sample, the Spearman
# correlation between the explained feature ranks and a reference ranking
# that puts the biased features on top, averaged over samples.
biased_features = P.problem.biased_features
n_feats = P.X_test.shape[1]

for explainer, tasks_data in explainer_data_defense.items():
    for task, task_data in tasks_data.items():
        explanations = task_data['explanations']
        # Use the 'y_test_pred_f_biased' entry when present, else 'y_test_pred'.
        y_test_pred_f = task_data['y_test_pred_f_biased']
        if y_test_pred_f is None:
            y_test_pred_f = task_data['y_test_pred']

        score = 0
        for yi, contribs in zip(y_test_pred_f, explanations):
            # Drop a trailing "=<value>" from each explained feature name.
            weights = {name.rsplit('=', 1)[0]: w for name, w in contribs}
            # Feature names ordered by ascending contribution.
            keys_asc = sorted(weights, key=weights.get)

            f_ranks = []
            expl_ranks = []

            # Pass 1: biased features, never flipped.
            # NOTE(review): the next pass adds the biased features again with
            # the yi == 0 flip applied, so they are counted twice and, for
            # yi == 0, with contradictory ranks. This looks like leftover
            # code; confirm before relying on the absolute values.
            for feat in biased_features:
                f_ranks.append(n_feats - 1)
                expl_ranks.append(
                    keys_asc.index(feat) if feat in keys_asc else 0)

            # Passes 2 and 3: biased features (reference rank n_feats - 1)
            # followed by all remaining features (reference rank 0). Features
            # missing from the explanation get rank 0.
            wanted = [(feat, n_feats - 1) for feat in biased_features]
            wanted.extend(
                (feat, 0) for feat in {*P.features} - {*biased_features})
            for feat, rank_f in wanted:
                rank = keys_asc.index(feat) if feat in keys_asc else 0
                if yi == 0:
                    # Mirror both rankings when the prediction is class 0.
                    rank_f = n_feats - rank_f
                    rank = n_feats - rank
                f_ranks.append(rank_f)
                expl_ranks.append(rank)

            score += scipy.stats.spearmanr(expl_ranks, f_ranks)[0]
        score /= len(explanations)
        print('fidelity_CAD-DEFENSE', explainer, task, score)