# Counterfactuals Membership Inference Experiment

In [1]:
import logging
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
# Keep patch_sklearn() ahead of the sklearn imports (and of dice_ml, which was
# also imported after patching originally) so the patched estimators are used.
from sklearnex import patch_sklearn
patch_sklearn()
import sklearn.ensemble as es
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import dice_ml

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Logger used for this notebook's debug output; its level is switched to DEBUG
# and back to ERROR around the proof-of-concept run further down.
logger = logging.getLogger('xai-privacy')

# Install the default handler on the root logger so that log records are
# actually printed in the notebook.
logging.basicConfig()

# Passed as `threads=` to run_all_experiments for the full experiment.
threads = 15

In [3]:
from experiment_setup import run_all_experiments
from experiment_setup import get_heart_disease_dataset
from experiment_setup import get_census_dataset
from experiment_setup import MembershipInference

In [4]:
# Forwarded to both loaders as `halve_dataset`; also decides the suffix of the
# results file name when the results are saved.
DATASET_HALF = True

# Each loader returns three dataset dictionaries
# (presumably full / numeric-only / categorical-only variants -- TODO confirm
# against experiment_setup).
(data_heart_dict,
 data_heart_num_dict,
 data_heart_cat_dict) = get_heart_disease_dataset(halve_dataset=DATASET_HALF)
(data_census_dict,
 data_census_num_dict,
 data_census_cat_dict) = get_census_dataset(halve_dataset=DATASET_HALF)

# Pull the pieces of the heart dataset dictionary that the demo cells need:
# the data itself, the name of the outcome column and the numeric feature names.
data_heart, outcome_name_heart, numeric_features_heart = (
    data_heart_dict[key] for key in ('dataset', 'outcome', 'num')
)

Feature Age: removed 0 rows for missing values.
Feature RestingBP: removed 59 rows for missing values.
Feature Cholesterol: removed 27 rows for missing values.
Feature FastingBS: add unknown category 2.0
Feature RestingECG: add unknown category 3.0
Feature MaxHR: removed 0 rows for missing values.
Feature Oldpeak: removed 7 rows for missing values.
Feature ST_Slope: add unknown category 4.0
Feature CA: add unknown category 4.0
Feature Thal: add unknown category 8.0
Dropped 71 of 548
Dropped 72 of 548
Dropped 71 of 548
Dropped: 2399 of 32561
census: Dropped 1256 of 15081
num: Dropped 8827 of 15081
cat: Dropped 4850 of 15081


This notebook will test whether membership inference is possible with counterfactuals (CF) that are drawn from the training data. Membership inference means an attacker with access to the explanation can determine for any sample whether it was included in the training data or not.

The idea for counterfactual membership inference is as follows: The attacker obtains counterfactuals for the given sample ("first counterfactuals"). They access the explainer again to receive counterfactuals for each first counterfactual ("second counterfactuals"). The second counterfactuals should have the same class as the original given sample (because the class was flipped twice). If any second counterfactual is equal to the given sample, then the sample must be part of the training data. This concept relies on the assumption that there is a high likelihood that a sample from the training data will be its own counter-counterfactual.

First, we import the attack class, which implements the `train_explainer` and `membership_inference_attack_no_model_access` functions:

In [5]:
# Attack code must be imported so that multiprocessing pool works. Check out cf_attack.py for the implementation of the attack.
from cf_attack import CounterfactualMembershipInference

# Executing Membership Inference

We now generate five counterfactuals for the first sample from the training data to demonstrate counterfactual explanations in general.

In [6]:
# Separate the outcome column from the feature columns.
labels = data_heart[outcome_name_heart]
features = data_heart.drop(columns=outcome_name_heart)

# Fit a random forest on the complete heart dataset (fit() returns the
# estimator itself, so the call can be chained).
model = es.RandomForestClassifier(random_state=0).fit(features, labels)

# Wrap data and model in the DiCE interfaces.
d = dice_ml.Data(
    dataframe=data_heart,
    continuous_features=numeric_features_heart,
    outcome_name=outcome_name_heart,
)
m = dice_ml.Model(model=model, backend="sklearn", model_type='classifier')

# The "kdtree" method generates counterfactuals from the training data.
exp = dice_ml.Dice(d, m, method="kdtree")

In [7]:
# Ask for five counterfactuals of the opposite class for the first row
# (selected by position, kept as a one-row DataFrame).
e1 = exp.generate_counterfactuals(
    features.iloc[:1],
    total_CFs=5,
    desired_class="opposite",
)
e1.visualize_as_dataframe(display_sparse_df=False)

100%|██████████| 1/1 [00:00<00:00,  1.18it/s]

Query instance (original outcome : 0)





Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,CA,Thal,HeartDisease
0,62.0,1.0,2.0,140.0,271.0,0.0,0.0,152.0,0.0,1.0,1.0,4.0,8.0,0.0



Diverse Counterfactual set without sparsity correction (new outcome:  1


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,CA,Thal
439,62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0
90,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0
166,60.0,1.0,2.0,160.0,267.0,1.0,1.0,157.0,0.0,0.5,2.0,4.0,8.0
103,66.0,1.0,4.0,112.0,261.0,0.0,0.0,140.0,0.0,1.5,1.0,4.0,8.0
370,67.0,1.0,1.0,142.0,270.0,1.0,0.0,125.0,0.0,2.5,1.0,4.0,8.0


We can see that the counterfactuals are similar to the query sample and that they have a flipped prediction. These are the two general properties of counterfactual explanations.

We will now do a small proof of concept of the experiment with logging enabled to demonstrate how it works.

In [8]:
# Show the attack's debug output for this small demo only. The root logger is
# set to ERROR so that only the 'xai-privacy' logger is switched to DEBUG here.
logger.setLevel(logging.DEBUG)
logging.root.setLevel(logging.ERROR)

try:
    # Small proof of concept: 10 queries against a decision tree, single-threaded.
    EXP = CounterfactualMembershipInference(data_heart, numeric_features_heart, outcome_name_heart, random_state=13)
    EXP.membership_inference_experiment(num_queries=10, model=DecisionTreeClassifier(random_state=13), model_access=False, threads=1)
finally:
    # Always turn debug output off again, even if the demo raises; otherwise
    # the full experiment below would flood the notebook with DEBUG records.
    logger.setLevel(logging.ERROR)

DEBUG:xai-privacy:Numeric Features: ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
DEBUG:xai-privacy:Categorical Features: ['CA', 'ChestPainType', 'ExerciseAngina', 'FastingBS', 'RestingECG', 'ST_Slope', 'Sex', 'Thal']
DEBUG:xai-privacy:Removed 6 test samples due to unknown category.
DEBUG:xai-privacy:[[ 65.    1.    4.  120.  177.    0.    0.  140.    0.    0.4   1.    0.
    7.    0. ]] taken from training data
DEBUG:xai-privacy:[[ 42.   1.   3. 130. 180.   0.   0. 150.   0.   0.   1.   0.   3.   0.]] taken from test data
DEBUG:xai-privacy:[[ 42.   1.   4. 140. 358.   0.   0. 170.   0.   0.   4.   4.   8.   0.]] taken from training data
DEBUG:xai-privacy:[[ 54.   1.   2. 120. 238.   0.   0. 154.   0.   0.   4.   4.   8.   0.]] taken from test data
DEBUG:xai-privacy:[[ 69.   1.   4. 135.   0.   0.   0. 130.   0.   0.   2.   4.   6.   1.]] taken from training data
DEBUG:xai-privacy:[[ 37.   0.   4. 130. 173.   0.   1. 184.   0.   0.   4.   4.   8.   0.]] taken from test data
D

Total time: 39.37s (training model: 0.03s, training explainer: 0.03s, experiment: 39.31s)
Accuracy: 0.6, precision: 1.0, recall: 0.2


The proof of concept should show that counterfactual membership inference is accurate most of the time. However, some training samples are not recognized because they do not appear as their own counter-counterfactual.

Now we begin executing the actual experiment. We begin by defining the table that will hold the results for all our different experiment variations. Then we execute all variations of the experiment for this dataset. We vary the model between a decision tree, a random forest and a neural network. Each model uses the default configuration of scikit-learn.

In [9]:
# All dataset variants the experiment is run on (heart first, then census).
dataset_dicts = [
    data_heart_dict, data_heart_num_dict, data_heart_cat_dict,
    data_census_dict, data_census_num_dict, data_census_cat_dict,
]

# One entry per model type: a display name plus the estimator *class*
# (not an instance).
dt_dict = dict(name='decision tree', model=DecisionTreeClassifier)
rf_dict = dict(name='random forest', model=es.RandomForestClassifier)
nn_dict = dict(name='neural network', model=MLPClassifier)

model_dicts = [dt_dict, rf_dict, nn_dict]

In [10]:
# remove pandas warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [None]:
# Run the attack for every dataset/model combination and collect the outcome
# in `results`. Each combination is repeated (see the "repetition N" lines in
# the output). num_queries=None presumably means "attack every sample" --
# TODO confirm against experiment_setup.
results = run_all_experiments(
    CounterfactualMembershipInference,
    dataset_dicts,
    model_dicts,
    random_state=0,
    num_queries=None,
    model_access=False,
    threads=threads,
    convert_cat_to_str=True,
    repeat=10,
)

dataset: heart, model: decision tree (repetition 0)
Total time: 195.85s (training model: 0.02s, training explainer: 0.02s, experiment: 195.82s)
Accuracy: 0.8448275862068966, precision: 1.0, recall: 0.698744769874477
dataset: heart, model: decision tree (repetition 1)
Total time: 193.18s (training model: 0.01s, training explainer: 0.03s, experiment: 193.15s)
Accuracy: 0.8586956521739131, precision: 1.0, recall: 0.7280334728033473
dataset: heart, model: decision tree (repetition 2)
Total time: 241.64s (training model: 0.01s, training explainer: 0.02s, experiment: 241.62s)
Accuracy: 0.8728448275862069, precision: 1.0, recall: 0.7531380753138075
dataset: heart, model: decision tree (repetition 3)
Total time: 242.14s (training model: 0.01s, training explainer: 0.02s, experiment: 242.12s)
Accuracy: 0.8530701754385965, precision: 1.0, recall: 0.7196652719665272
dataset: heart, model: decision tree (repetition 4)
Total time: 229.34s (training model: 0.01s, training explainer: 0.02s, experiment



Total time: 201.29s (training model: 0.54s, training explainer: 0.02s, experiment: 200.73s)
Accuracy: 0.8502109704641351, precision: 1.0, recall: 0.702928870292887
dataset: heart, model: neural network (repetition 1)




Total time: 234.03s (training model: 0.54s, training explainer: 0.01s, experiment: 233.48s)
Accuracy: 0.8166311300639659, precision: 1.0, recall: 0.6401673640167364
dataset: heart, model: neural network (repetition 2)




Total time: 190.08s (training model: 0.55s, training explainer: 0.01s, experiment: 189.52s)
Accuracy: 0.8556263269639066, precision: 1.0, recall: 0.7154811715481172
dataset: heart, model: neural network (repetition 3)




Total time: 269.05s (training model: 0.54s, training explainer: 0.01s, experiment: 268.50s)
Accuracy: 0.8205128205128205, precision: 1.0, recall: 0.6485355648535565
dataset: heart, model: neural network (repetition 4)




Total time: 227.71s (training model: 0.54s, training explainer: 0.00s, experiment: 227.16s)
Accuracy: 0.8468085106382979, precision: 1.0, recall: 0.698744769874477
dataset: heart, model: neural network (repetition 5)




Total time: 214.20s (training model: 0.54s, training explainer: 0.00s, experiment: 213.65s)
Accuracy: 0.8091106290672451, precision: 1.0, recall: 0.6317991631799164
dataset: heart, model: neural network (repetition 6)




Total time: 285.91s (training model: 0.54s, training explainer: 0.01s, experiment: 285.37s)
Accuracy: 0.860813704496788, precision: 1.0, recall: 0.7280334728033473
dataset: heart, model: neural network (repetition 7)




# Results

The results of all variations of the membership inference experiment with counterfactuals. In every experiment, we executed the membership inference attack on each sample of the training data and each sample of the test data. Both datasets are of equal size and originate from the same source dataset.

Accuracy is the percentage of samples whose membership (true or false) was correctly inferred. An algorithm guessing at random would achieve an accuracy of 50 percent.

Precision is the percentage of samples predicted to be training samples that are actually in the training data.

Recall is the percentage of training samples whose membership (true) was correctly inferred.

In [None]:
# Display the results collected by run_all_experiments as the cell output.
results

In [None]:
# Build the output file name; a suffix marks runs on the halved datasets.
file_name = 'results/1-6-cf-membership-inference-results'
if DATASET_HALF:
    file_name += '_dataset_size_halved'

# Make sure the target directory exists so that the long-running experiment's
# results are not lost to a missing-directory error on save.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)

# Floats are written with three decimals, missing values as 'NaN'.
results.to_csv(file_name + '.csv', index=False, na_rep='NaN', float_format='%.3f')

# Discussion

In our experiments, membership inference with counterfactuals drawn from the training data has an accuracy between 69% and 85%. Since no false positives can occur, precision is always 100%.

The lower recall of the attack with a neural network and the numeric dataset can be explained by the following observations. Recall increases significantly when one fixes the imbalance in the training dataset between label 0 and 1. Either undersampling the majority class or oversampling the minority class results in membership inference being similarly accurate as for decision trees or random forests.

Why does this change accuracy so much? Let's have a look at the predictions of the three models when using the original imbalanced dataset:

In [None]:
# Train on the second half of the heart data. This is meant to mirror the
# split used in the experiment -- TODO confirm against experiment_setup.
idx_mid = len(data_heart) // 2
data_train = data_heart.iloc[idx_mid:]

y_train = data_train[outcome_name_heart]
x_train = data_train.drop(columns=outcome_name_heart)

# Predictions below are made for the complete dataset, not only the training half.
features = data_heart.drop(columns=outcome_name_heart)

model_dt = DecisionTreeClassifier(random_state=0)
model_rf = es.RandomForestClassifier(random_state=0)
model_nn = MLPClassifier(random_state=0)

# Fit all three models on the same training split.
for model in (model_dt, model_rf, model_nn):
    model.fit(x_train, y_train)

# Check how often each model predicts 1 rather than 0
# (count_nonzero counts every prediction that is not 0).
pred_dt, pred_rf, pred_nn = (
    clf.predict(features) for clf in (model_dt, model_rf, model_nn)
)

print(f'Decision Tree predicted 1 for {np.count_nonzero(pred_dt)} samples.')
print(f'Random Forest predicted 1 for {np.count_nonzero(pred_rf)} samples.')
print(f'Neural Network predicted 1 for {np.count_nonzero(pred_nn)} samples.')
print(f'There is a total of {len(features)} samples.')

It is clear that the imbalance of the dataset has the biggest negative effect on the neural network and causes it to classify a majority of samples with 0. Therefore, there are fewer counterfactuals in the training data with a prediction of 1. This causes them to be further away from the queries on average and thus reduces the chance of a counter-counterfactual being the same as the query. Therefore, recall of the attack is reduced.

The question remains why the recall is equal for a random forest and decision tree. It turns out that both models perfectly fit to the training data because they are not limited in complexity. Therefore they also give exactly the same predictions for the training data and thus receive the same counterfactuals. They do not give the same predictions for the test data, but this does not matter, because this attack will always reject samples from outside the training data regardless of the model (simply because no counterfactual can be shown that is equal to the given sample).