# Counterfactuals Membership Inference Experiment

In [56]:
import logging
import warnings
from pathlib import Path

import dice_ml
import numpy as np
import pandas as pd
import sklearn.ensemble as es
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [57]:
# Number of worker threads handed to `run_all_experiments` (via `threads=threads`) for the full run.
threads = 1

In [58]:
%run experiment_setup.ipynb

INFO:xai-privacy:Loading dataset 1: heart disease (numeric features) ...
INFO:xai-privacy:Loading dataset 2: census income (categorical features) ...


Feature Age: removed 0 rows for missing values.
Feature RestingBP: removed 59 rows for missing values.
Feature Cholesterol: removed 27 rows for missing values.
Feature FastingBS: add unknown category 2.0
Feature RestingECG: add unknown category 3.0
Feature MaxHR: removed 0 rows for missing values.
Feature Oldpeak: removed 7 rows for missing values.
Feature ST_Slope: add unknown category 4.0
Feature CA: add unknown category 4.0
Feature Thal: add unknown category 8.0
Dropped 271 of 1097
Dropped 273 of 1097
Dropped 277 of 1097
Dropped: 2399 of 32561
census: Dropped 3848 of 30162
num: Dropped 19859 of 30162
cat: Dropped 12136 of 30162


In [59]:
# Named 'xai-privacy' logger; the attack implementation below writes its debug output through it.
logger = logging.getLogger('xai-privacy')

This notebook will test whether membership inference is possible with counterfactuals (CF) that are drawn from the training data. Membership inference means an attacker with access to the explanation can determine for any sample whether it was included in the training data or not.

The idea for counterfactual membership inference is as follows: The attacker obtains counterfactuals for the given sample ("first counterfactuals"). They access the explainer again to receive counterfactuals for each first counterfactual ("second counterfactuals"). The second counterfactuals should have the same class as the original given sample (because the class was flipped twice). If any second counterfactual is equal to the given sample, then the sample must be part of the training data. This concept relies on the assumption that there is a high likelihood that a sample from the training data will be its own counter-counterfactual.

First, we implement the `train_explainer` and `membership_inference_attack_no_model_access` functions:

In [60]:
class CounterfactualMembershipInference(MembershipInference):
    """Membership inference attack that uses counterfactuals drawn from the training data.

    The attack queries the explainer for counterfactuals of a sample ("first
    counterfactuals") and then for counterfactuals of each of those ("second
    counterfactuals"). A sample is inferred to be a training member as soon as
    one of its second counterfactuals equals the sample itself.
    """

    def train_explainer(self, data_train, model):
        """Build a DiCE explainer for ``model`` on ``data_train``.

        Args:
            data_train: DataFrame handed to ``dice_ml.Data``; the column named
                ``self.outcome_name`` is used as the outcome and
                ``self.numeric_features`` as the continuous features.
            model: Classifier wrapped through ``dice_ml.Model`` with the
                ``"sklearn"`` backend.

        Returns:
            A ``dice_ml.Dice`` explainer configured with ``method="kdtree"``.
        """
        # train explainer on training data
        d = dice_ml.Data(dataframe=data_train, continuous_features=self.numeric_features,
                         outcome_name=self.outcome_name)
        m = dice_ml.Model(model=model, backend="sklearn", model_type='classifier')

        # use method "kdtree" to get counterfactuals drawn from the training data
        return dice_ml.Dice(d, m, method="kdtree")

    @staticmethod
    def membership_inference_attack_no_model_access(explainer, samples_df):
        """Infer training-set membership for every row of ``samples_df``.

        Args:
            explainer: Object exposing ``generate_counterfactuals`` (e.g. the
                explainer returned by ``train_explainer``).
            samples_df: DataFrame with one sample per row. Its last column is
                dropped before querying because only the features are used.

        Returns:
            Boolean numpy array with one entry per row of ``samples_df``;
            ``True`` where a second counterfactual matched the sample.
        """
        num_samples = len(samples_df)
        inferred_membership = np.full(num_samples, False)

        # Nothing to attack: avoid querying the explainer with an empty frame.
        if num_samples == 0:
            return inferred_membership

        # we only use the features for membership inference, not the target. Therefore we must drop the last column.
        samples_df = samples_df.drop(samples_df.columns[-1], axis=1)

        # This is the default number of counterfactuals per query used on the github page of DiCE
        cfs_per_query = 4

        # get first counterfactuals for all given samples
        e1 = explainer.generate_counterfactuals(samples_df, total_CFs=cfs_per_query, desired_class='opposite')

        # First counterfactual frames are gathered in a list and concatenated once below;
        # growing a DataFrame with pd.concat inside the loop copies all rows on every iteration.
        first_cf_frames = []

        # collect the original sample index corresponding to a first counterfactual
        # this is necessary in order to remember which first counterfactuals belonged to which original sample
        respective_sample_index = []

        # collect first counterfactuals
        for index in range(num_samples):
            # get counterfactuals for given sample:
            first_cfs = e1.cf_examples_list[index].final_cfs_df

            # NOTE(review): guard for the explainer reporting no result frame at all
            # for a query; such a sample simply contributes no first counterfactuals.
            if first_cfs is None:
                continue

            logger.debug(f'Sample {index}: 1st counterfactuals: \n {first_cfs.to_numpy()}')

            first_cf_frames.append(first_cfs)
            respective_sample_index.extend([index] * len(first_cfs))

        # The leading empty frame keeps the feature column order of samples_df,
        # exactly as the incremental concatenation did.
        first_cfs_all = pd.concat([pd.DataFrame(columns=samples_df.columns), *first_cf_frames])

        # Without any first counterfactual no sample can be its own counter-counterfactual.
        if len(first_cfs_all) == 0:
            return inferred_membership

        respective_sample_index = np.array(respective_sample_index)

        # get second counterfactuals for all first counterfactuals
        e2 = explainer.generate_counterfactuals(first_cfs_all, total_CFs=cfs_per_query, desired_class='opposite')

        # compare all second counterfactuals with the samples they were generated for
        for i, second_cfs_obj in enumerate(e2.cf_examples_list):
            # get the sample that these second counterfactuals belong to:
            index = respective_sample_index[i]
            sample_df = samples_df.iloc[[index], :]

            logger.debug(f'Sample {index}: {sample_df.to_numpy()}')

            second_cfs = second_cfs_obj.final_cfs_df

            # Same guard as for the first counterfactuals: nothing to compare against.
            if second_cfs is None:
                continue

            logger.debug(f'Sample {index}: 2nd counterfactuals: \n {second_cfs.to_numpy()}')

            # if any counter-counterfactual is equal to the given sample, then it is part of the training data:
            # np.isclose is used for comparison because explainer may round floating point values
            result = np.isclose(second_cfs.to_numpy().astype(float), sample_df.to_numpy().astype(float)).all(axis=1).any()

            if result:
                logger.debug('Inferred membership as true.')
                inferred_membership[index] = True

        return inferred_membership

# Executing Membership Inference

We now generate five counterfactuals for the first sample from the training data to demonstrate counterfactual explanations in general.

In [15]:
# Separate the label column from the feature columns of the heart disease data.
labels = data_heart[outcome_name_heart]
features = data_heart.drop(columns=outcome_name_heart)

# Fit a random forest with a fixed seed on these features and labels.
model = es.RandomForestClassifier(random_state=0).fit(features, labels)

# Wrap the data and the fitted model in DiCE's interface objects.
d = dice_ml.Data(
    dataframe=data_heart,
    continuous_features=numeric_features_heart,
    outcome_name=outcome_name_heart,
)
m = dice_ml.Model(model=model, backend="sklearn", model_type='classifier')

# The "kdtree" method generates counterfactuals from the training data.
exp = dice_ml.Dice(d, m, method="kdtree")

In [7]:
# Request five counterfactuals with the opposite class for the first row and show them as a table.
e1 = exp.generate_counterfactuals(
    features.iloc[0:1],
    total_CFs=5,
    desired_class="opposite",
)
e1.visualize_as_dataframe(display_sparse_df=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.44s/it]

Query instance (original outcome : 1)





Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak,HeartDisease
0,61.0,130.0,330.0,169.0,0.0,1.0



Diverse Counterfactual set without sparsity correction (new outcome:  0


Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
22,37.0,130.0,315.0,158.0,0.0
86,58.0,120.0,340.0,172.0,0.0
468,53.0,140.0,320.0,162.0,0.0
668,55.0,122.0,320.0,155.0,0.0
99,55.0,145.0,326.0,155.0,0.0


We can see that the counterfactuals are similar to the query sample and that they have a flipped prediction. These are the two general properties of counterfactual explanations.

We will now do a small proof of concept of the experiment with logging enabled to demonstrate how it works.

In [8]:
# Proof of concept: run a ten-query experiment with the 'xai-privacy' logger at DEBUG level.
logger.setLevel(logging.DEBUG)
# The root logger is restricted to errors (presumably to keep other loggers quiet — confirm).
logging.root.setLevel(logging.ERROR)

try:
    EXP = CounterfactualMembershipInference(data_heart, numeric_features_heart, outcome_name_heart, random_state=13)
    EXP.membership_inference_experiment(num_queries=10, model=DecisionTreeClassifier(random_state=13), model_access=False, threads=1)
finally:
    # Always switch debug output off again, even when the experiment raises;
    # otherwise later cells would inherit the DEBUG level.
    logger.setLevel(logging.ERROR)

DEBUG:xai-privacy:[[ 45. 142. 309. 147.   0.   1.]] taken from training data
DEBUG:xai-privacy:[[ 62.  120.  281.  103.    1.4   1. ]] taken from test data
DEBUG:xai-privacy:[[5.50e+01 1.15e+02 0.00e+00 1.55e+02 1.00e-01 1.00e+00]] taken from training data
DEBUG:xai-privacy:[[ 48.  160.  329.   92.    1.5   1. ]] taken from test data
DEBUG:xai-privacy:[[ 38.   92.  117.  134.    2.5   1. ]] taken from training data
DEBUG:xai-privacy:[[ 56. 125.   0.  98.  -2.   1.]] taken from test data
DEBUG:xai-privacy:[[ 44. 140. 235. 180.   0.   0.]] taken from training data
DEBUG:xai-privacy:[[ 47. 140. 257. 135.   1.   0.]] taken from test data
DEBUG:xai-privacy:[[ 41. 126. 306. 163.   0.   0.]] taken from training data
DEBUG:xai-privacy:[[ 51. 110.   0.  92.   0.   1.]] taken from test data
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1

Total time: 11.23s (training model: 0.01s, training explainer: 0.01s, experiment: 11.22s)
Accuracy: 0.9, precision: 1.0, recall: 0.8


The proof of concept should show that counterfactual membership inference is accurate most of the time. However, some training samples are not recognized because they do not appear as their own counter-counterfactual.

Now we begin executing the actual experiment. We begin by defining the table that will hold the results for all our different experiment variations. Then we execute all variations of the experiment for this dataset. We vary the model between a decision tree, a random forest and a neural network. Each model uses the default configuration of scikit-learn.

In [61]:
# Empty results table: one (initially empty) column per reported quantity.
# It is handed to `run_all_experiments` as `results_table` further down.
results_ = {column: [] for column in ('dataset', 'model', 'accuracy', 'precision', 'recall')}
results = pd.DataFrame(results_)

In [62]:
# Dataset variants the experiment is run on.
dataset_dicts = [
    data_heart_dict,
    data_heart_num_dict,
    data_heart_cat_dict,
    data_census_dict,
    data_census_num_dict,
    data_census_cat_dict,
]

# One descriptor per model family: a display name plus the estimator class itself (not an instance).
dt_dict, rf_dict, nn_dict = (
    {'name': display_name, 'model': estimator_class}
    for display_name, estimator_class in (
        ('decision tree', DecisionTreeClassifier),
        ('random forest', es.RandomForestClassifier),
        ('neural network', MLPClassifier),
    )
)

model_dicts = [dt_dict, rf_dict, nn_dict]

In [63]:
# remove pandas warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [None]:
# Execute the attack for every dataset/model combination and collect the metrics in `results`.
# NOTE(review): the current `results` table is passed back in as `results_table`; confirm that
# re-running this cell does not duplicate rows.
results = run_all_experiments(
    CounterfactualMembershipInference,
    dataset_dicts,
    model_dicts,
    random_state=0,
    num_queries=None,
    model_access=False,
    threads=threads,
    results_table=results,
    convert_cat_to_str=True,
)

# Results

The results of all variations of the membership inference experiment with counterfactuals. In every experiment, we executed the membership inference attack on each sample of the training data and each sample of the test data. Both datasets are of equal size and originate from the same source dataset.

Accuracy is the percentage of samples whose membership (true or false) was correctly inferred. An algorithm guessing at random would achieve an accuracy of 50 percent.

Precision is the percentage of predicted training samples that are actually in the training data.

Recall is the percentage of training samples whose membership (true) was correctly inferred.

In [21]:
# Show the collected metrics table as the cell output.
results

Unnamed: 0,dataset,model,accuracy,precision,recall
0,numeric,decision tree,0.848301,1.0,0.696602
1,numeric,random forest,0.848301,1.0,0.696602
2,numeric,neural network,0.787621,1.0,0.575243
3,categorical,decision tree,0.707071,1.0,0.414141
4,categorical,random forest,0.707071,1.0,0.414141
5,categorical,neural network,0.693085,1.0,0.386169
6,mixed,decision tree,0.696359,1.0,0.392719
7,mixed,random forest,0.696397,1.0,0.392795
8,mixed,neural network,0.665197,1.0,0.330394


In [22]:
# Persist the metrics table as CSV (three decimals, missing values written as 'NaN').
# The target directory is created first so that a missing `results/` folder does not
# make the write fail after the experiments have finished.
results_path = Path('results/1-6-cf-membership-inference-results.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
results.to_csv(results_path, index=False, na_rep='NaN', float_format='%.3f')

# Discussion

In our experiments, membership inference with counterfactuals drawn from the training data has an accuracy between 67% and 85% (see the table above). Since no false positives can occur, precision is always 100%.

The lower recall of the attack with a neural network and the numeric dataset can be explained by the following observations. Recall increases significantly when one fixes the imbalance in the training dataset between label 0 and 1. Either undersampling the majority class or oversampling the minority class makes membership inference about as accurate as it is for decision trees or random forests.

Why does this change accuracy so much? Let's have a look at the predictions of the three models when using the original imbalanced dataset:

In [23]:
# Training split: the second half of the numeric dataset
# (per the original note, intended to mirror the experiment setup).
idx_mid = data_num.shape[0] // 2
data_train = data_num.iloc[idx_mid:]

y_train = data_train[outcome_name_num]
x_train = data_train.drop(columns=outcome_name_num)

# Predictions below are made for every sample of the dataset, not only the training half.
features = data_num.drop(columns=outcome_name_num)

model_dt = DecisionTreeClassifier(random_state=0)
model_rf = es.RandomForestClassifier(random_state=0)
model_nn = MLPClassifier(random_state=0)

# Each estimator is fitted on the same training split.
for model in (model_dt, model_rf, model_nn):
    model.fit(x_train, y_train)

# check how often they predict 1/0
pred_dt, pred_rf, pred_nn = (
    fitted.predict(features) for fitted in (model_dt, model_rf, model_nn)
)

print(f'Decision Tree predicted 1 for {np.count_nonzero(pred_dt)} samples.')
print(f'Random Forest predicted 1 for {np.count_nonzero(pred_rf)} samples.')
print(f'Neural Network predicted 1 for {np.count_nonzero(pred_nn)} samples.')
print(f'There is a total of {len(features)} samples.')

Decision Tree predicted 1 for 460 samples.
Random Forest predicted 1 for 449 samples.
Neural Network predicted 1 for 389 samples.
There is a total of 824 samples.


It is clear that the imbalance of the dataset has the biggest negative effect on the neural network and causes it to classify a majority of samples as 0. Therefore, there are fewer counterfactuals in the training data with a prediction of 1. This causes them to be further away from the queries on average and thus reduces the chance of a counter-counterfactual being the same as the query. Therefore, recall of the attack is reduced.

The question remains why the recall is equal for a random forest and a decision tree. It turns out that both models fit the training data perfectly because they are not limited in complexity. Therefore they also give exactly the same predictions for the training data and thus receive the same counterfactuals. They do not give the same predictions for the test data, but this does not matter, because this attack will always reject samples from outside the training data regardless of the model (simply because no counterfactual can be shown that is equal to the given sample).