# Influential Instances Membership Inference Experiment

In [1]:
import pandas as pd
import sklearn.ensemble as es
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import numpy as np
import random
import logging
import sys
import time
import multiprocessing
import torch
from torch import nn
from torch.utils.data import Dataset
import torch.utils.data as data_utils
import pytorch_influence_functions as ptif

This notebook will test whether membership inference is possible with counterfactuals (CF) that are drawn from the training data. Membership inference means an attacker with access to the explanation can determine for any sample whether it was included in the training data or not.

First we define the function that will run the experiment for the different variations. The attacker obtains a counterfactual for the test sample ("counterfactual \#1"). They access the explainer a second time to receive a counterfactual for counterfactual \#1 ("counterfactual \#2"). Counterfactual \#2 should have the same class as the original test sample. If counterfactual \#2 is equal to the test sample, then the test sample must be part of the training data.

In [33]:
class PandasDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features.to_numpy().astype(np.float32)
        self.targets = targets.to_numpy().astype(np.int_)
        
        # for normaliziation of features:
        self.max = self.features.max(axis=0)
        self.min = self.features.min(axis=0)

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, idx):
        return [np.divide(self.features[idx] - self.min, self.max - self.min), int(self.targets[idx])]

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(8, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 2),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
    
class InfluentialInstances():
    def __init__(self, model, train_loader, train_df, outcome_name):
        self.model = model
        self.train_loader = train_loader
        self.train_df = train_df
        
        ptif.init_logging()
        
        self.config = ptif.get_default_config()
        # calculate on CPU
        self.config['gpu'] = -1
        # calculate for one test sample:
        self.config['test_sample_num'] = 1
        self.config['num_classes'] = 1
        
        self.outcome_name = outcome_name
        
    def explain(self, sample):
        test_data = PandasDataset(sample.drop(self.outcome_name, axis=1), sample[self.outcome_name])
        test_loader = data_utils.DataLoader(dataset = test_data, batch_size = 1, shuffle = True)

        influences = ptif.calc_img_wise(self.config, self.model, self.train_loader, test_loader)
                                  
        harmful_id = influences['0']['harmful'][0]
        helpful_id = influences['0']['helpful'][0]
                                  
        return self.train_df.iloc[harmful_id], self.train_df.iloc[helpful_id]
    
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 10 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [29]:
def experiment(data, outcome_name, repetitions, model, random_state=0):
    # create random state from seed. This will be used to draw the test samples for the experiment.
    rs = np.random.RandomState(seed=random_state)
    
    # split data into two halves. One is used for training, the other as control data that is not part of the training data.
    # this control data will be needed as test samples that do not belong to the training data.
    idx_mid = int(features.shape[0] / 2)
    
    data_ctrl = data.iloc[:idx_mid, :]

    data_train = data.iloc[idx_mid:, :]
    
    train_dataset = PandasDataset(data_train.drop(outcome_name, axis=1), data_train[outcome_name])
    train_loader = data_utils.DataLoader(dataset = train_dataset, batch_size = 64, shuffle = True)
    
    # train model on training data
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    epochs = 10
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loop(train_loader, model, loss_fn, optimizer)
    print("Done!")
    
    # init explainer (new class)
    explainer = InfluentialInstances(model, train_loader, data_train, outcome_name)
    
    # boolean numpy arrays for actual and inferred membership of the test samples
    sample_membership = np.empty(repetitions)
    inferred_membership = np.empty(repetitions)
    
    for i in range(repetitions):
        if i % 2 == 0:
            # choose sample from training data.
            sample = data_train.sample(random_state=rs)
            sample_membership[i] = True
            logging.debug('%s taken from training data' % sample.to_numpy())
        else:
            # choose sample from control data.
            sample = data_ctrl.sample(random_state=rs)
            sample_membership[i] = False
            logging.debug('%s taken from control data' % sample.to_numpy())
        
        # infer membership using membership inference attack against the explainer
        inferred_membership[i] = ii_membership_inference(sample, explainer)
    
    # calculate accuracy, sensitivity and specificity
    samples_in_training_data = np.count_nonzero(sample_membership)
    samples_not_in_training_data = repetitions - samples_in_training_data
        
    correct_predictions = np.count_nonzero(np.equal(inferred_membership, sample_membership))
    predict_in_training_data_correct = np.count_nonzero(inferred_membership[sample_membership == True])
    predict_not_in_training_data_correct = np.count_nonzero(inferred_membership[sample_membership == False] == False)
                
    ratio_correct = correct_predictions / repetitions
    ratio_correct_td = predict_in_training_data_correct / samples_in_training_data
    ratio_correct_cd = predict_not_in_training_data_correct / samples_not_in_training_data
        
    print('Membership Inference Accuracy: %s, Sensitivity: %s, Specificity: %s'\
          % (ratio_correct, ratio_correct_td, ratio_correct_cd))
    
    return ratio_correct, ratio_correct_td, ratio_correct_cd

In [35]:
def ii_membership_inference(sample, explainer):
    harmful_instance, helpful_instance = explainer.explain(sample)
    
    logging.debug(f'Harmful instance: {harmful_instance.to_numpy()}')
    logging.debug(f'Helpful instance: {helpful_instance.to_numpy()}')
    
    harmful_equal = np.isclose(sample.to_numpy().astype(float), harmful_instance.to_numpy().astype(float)).all()
    helpful_equal = np.isclose(sample.to_numpy().astype(float), helpful_instance.to_numpy().astype(float)).all()
    
    return harmful_equal or helpful_equal

# Dataset 1: Heart Disease

Load dataset one: heart disease

In [2]:
filename = '../data/framingham.csv'

names = ['sex', 'age', 'education', 'smoker', 'cigs_per_day', 'bp_meds', 'prevalent_stroke', 'prevelant_hyp', 'diabetes', \
         'total_chol', 'sys_bp', 'dia_bp', 'bmi', 'heart_rate', 'glucose', 'heart_disease_label']

data = pd.read_csv(filename, names=names)

For this dataset we only look at numerical data so we drop the categorical columns. We also drop the column "education" for which there is no feature description on kaggle: https://www.kaggle.com/dileep070/heart-disease-prediction-using-logistic-regression

In [3]:
data_num = data.drop('sex', axis=1).drop('smoker', axis=1).drop('bp_meds', axis=1).drop('prevalent_stroke', axis=1)\
    .drop('prevelant_hyp', axis=1).drop('diabetes', axis=1).drop('education', axis=1)

data_num.head(5)

Unnamed: 0,age,cigs_per_day,total_chol,sys_bp,dia_bp,bmi,heart_rate,glucose,heart_disease_label
0,39,0.0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,46,0.0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,48,20.0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,61,30.0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,46,23.0,285.0,130.0,84.0,23.1,85.0,85.0,0


Remove any rows that are missing data. Afterwards there should be no more entries with NaN values. We also drop any duplicate rows.

In [4]:
data_num = data_num.dropna()
data_num = data_num.drop_duplicates()

data_num_100 = data_num.sample(n = 100, random_state=13)
data_num_100 = data_num_100.reset_index(drop=True)

continuous_features_num = ['age', 'total_chol', 'sys_bp', 'dia_bp', 'bmi', 'heart_rate', 'glucose']
outcome_name_num = 'heart_disease_label'

data_num.isnull().sum()

age                    0
cigs_per_day           0
total_chol             0
sys_bp                 0
dia_bp                 0
bmi                    0
heart_rate             0
glucose                0
heart_disease_label    0
dtype: int64

We can see that the counterfactuals are similar to the query sample and that most of them have a flipped prediction. These are the two general properties of counterfactual explanations.

We will now do a small proof of concept of the experiment with logging enabled to demonstrate how it works.

In [36]:
logging.root.setLevel(logging.DEBUG)

experiment(data_num, repetitions=10, outcome_name=outcome_name_num, model=NeuralNetwork().to('cpu'))

logging.root.setLevel(logging.WARNING)

Epoch 1
-------------------------------
loss: 0.688230  [    0/ 1900]
loss: 0.685886  [  640/ 1900]
loss: 0.686175  [ 1280/ 1900]
Epoch 2
-------------------------------
loss: 0.684670  [    0/ 1900]
loss: 0.682412  [  640/ 1900]
loss: 0.683906  [ 1280/ 1900]
Epoch 3
-------------------------------
loss: 0.678141  [    0/ 1900]
loss: 0.678828  [  640/ 1900]
loss: 0.681906  [ 1280/ 1900]
Epoch 4
-------------------------------
loss: 0.673320  [    0/ 1900]
loss: 0.675549  [  640/ 1900]
loss: 0.673270  [ 1280/ 1900]
Epoch 5
-------------------------------
loss: 0.673495  [    0/ 1900]
loss: 0.678432  [  640/ 1900]
loss: 0.674575  [ 1280/ 1900]
Epoch 6
-------------------------------
loss: 0.671335  [    0/ 1900]
loss: 0.670219  [  640/ 1900]
loss: 0.664672  [ 1280/ 1900]
Epoch 7
-------------------------------
loss: 0.666049  [    0/ 1900]
loss: 0.666984  [  640/ 1900]
loss: 0.669146  [ 1280/ 1900]
Epoch 8
-------------------------------
loss: 0.672812  [    0/ 1900]
loss: 0.663538  [  6

  return [np.divide(self.features[idx] - self.min, self.max - self.min), int(self.targets[idx])]


2022-01-21 07:57:58,390: The results for this run are:
2022-01-21 07:57:58,391: Influences: 
2022-01-21 07:57:58,392: [tensor(nan), tensor(nan), tensor(nan)]
2022-01-21 07:57:58,393: Most harmful img IDs: 
2022-01-21 07:57:58,394: [0, 1273, 1272]
2022-01-21 07:57:58,395: Most helpful img IDs: 
2022-01-21 07:57:58,397: [1899, 637, 625]
2022-01-21 07:57:58,408: Harmful instance: [ 48.     0.   234.   141.    98.    21.06  53.    82.     0.  ]
2022-01-21 07:57:58,409: Helpful instance: [ 52.     0.   269.   133.5   83.    21.47  80.   107.     0.  ]
2022-01-21 07:57:58,411: [[ 45.    0.  258.  114.   80.   26.6  80.   68.    0. ]] taken from control data
2022-01-21 07:57:58,417: Running on: 1 images per class.
2022-01-21 07:57:58,418: Starting at img number: 0 per class.
2022-01-21 07:58:02,394: The results for this run are:
2022-01-21 07:58:02,394: Influences: 
2022-01-21 07:58:02,395: [tensor(nan), tensor(nan), tensor(nan)]
2022-01-21 07:58:02,397: Most harmful img IDs: 
2022-01-21 07:5

2022-01-21 07:58:21,633: Starting at img number: 0 per class.
2022-01-21 07:58:25,391: The results for this run are:
2022-01-21 07:58:25,392: Influences: 
2022-01-21 07:58:25,392: [tensor(nan), tensor(nan), tensor(nan)]
2022-01-21 07:58:25,394: Most harmful img IDs: 
2022-01-21 07:58:25,395: [0, 1273, 1272]
2022-01-21 07:58:25,396: Most helpful img IDs: 
2022-01-21 07:58:25,396: [1899, 637, 625]
2022-01-21 07:58:25,407: Harmful instance: [ 48.     0.   234.   141.    98.    21.06  53.    82.     0.  ]
2022-01-21 07:58:25,409: Helpful instance: [ 52.     0.   269.   133.5   83.    21.47  80.   107.     0.  ]
2022-01-21 07:58:25,412: [[ 54.     0.   265.   121.    82.    23.52  60.    67.     0.  ]] taken from training data
2022-01-21 07:58:25,415: Running on: 1 images per class.
2022-01-21 07:58:25,416: Starting at img number: 0 per class.
2022-01-21 07:58:29,284: The results for this run are:
2022-01-21 07:58:29,285: Influences: 
2022-01-21 07:58:29,287: [tensor(nan), tensor(nan), tens

In [162]:
results_ = {'dataset': [], 'model': [], 'accuracy': [], 'sensitivity': [], 'specificity': []}

results = pd.DataFrame(data = results_)

We can now begin with the actual experiments.

In [163]:
logging.info("features: continuous, model: decision tree.")

start_time = time.time()

accuracy, sensitivity, specificity = experiment(data_num, repetitions=100, continuous_features=continuous_features_num,\
                            outcome_name=outcome_name_num, random_state=0, clf=DecisionTreeClassifier(random_state=0))

logging.info(f'accuracy: {accuracy}, sensitivity: {sensitivity}, specificity: {specificity}')
results.loc[len(results.index)] = ['continuous', 'decision tree', accuracy, sensitivity, specificity]

print("--- %s seconds ---" % (time.time() - start_time))

100%|██████████| 1/1 [00:00<00:00, 11.35it/s]
100%|██████████| 1/1 [00:00<00:00,  6.53it/s]
100%|██████████| 1/1 [00:00<00:00, 13.18it/s]
100%|██████████| 1/1 [00:00<00:00, 14.23it/s]
100%|██████████| 1/1 [00:00<00:00,  6.43it/s]
100%|██████████| 1/1 [00:00<00:00, 11.21it/s]
100%|██████████| 1/1 [00:00<00:00,  7.27it/s]
100%|██████████| 1/1 [00:00<00:00,  5.32it/s]
100%|██████████| 1/1 [00:00<00:00,  4.53it/s]
100%|██████████| 1/1 [00:00<00:00,  5.85it/s]
100%|██████████| 1/1 [00:00<00:00, 20.04it/s]
100%|██████████| 1/1 [00:00<00:00,  7.77it/s]
100%|██████████| 1/1 [00:00<00:00,  6.22it/s]
100%|██████████| 1/1 [00:00<00:00,  5.13it/s]
100%|██████████| 1/1 [00:00<00:00,  5.49it/s]
100%|██████████| 1/1 [00:00<00:00,  5.01it/s]
100%|██████████| 1/1 [00:00<00:00, 12.51it/s]
100%|██████████| 1/1 [00:00<00:00,  4.31it/s]
100%|██████████| 1/1 [00:00<00:00, 15.93it/s]
100%|██████████| 1/1 [00:00<00:00,  3.26it/s]
100%|██████████| 1/1 [00:00<00:00,  8.10it/s]
100%|██████████| 1/1 [00:00<00:00,

Membership Inference Accuracy: 0.62, Sensitivity: 0.24, Specificity: 1.0
--- 36.29154968261719 seconds ---





In [164]:
logging.info("features: continuous, model: random forest.")

start_time = time.time()

accuracy, sensitivity, specificity = experiment(data_num, repetitions=100, continuous_features=continuous_features_num, outcome_name=outcome_name_num,\
                      clf=es.RandomForestClassifier(random_state=0), random_state=0)

logging.info(f'accuracy: {accuracy}, sensitivity: {sensitivity}, specificity: {specificity}')
results.loc[len(results.index)] = ['continuous', 'random forest', accuracy, sensitivity, specificity]

print("--- %s seconds ---" % (time.time() - start_time))

100%|██████████| 1/1 [00:00<00:00,  2.24it/s]
100%|██████████| 1/1 [00:00<00:00,  1.28it/s]
100%|██████████| 1/1 [00:00<00:00,  3.69it/s]
100%|██████████| 1/1 [00:00<00:00,  4.30it/s]
100%|██████████| 1/1 [00:00<00:00,  1.17it/s]
100%|██████████| 1/1 [00:00<00:00,  2.89it/s]
100%|██████████| 1/1 [00:00<00:00,  1.70it/s]
100%|██████████| 1/1 [00:00<00:00,  2.74it/s]
100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
100%|██████████| 1/1 [00:01<00:00,  1.96s/it]
100%|██████████| 1/1 [00:00<00:00,  8.81it/s]
100%|██████████| 1/1 [00:00<00:00,  1.46it/s]
100%|██████████| 1/1 [00:00<00:00,  1.34it/s]
100%|██████████| 1/1 [00:01<00:00,  1.04s/it]
100%|██████████| 1/1 [00:00<00:00,  1.36it/s]
100%|██████████| 1/1 [00:01<00:00,  1.95s/it]
100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
100%|██████████| 1/1 [00:01<00:00,  1.24s/it]
100%|██████████| 1/1 [00:00<00:00,  5.33it/s]
100%|██████████| 1/1 [00:01<00:00,  1.83s/it]
100%|██████████| 1/1 [00:00<00:00,  1.82it/s]
100%|██████████| 1/1 [00:01<00:00,

Membership Inference Accuracy: 0.62, Sensitivity: 0.24, Specificity: 1.0
--- 187.3954713344574 seconds ---





# Dataset 2: Census Income (categorical)

Load dataset two: census income

In [165]:
filename = '../data/adult.data.csv'

names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', \
         'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'label']

data_cat = pd.read_csv(filename, names=names)

In [166]:
data_cat.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


There is missing data in the columns workclass and native_country that needs to be removed.

In [167]:
print("Unique values of columns before removal: ")
print(data_cat.workclass.unique())
print(data_cat.native_country.unique())

data_cat = data_cat[data_cat.workclass != ' ?']
data_cat = data_cat[data_cat.native_country != ' ?']

print("Unique values of columns after removal: ")
print(data_cat.workclass.unique())
print(data_cat.native_country.unique())

Unique values of columns before removal: 
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']
[' United-States' ' Cuba' ' Jamaica' ' India' ' ?' ' Mexico' ' South'
 ' Puerto-Rico' ' Honduras' ' England' ' Canada' ' Germany' ' Iran'
 ' Philippines' ' Italy' ' Poland' ' Columbia' ' Cambodia' ' Thailand'
 ' Ecuador' ' Laos' ' Taiwan' ' Haiti' ' Portugal' ' Dominican-Republic'
 ' El-Salvador' ' France' ' Guatemala' ' China' ' Japan' ' Yugoslavia'
 ' Peru' ' Outlying-US(Guam-USVI-etc)' ' Scotland' ' Trinadad&Tobago'
 ' Greece' ' Nicaragua' ' Vietnam' ' Hong' ' Ireland' ' Hungary'
 ' Holand-Netherlands']
Unique values of columns after removal: 
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' Self-emp-inc' ' Without-pay' ' Never-worked']
[' United-States' ' Cuba' ' Jamaica' ' India' ' Mexico' ' Puerto-Rico'
 ' Honduras' ' England' ' Canada' ' Germany' ' Iran' ' Philippines'
 ' Poland' ' Colu

We will only use the categorical features of this dataset. Remove continuous columns:

In [168]:
data_cat = data_cat.drop('age', axis=1).drop('fnlwgt', axis=1).drop('education_num', axis=1).drop('capital_gain', axis=1)\
    .drop('capital_loss', axis=1).drop('hours_per_week', axis=1)

data_cat.head(3)

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,label
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K


Drop duplicates and create version with only 100 samples.

In [169]:
data_cat.drop_duplicates()

# This needs to be done before the transformations to label encoding. This smaller dataset will contain fewer categories
# Otherwise, DiCE will later throw an error if random samples with categories are created, that do not exist in this dataset
data_cat_100 = data_cat.sample(n = 100, random_state=0)

Transform workclass, education, marital_status, occupation, relationship, race, sex and native_country into label encoded features:

In [170]:
def transform_dataset(dataset):

    dataset['workclass_encoded'] = LabelEncoder().fit_transform(dataset['workclass'])
    dataset['education_encoded'] = LabelEncoder().fit_transform(dataset['education'])
    dataset['marital_status_encoded'] = LabelEncoder().fit_transform(dataset['marital_status'])
    dataset['occupation_encoded'] = LabelEncoder().fit_transform(dataset['occupation'])
    dataset['relationship_encoded'] = LabelEncoder().fit_transform(dataset['relationship'])
    dataset['race_encoded'] = LabelEncoder().fit_transform(dataset['race'])
    dataset['native_country_encoded'] = LabelEncoder().fit_transform(dataset['native_country'])

    dataset = dataset.drop('workclass', axis=1).drop('education', axis=1).drop('marital_status', axis=1)\
        .drop('occupation', axis=1).drop('relationship', axis=1).drop('race', axis=1).drop('native_country', axis=1)
    
    return dataset

data_cat = transform_dataset(data_cat)
data_cat_100 = transform_dataset(data_cat_100)
    
data_cat.head(3)

Unnamed: 0,sex,label,workclass_encoded,education_encoded,marital_status_encoded,occupation_encoded,relationship_encoded,race_encoded,native_country_encoded
0,Male,<=50K,6,9,4,1,1,4,38
1,Male,<=50K,5,9,2,4,0,4,38
2,Male,<=50K,3,11,0,6,1,4,38


Transform label and sex into binary encoding:

In [171]:
data_cat['female'] = data_cat['sex'].map( {' Male': 0, ' Female': 1} )
data_cat['income'] = data_cat['label'].map( {' <=50K': 0, ' >50K': 1} )

data_cat = data_cat.drop('sex', axis=1).drop('label', axis=1)

data_cat_100['female'] = data_cat_100['sex'].map( {' Male': 0, ' Female': 1} )
data_cat_100['income'] = data_cat_100['label'].map( {' <=50K': 0, ' >50K': 1} )

data_cat_100 = data_cat_100.drop('sex', axis=1).drop('label', axis=1)

data_cat_100 = data_cat_100.reset_index(drop=True)

data_cat.head(3)

Unnamed: 0,workclass_encoded,education_encoded,marital_status_encoded,occupation_encoded,relationship_encoded,race_encoded,native_country_encoded,female,income
0,6,9,4,1,1,4,38,0,0
1,5,9,2,4,0,4,38,0,0
2,3,11,0,6,1,4,38,0,0


Begin with the experiments:

In [172]:
continuous_features_cat = []

outcome_name_cat = 'income'

In [173]:
logging.info("features: categorical, model: decision tree.")

start_time = time.time()

accuracy, sensitivity, specificity = experiment(data_cat, repetitions=100, continuous_features=continuous_features_cat, outcome_name=outcome_name_cat,\
                      clf=DecisionTreeClassifier(random_state=0), random_state=0)

logging.info(f'accuracy: {accuracy}, sensitivity: {sensitivity}, specificity: {specificity}')
results.loc[len(results.index)] = ['categorical', 'decision tree', accuracy, sensitivity, specificity]

print("--- %s seconds ---" % (time.time() - start_time))

100%|██████████| 1/1 [00:00<00:00,  6.96it/s]
100%|██████████| 1/1 [00:00<00:00,  5.09it/s]
100%|██████████| 1/1 [00:00<00:00,  4.46it/s]
100%|██████████| 1/1 [00:00<00:00,  8.08it/s]
100%|██████████| 1/1 [00:00<00:00,  7.19it/s]
100%|██████████| 1/1 [00:00<00:00,  5.21it/s]
100%|██████████| 1/1 [00:00<00:00,  4.60it/s]
100%|██████████| 1/1 [00:00<00:00,  8.28it/s]
100%|██████████| 1/1 [00:00<00:00,  7.09it/s]
100%|██████████| 1/1 [00:00<00:00,  5.03it/s]
100%|██████████| 1/1 [00:00<00:00,  4.59it/s]
100%|██████████| 1/1 [00:00<00:00,  8.44it/s]
100%|██████████| 1/1 [00:00<00:00,  6.97it/s]
100%|██████████| 1/1 [00:00<00:00,  5.08it/s]
100%|██████████| 1/1 [00:00<00:00,  7.07it/s]
100%|██████████| 1/1 [00:00<00:00,  5.08it/s]
100%|██████████| 1/1 [00:00<00:00,  7.16it/s]
100%|██████████| 1/1 [00:00<00:00,  5.08it/s]
100%|██████████| 1/1 [00:00<00:00,  7.14it/s]
100%|██████████| 1/1 [00:00<00:00,  5.13it/s]
100%|██████████| 1/1 [00:00<00:00,  7.03it/s]
100%|██████████| 1/1 [00:00<00:00,

Membership Inference Accuracy: 0.51, Sensitivity: 0.02, Specificity: 1.0
--- 36.802029609680176 seconds ---





In [174]:
logging.info("features: categorical, model: random forest.")

start_time = time.time()

accuracy, sensitivity, specificity = experiment(data_cat, repetitions=100, continuous_features=continuous_features_cat, outcome_name=outcome_name_cat,\
                      clf=es.RandomForestClassifier(random_state=0), random_state=0)

logging.info(f'accuracy: {accuracy}, sensitivity: {sensitivity}, specificity: {specificity}')
results.loc[len(results.index)] = ['categorical', 'random forest', accuracy, sensitivity, specificity]

print("--- %s seconds ---" % (time.time() - start_time))

100%|██████████| 1/1 [00:00<00:00,  2.60it/s]
100%|██████████| 1/1 [00:00<00:00,  2.32it/s]
100%|██████████| 1/1 [00:00<00:00,  2.19it/s]
100%|██████████| 1/1 [00:00<00:00,  2.80it/s]
100%|██████████| 1/1 [00:00<00:00,  2.62it/s]
100%|██████████| 1/1 [00:00<00:00,  2.31it/s]
100%|██████████| 1/1 [00:00<00:00,  2.21it/s]
100%|██████████| 1/1 [00:00<00:00,  2.79it/s]
100%|██████████| 1/1 [00:00<00:00,  2.63it/s]
100%|██████████| 1/1 [00:00<00:00,  2.32it/s]
100%|██████████| 1/1 [00:00<00:00,  2.20it/s]
100%|██████████| 1/1 [00:00<00:00,  2.77it/s]
100%|██████████| 1/1 [00:00<00:00,  2.63it/s]
100%|██████████| 1/1 [00:00<00:00,  2.29it/s]
100%|██████████| 1/1 [00:00<00:00,  2.61it/s]
100%|██████████| 1/1 [00:00<00:00,  2.31it/s]
100%|██████████| 1/1 [00:00<00:00,  2.61it/s]
100%|██████████| 1/1 [00:00<00:00,  2.30it/s]
100%|██████████| 1/1 [00:00<00:00,  2.63it/s]
100%|██████████| 1/1 [00:00<00:00,  2.32it/s]
100%|██████████| 1/1 [00:00<00:00,  2.62it/s]
100%|██████████| 1/1 [00:00<00:00,

Membership Inference Accuracy: 0.5, Sensitivity: 0.0, Specificity: 1.0
--- 95.0367271900177 seconds ---





# Results

The results of all variations of the membership inference experiment with counterfactuals. In each experiment, half the samples were picked randomly from the training data, while the other half were picked randomly from the control data not used for training. Both datasets originate from the same source dataset.

Accuracy is the percentage of samples whose membership (true or false) was correctly inferred. An algorithm guessing at random would achieve an accuracy of 50 percent.

Sensitivity is the percentage of training samples whose membership (true) was correctly inferred.

Specificity is the percentage of control samples (not used for training) whose membership (false) was correctly inferred.

In [175]:
results

Unnamed: 0,dataset,model,accuracy,sensitivity,specificity
0,continuous,decision tree,0.62,0.24,1.0
1,continuous,random forest,0.62,0.24,1.0
2,categorical,decision tree,0.51,0.02,1.0
3,categorical,random forest,0.5,0.0,1.0


In [11]:
from torch.utils.data import Dataset

class PandasDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features.to_numpy().astype(np.float32)
        self.targets = targets.to_numpy().astype(np.int_)
        
        # for normaliziation of features:
        self.max = self.features.max(axis=0)
        self.min = self.features.min(axis=0)

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, idx):
        return [np.divide(self.features[idx] - self.min, self.max - self.min), int(self.targets[idx])]
    
features = data_num.drop('heart_disease_label', axis=1)
labels = data_num['heart_disease_label']
    
train_dataset = PandasDataset(features, labels)
train_loader = data_utils.DataLoader(dataset = train_dataset, batch_size = 64, shuffle = True)
test_loader = train_loader

We now generate five counterfactuals for the first sample from the training data to demonstrate counterfactual explanations in general.

In [69]:
device = 'cpu'

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(8, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 2),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
    
model = NeuralNetwork().to(device)

In [70]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 10 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
            
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_loader, model, loss_fn, optimizer)
    test_loop(test_loader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.740018  [    0/ 3800]
loss: 0.736335  [  640/ 3800]
loss: 0.738221  [ 1280/ 3800]
loss: 0.732312  [ 1920/ 3800]
loss: 0.742196  [ 2560/ 3800]
loss: 0.728314  [ 3200/ 3800]
Test Error: 
 Accuracy: 15.3%, Avg loss: 0.730158 

Epoch 2
-------------------------------
loss: 0.726276  [    0/ 3800]
loss: 0.730729  [  640/ 3800]
loss: 0.730464  [ 1280/ 3800]
loss: 0.727209  [ 1920/ 3800]
loss: 0.716302  [ 2560/ 3800]
loss: 0.717541  [ 3200/ 3800]
Test Error: 
 Accuracy: 15.3%, Avg loss: 0.722390 

Epoch 3
-------------------------------
loss: 0.719298  [    0/ 3800]
loss: 0.713963  [  640/ 3800]
loss: 0.715096  [ 1280/ 3800]
loss: 0.715887  [ 1920/ 3800]
loss: 0.714086  [ 2560/ 3800]
loss: 0.711129  [ 3200/ 3800]
Test Error: 
 Accuracy: 15.3%, Avg loss: 0.714681 

Epoch 4
-------------------------------
loss: 0.711479  [    0/ 3800]
loss: 0.713766  [  640/ 3800]
loss: 0.715216  [ 1280/ 3800]
loss: 0.710189  [ 1920/ 3800]
loss: 0.702829  [ 2560/ 

In [84]:
ptif.init_logging()
config = ptif.get_default_config()

# calculate on CPU
config['gpu'] = -1

# calculate 100 images (influence package seems to believe this has 10 classes)
config['test_sample_num'] = 10

influences = ptif.calc_img_wise(config, model, train_loader, test_loader)

2022-01-20 07:51:34,231: Running on: 10 images per class.
2022-01-20 07:51:34,232: Starting at img number: 0 per class.




2022-01-20 08:04:50,470: The results for this run are:
2022-01-20 08:04:50,471: Influences: 
2022-01-20 08:04:50,473: [tensor(-0.0001), tensor(-0.0001), tensor(-0.0001)]
2022-01-20 08:04:50,479: Most harmful img IDs: 
2022-01-20 08:04:50,482: [1932, 3552, 1461]
2022-01-20 08:04:50,483: Most helpful img IDs: 
2022-01-20 08:04:50,485: [3441, 752, 1832]


In [86]:
is_influential = 0
is_most_influential = 0

for i in range(100):
    harmful_ids = influences[str(i)]['harmful']
    helpful_ids = influences[str(i)]['helpful']
    
    index_of_equal_ids_harmful = [j for j in range(len(harmful_ids)) if harmful_ids[j] == i]
    index_of_equal_ids_helpful = [j for j in range(len(helpful_ids)) if helpful_ids[j] == i]
    
    print(f'Index of sample {i} in helpful samples: {index_of_equal_ids_helpful}')
    print(f'Index of sample {i} in harmful samples: {index_of_equal_ids_harmful}')
    
    if len(index_of_equal_ids_harmful) > 0 or len(index_of_equal_ids_helpful) > 0:
        is_influential += 1
    
    if 0 in index_of_equal_ids_harmful or 0 in index_of_equal_ids_helpful:
        is_most_influential += 1

print(f'In Top 500: {is_influential}')
print(f'In Top 1: {is_most_influential}')



Index of sample 0 in helpful samples: []
Index of sample 0 in harmful samples: []
Index of sample 1 in helpful samples: []
Index of sample 1 in harmful samples: []
Index of sample 2 in helpful samples: []
Index of sample 2 in harmful samples: []
Index of sample 3 in helpful samples: []
Index of sample 3 in harmful samples: [14]
Index of sample 4 in helpful samples: []
Index of sample 4 in harmful samples: [408]
Index of sample 5 in helpful samples: []
Index of sample 5 in harmful samples: [159]
Index of sample 6 in helpful samples: []
Index of sample 6 in harmful samples: [24]
Index of sample 7 in helpful samples: []
Index of sample 7 in harmful samples: [283]
Index of sample 8 in helpful samples: []
Index of sample 8 in harmful samples: []
Index of sample 9 in helpful samples: []
Index of sample 9 in harmful samples: []
Index of sample 10 in helpful samples: []
Index of sample 10 in harmful samples: []
Index of sample 11 in helpful samples: []
Index of sample 11 in harmful samples: []

In [3]:

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Source: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
# Last access: 2019-11-20


def load_data():
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True, transform=transform)
    
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                              shuffle=True, num_workers=2)

    testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                           download=True, transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=4,
                                             shuffle=False, num_workers=2)

    return trainloader, testloader


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


def train(trainloader, testloader, net):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    for epoch in range(10):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            #inputs, labels = data
            #inputs, labels = data[0].cuda(), data[1].cuda()
            inputs, labels = data[0], data[1]

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0

    print('Finished Training')


def save_model(net):
    PATH = './cifar_net.pth'
    torch.save(net.state_dict(), PATH)


def load_model():
    PATH = './cifar_net.pth'
    net = Net()
    net.load_state_dict(torch.load(PATH))
    #net.cuda()
    return net


def test(testloader, net):
    correct = 0
    total = 0
    class_correct = list(0. for i in range(10))
    class_total = list(0. for i in range(10))
    with torch.no_grad():
        for data in testloader:
            #images, labels = data
            #images, labels = data[0].cuda(), data[1].cuda()
            images, labels = data[0], data[1]
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            _, pred = torch.max(outputs, 1)
            c = (pred == labels).squeeze()
            for i in range(4):
                label = labels[i]
                class_correct[label] += c[i].item()
                class_total[label] += 1

    print('Accuracy of the network on the 10000 test images: %d %%' % (
        100 * correct / total))
    classes = ('plane', 'car', 'bird', 'cat',
               'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
    for i in range(10):
        print('Accuracy of %5s : %2d %%' % (
            classes[i], 100 * class_correct[i] / class_total[i]))
        

In [None]:
trainloader, testloader = load_data()
model = Net()
#model.cuda()
train(trainloader, testloader, model)
test(testloader, model)
save_model(model)

In [4]:
import pytorch_influence_functions as ptif

config = ptif.get_default_config()
config['gpu'] = -1
model = load_model()
trainloader, testloader = load_data()
ptif.init_logging('logfile.log')
influences, harmful, helpful = ptif.calc_img_wise(config, model, trainloader, testloader)

Files already downloaded and verified
Files already downloaded and verified
2022-01-19 07:03:28,735: Running on: 1 images per class.
2022-01-19 07:03:28,736: Starting at img number: 0 per class.


  y = torch.nn.functional.log_softmax(y)




KeyboardInterrupt: 

In [5]:
config = ptif.get_default_config()
config['gpu'] = -1
model = load_model()
trainloader, testloader = load_data()
ptif.init_logging('logfile.log')

# check influences for training data
ptif.calc_img_wise(config, model, trainloader, trainloader)

Files already downloaded and verified
Files already downloaded and verified
2022-01-19 07:11:13,610: Running on: 1 images per class.
2022-01-19 07:11:13,612: Starting at img number: 0 per class.
2022-01-19 08:00:06,883: The results for this run are:
2022-01-19 08:00:06,886: Influences: 
2022-01-19 08:00:06,887: [tensor(-0.0004), tensor(1.9828e-05), tensor(-0.0004)]
2022-01-19 08:00:06,904: Most harmful img IDs: 
2022-01-19 08:00:06,905: [9, 20378, 35904]
2022-01-19 08:00:06,906: Most helpful img IDs: 
2022-01-19 08:00:06,908: [8364, 49572, 21750]


{'0': {'label': 6,
  'num_in_dataset': 0,
  'time_calc_influence_s': 284.0752925872803,
  'influence': [-0.046648312360048294,
   -0.0011204129550606012,
   -0.00028360699070617557,
   0.0004783914191648364,
   -7.644175639143214e-05,
   0.0004961627419106662,
   -0.00042483280412852764,
   -1.2061991683243178e-10,
   -0.00019218747911509126,
   -0.0009243402164429426,
   -0.0007151507306843996,
   -0.00039599076262675226,
   -0.0021720260847359896,
   0.003783398075029254,
   -0.00023693832918070257,
   2.8773845770047046e-05,
   -0.0006794303772039711,
   0.0005989709752611816,
   0.0022702955175191164,
   -0.0015205983072519302,
   0.0005958551191724837,
   -3.795156953856349e-05,
   -0.0008110064663924277,
   -0.004994499962776899,
   8.488010644214228e-05,
   -0.0038057882338762283,
   0.0002694202121347189,
   -0.0012982647167518735,
   -0.0038283327594399452,
   0.0002936441160272807,
   -6.587471261809696e-07,
   -0.0009052716195583344,
   -1.0484135337662792e-08,
   -0.0003689

In [6]:
type(helpful)

NameError: name 'helpful' is not defined