# Influential Instances Membership Inference Experiment

In [1]:
import pandas as pd
import numpy as np
import logging
import time
import torch
from torch import nn
from torch.utils.data import Dataset
import torch.utils.data as data_utils
import pytorch_influence_functions as ptif

In [2]:
%run experiment_setup.ipynb

This notebook will test whether membership inference is possible with influential instances. Membership inference means an attacker with access to the explanation can determine for any sample whether it was included in the training data or not.

First we define a few helper classes and functions for the experiment. This includes a wrapper for the explainer "pytorch influence functions" that only prints out the single most helpful and single most harmful training sample for a test sample. The training sample IDs provided by the explainer are converted by the wrapper to the corresponding training samples. This is equivalent to an example explanation on the explainers github page where the most influential training pictures were shown as explanation: https://github.com/nimarb/pytorch_influence_functions#output-variables

A neural network has to be used for this experiment because the explainer package needs gradient information (TODO: confirm this).

In [3]:
class PandasDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features.to_numpy().astype(np.float32)
        self.targets = targets.to_numpy().astype(np.int_)
        
        # for normaliziation of features:
        self.max = self.features.max(axis=0)
        self.min = self.features.min(axis=0)
        
    def __len__(self):
        return self.features.shape[0]

    def __getminmax__(self):
        return self.min, self.max
        
    def setminmax(self, min, max):
        self.min = min
        self.max = max
        
    def __getitem__(self, idx):
        return [np.divide(self.features[idx] - self.min, self.max - self.min), int(self.targets[idx])]

class NeuralNetwork(nn.Module):
    def __init__(self, num_features):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(num_features, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 2),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
    
class InfluentialInstances():
    def __init__(self, model, train_loader, train_df, outcome_name):
        self.model = model
        self.train_loader = train_loader
        self.train_df = train_df
        
        features = train_df.drop(outcome_name, axis=1).to_numpy(np.float32)
        self.max = features.max(axis=0)
        self.min = features.min(axis=0)
        
        ptif.init_logging()
        
        self.config = ptif.get_default_config()
        # calculate on CPU
        self.config['gpu'] = -1
        # calculate for one test sample:
        self.config['test_sample_num'] = 1
        self.config['num_classes'] = 1
        
        self.outcome_name = outcome_name
        
    def explain(self, sample):
        test_data = PandasDataset(sample.drop(self.outcome_name, axis=1), sample[self.outcome_name])
        test_data.setminmax(self.min, self.max)
        test_loader = data_utils.DataLoader(dataset = test_data, batch_size = 1, shuffle = True)

        influences = ptif.calc_img_wise(self.config, self.model, self.train_loader, test_loader)
                                  
        harmful_id = influences['0']['harmful'][0]
        helpful_id = influences['0']['helpful'][0]
                                  
        return self.train_df.iloc[harmful_id], self.train_df.iloc[helpful_id]
    
# function adapted from the following tutorial: https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 10 == 0:
            loss, current = loss.item(), batch * len(X)
            logging.debug(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def ii_membership_inference(sample, explainer):
    harmful_instance, helpful_instance = explainer.explain(sample)

    harmful_equal = np.isclose(sample.to_numpy().astype(float), harmful_instance.to_numpy().astype(float)).all()
    helpful_equal = np.isclose(sample.to_numpy().astype(float), helpful_instance.to_numpy().astype(float)).all()

    inferred_membership = harmful_equal or helpful_equal

    logging.debug(f'Testing instance: {sample.to_numpy()}')
    logging.debug(f'Harmful instance: {harmful_instance.to_numpy()}')
    logging.debug(f'Helpful instance: {helpful_instance.to_numpy()}')
    logging.debug(f'Inferred membership as {inferred_membership}')

    return inferred_membership

This function will run the experiment for the different variations. The attacker enters the test sample into the explainer and checks whether it matches the returned helpful sample or the returned harmful sample. If either one matches, the attacker can deduct that the test sample is part of the training data.

In [4]:
def experiment(data, outcome_name, repetitions, model, random_state=0):
    # create random state from seed. This will be used to draw the test samples for the experiment.
    rs = np.random.RandomState(seed=random_state)
    
    # split data into two halves. One is used for training, the other as control data that is not part of the training data.
    # this control data will be needed as test samples that do not belong to the training data.
    idx_mid = int(data.shape[0] / 2)
    
    data_ctrl = data.iloc[:idx_mid, :]

    data_train = data.iloc[idx_mid:, :]
    
    train_dataset = PandasDataset(data_train.drop(outcome_name, axis=1), data_train[outcome_name])
    train_loader = data_utils.DataLoader(dataset = train_dataset, batch_size = 64, shuffle = True)
    
    # train model on training data
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    epochs = 10
    for t in range(epochs):
        logging.debug(f"Epoch {t+1}\n-------------------------------")
        train_loop(train_loader, model, loss_fn, optimizer)
    logging.debug("Done!")
    
    # init explainer (new class)
    explainer = InfluentialInstances(model, train_loader, data_train, outcome_name)
    
    # boolean numpy arrays for actual and inferred membership of the test samples
    sample_membership = np.empty(repetitions)
    inferred_membership = np.empty(repetitions)
    
    for i in range(repetitions):
        if i % 2 == 0:
            # choose sample from training data.
            sample = data_train.sample(random_state=rs)
            sample_membership[i] = True
            logging.debug('%s taken from training data' % sample.to_numpy())
        else:
            # choose sample from control data.
            sample = data_ctrl.sample(random_state=rs)
            sample_membership[i] = False
            logging.debug('%s taken from control data' % sample.to_numpy())
        
        inferred_membership[i] = ii_membership_inference(sample, explainer)
    
    # calculate accuracy, sensitivity and specificity
    samples_in_training_data = np.count_nonzero(sample_membership)
    samples_not_in_training_data = repetitions - samples_in_training_data
        
    pred_positives = np.count_nonzero(inferred_membership)
        
    correct_predictions = np.count_nonzero(np.equal(inferred_membership, sample_membership))
    true_positives = np.count_nonzero(inferred_membership[sample_membership == True])
                
    accuracy = correct_predictions / repetitions
    
    if pred_positives > 0:
        precision = true_positives / pred_positives
    else:
        precision = None
        
    recall = true_positives / samples_in_training_data
        
    print('Membership Inference Accuracy: %s, Precision: %s, Recall: %s'\
          % (accuracy, precision, recall))
    
    return accuracy, precision, recall

# Dataset 1: Heart Disease

We will now do a small proof of concept of the experiment with logging enabled to demonstrate how it works.

In [5]:
logging.root.setLevel(logging.DEBUG)

experiment(data_num, repetitions=10, outcome_name=outcome_name_num, model=NeuralNetwork(num_features=8).to('cpu'))

logging.root.setLevel(logging.WARNING)

DEBUG:root:Epoch 1
-------------------------------
DEBUG:root:loss: 0.659487  [    0/ 1900]
DEBUG:root:loss: 0.652118  [  640/ 1900]
DEBUG:root:loss: 0.656090  [ 1280/ 1900]
DEBUG:root:Epoch 2
-------------------------------
DEBUG:root:loss: 0.657618  [    0/ 1900]
DEBUG:root:loss: 0.657528  [  640/ 1900]
DEBUG:root:loss: 0.648832  [ 1280/ 1900]
DEBUG:root:Epoch 3
-------------------------------
DEBUG:root:loss: 0.650082  [    0/ 1900]
DEBUG:root:loss: 0.659148  [  640/ 1900]
DEBUG:root:loss: 0.644900  [ 1280/ 1900]
DEBUG:root:Epoch 4
-------------------------------
DEBUG:root:loss: 0.649769  [    0/ 1900]
DEBUG:root:loss: 0.653384  [  640/ 1900]
DEBUG:root:loss: 0.649674  [ 1280/ 1900]
DEBUG:root:Epoch 5
-------------------------------
DEBUG:root:loss: 0.642846  [    0/ 1900]
DEBUG:root:loss: 0.643934  [  640/ 1900]
DEBUG:root:loss: 0.651462  [ 1280/ 1900]
DEBUG:root:Epoch 6
-------------------------------
DEBUG:root:loss: 0.645377  [    0/ 1900]
DEBUG:root:loss: 0.668427  [  640/ 190



INFO:root:The results for this run are:
INFO:root:Influences: 
INFO:root:[tensor(-0.0002), tensor(-0.0002), tensor(-0.0002)]
INFO:root:Most harmful img IDs: 
INFO:root:[1893, 1756, 1503]
INFO:root:Most helpful img IDs: 
INFO:root:[1541, 102, 1007]
DEBUG:root:Testing instance: [[ 52.     0.   216.   125.    72.    24.98  75.    95.     0.  ]]
DEBUG:root:Harmful instance: [ 50.     0.   260.   190.   130.    43.67  85.   260.     0.  ]
DEBUG:root:Helpful instance: [ 62.    20.   358.   215.   110.    37.62 110.   368.     1.  ]
DEBUG:root:Inferred membership as False
DEBUG:root:[[ 45.    0.  258.  114.   80.   26.6  80.   68.    0. ]] taken from control data
INFO:root:Running on: 1 images per class.
INFO:root:Starting at img number: 0 per class.




INFO:root:The results for this run are:
INFO:root:Influences: 
INFO:root:[tensor(-0.0002), tensor(-0.0002), tensor(-0.0002)]
INFO:root:Most harmful img IDs: 
INFO:root:[918, 1207, 1613]
INFO:root:Most helpful img IDs: 
INFO:root:[1541, 724, 481]
DEBUG:root:Testing instance: [[ 45.    0.  258.  114.   80.   26.6  80.   68.    0. ]]
DEBUG:root:Harmful instance: [ 51.     9.   696.   157.    87.    24.44  95.    84.     0.  ]
DEBUG:root:Helpful instance: [ 62.    20.   358.   215.   110.    37.62 110.   368.     1.  ]
DEBUG:root:Inferred membership as False
DEBUG:root:[[ 49.    20.   291.   160.    99.    29.91  85.    88.     0.  ]] taken from training data
INFO:root:Running on: 1 images per class.
INFO:root:Starting at img number: 0 per class.




INFO:root:The results for this run are:
INFO:root:Influences: 
INFO:root:[tensor(-0.0002), tensor(-0.0002), tensor(-0.0002)]
INFO:root:Most harmful img IDs: 
INFO:root:[918, 790, 995]
INFO:root:Most helpful img IDs: 
INFO:root:[724, 1541, 481]
DEBUG:root:Testing instance: [[ 49.    20.   291.   160.    99.    29.91  85.    88.     0.  ]]
DEBUG:root:Harmful instance: [ 51.     9.   696.   157.    87.    24.44  95.    84.     0.  ]
DEBUG:root:Helpful instance: [ 57.    43.   283.   207.5  118.    38.61 100.    83.     1.  ]
DEBUG:root:Inferred membership as False
DEBUG:root:[[ 50.    20.   235.   121.    78.    23.01  52.    78.     0.  ]] taken from control data
INFO:root:Running on: 1 images per class.
INFO:root:Starting at img number: 0 per class.




INFO:root:The results for this run are:
INFO:root:Influences: 
INFO:root:[tensor(-0.0002), tensor(-0.0002), tensor(-0.0002)]
INFO:root:Most harmful img IDs: 
INFO:root:[254, 1604, 1503]
INFO:root:Most helpful img IDs: 
INFO:root:[1541, 102, 130]
DEBUG:root:Testing instance: [[ 50.    20.   235.   121.    78.    23.01  52.    78.     0.  ]]
DEBUG:root:Harmful instance: [ 56.    30.   303.   136.5   97.    26.64  93.   106.     0.  ]
DEBUG:root:Helpful instance: [ 62.    20.   358.   215.   110.    37.62 110.   368.     1.  ]
DEBUG:root:Inferred membership as False
DEBUG:root:[[ 41.   40.  242.  124.5  86.5  28.8  87.   67.    0. ]] taken from training data
INFO:root:Running on: 1 images per class.
INFO:root:Starting at img number: 0 per class.




INFO:root:The results for this run are:
INFO:root:Influences: 
INFO:root:[tensor(-0.0002), tensor(-0.0002), tensor(-0.0002)]
INFO:root:Most harmful img IDs: 
INFO:root:[790, 902, 528]
INFO:root:Most helpful img IDs: 
INFO:root:[724, 1541, 921]
DEBUG:root:Testing instance: [[ 41.   40.  242.  124.5  86.5  28.8  87.   67.    0. ]]
DEBUG:root:Harmful instance: [ 40.    70.   210.   132.    86.    31.57  98.    80.     0.  ]
DEBUG:root:Helpful instance: [ 57.    43.   283.   207.5  118.    38.61 100.    83.     1.  ]
DEBUG:root:Inferred membership as False
DEBUG:root:[[ 53.    10.   261.   136.    99.    21.02  85.    94.     0.  ]] taken from control data
INFO:root:Running on: 1 images per class.
INFO:root:Starting at img number: 0 per class.




INFO:root:The results for this run are:
INFO:root:Influences: 
INFO:root:[tensor(-0.0002), tensor(-0.0002), tensor(-0.0002)]
INFO:root:Most harmful img IDs: 
INFO:root:[254, 1604, 245]
INFO:root:Most helpful img IDs: 
INFO:root:[1541, 102, 130]
DEBUG:root:Testing instance: [[ 53.    10.   261.   136.    99.    21.02  85.    94.     0.  ]]
DEBUG:root:Harmful instance: [ 56.    30.   303.   136.5   97.    26.64  93.   106.     0.  ]
DEBUG:root:Helpful instance: [ 62.    20.   358.   215.   110.    37.62 110.   368.     1.  ]
DEBUG:root:Inferred membership as False
DEBUG:root:[[ 46.     0.   213.   136.    77.    31.02  75.    73.     0.  ]] taken from training data
INFO:root:Running on: 1 images per class.
INFO:root:Starting at img number: 0 per class.




INFO:root:The results for this run are:
INFO:root:Influences: 
INFO:root:[tensor(-0.0002), tensor(-0.0002), tensor(-0.0002)]
INFO:root:Most harmful img IDs: 
INFO:root:[918, 32, 1613]
INFO:root:Most helpful img IDs: 
INFO:root:[724, 1541, 481]
DEBUG:root:Testing instance: [[ 46.     0.   213.   136.    77.    31.02  75.    73.     0.  ]]
DEBUG:root:Harmful instance: [ 51.     9.   696.   157.    87.    24.44  95.    84.     0.  ]
DEBUG:root:Helpful instance: [ 57.    43.   283.   207.5  118.    38.61 100.    83.     1.  ]
DEBUG:root:Inferred membership as False
DEBUG:root:[[ 58.     0.   210.   102.    60.    26.98  71.    90.     0.  ]] taken from control data
INFO:root:Running on: 1 images per class.
INFO:root:Starting at img number: 0 per class.




INFO:root:The results for this run are:
INFO:root:Influences: 
INFO:root:[tensor(-0.0002), tensor(-0.0002), tensor(-0.0002)]
INFO:root:Most harmful img IDs: 
INFO:root:[1893, 1503, 1756]
INFO:root:Most helpful img IDs: 
INFO:root:[1541, 1007, 102]
DEBUG:root:Testing instance: [[ 58.     0.   210.   102.    60.    26.98  71.    90.     0.  ]]
DEBUG:root:Harmful instance: [ 50.     0.   260.   190.   130.    43.67  85.   260.     0.  ]
DEBUG:root:Helpful instance: [ 62.    20.   358.   215.   110.    37.62 110.   368.     1.  ]
DEBUG:root:Inferred membership as False
DEBUG:root:[[ 54.     0.   265.   121.    82.    23.52  60.    67.     0.  ]] taken from training data
INFO:root:Running on: 1 images per class.
INFO:root:Starting at img number: 0 per class.




INFO:root:The results for this run are:
INFO:root:Influences: 
INFO:root:[tensor(-0.0002), tensor(-0.0002), tensor(-0.0002)]
INFO:root:Most harmful img IDs: 
INFO:root:[1893, 1604, 1503]
INFO:root:Most helpful img IDs: 
INFO:root:[1541, 102, 1007]
DEBUG:root:Testing instance: [[ 54.     0.   265.   121.    82.    23.52  60.    67.     0.  ]]
DEBUG:root:Harmful instance: [ 50.     0.   260.   190.   130.    43.67  85.   260.     0.  ]
DEBUG:root:Helpful instance: [ 62.    20.   358.   215.   110.    37.62 110.   368.     1.  ]
DEBUG:root:Inferred membership as False
DEBUG:root:[[ 41.    5.  218.  129.5  93.   27.8  58.   83.    0. ]] taken from control data
INFO:root:Running on: 1 images per class.
INFO:root:Starting at img number: 0 per class.




INFO:root:The results for this run are:
INFO:root:Influences: 
INFO:root:[tensor(-0.0002), tensor(-0.0002), tensor(-0.0002)]
INFO:root:Most harmful img IDs: 
INFO:root:[918, 790, 1207]
INFO:root:Most helpful img IDs: 
INFO:root:[724, 1541, 481]
DEBUG:root:Testing instance: [[ 41.    5.  218.  129.5  93.   27.8  58.   83.    0. ]]
DEBUG:root:Harmful instance: [ 51.     9.   696.   157.    87.    24.44  95.    84.     0.  ]
DEBUG:root:Helpful instance: [ 57.    43.   283.   207.5  118.    38.61 100.    83.     1.  ]
DEBUG:root:Inferred membership as False


Membership Inference Accuracy: 0.5, Precision: None, Recall: 0.0


In [6]:
results_ = {'dataset': [], 'model': [], 'accuracy': [], 'precision': [], 'recall': []}

results = pd.DataFrame(data = results_)

We can now begin with the actual experiments. For this experiment, we can not use a decision tree or random forest as the model since influence functions require access to a gradient. For this reason, a neural network is chosen.

In [7]:
logging.info("features: continuous")

start_time = time.time()

accuracy, precision, recall = experiment(data_num, repetitions=100, outcome_name=outcome_name_num, \
                                         model=NeuralNetwork(num_features=8).to('cpu'))

results.loc[len(results.index)] = ['continuous', 'neural network', accuracy, precision, recall]

print("--- %s seconds ---" % (time.time() - start_time))





Membership Inference Accuracy: 0.5, Precision: None, Recall: 0.0
--- 440.83237767219543 seconds ---


# Dataset 2: Census Income (categorical)

Transform workclass, education, marital_status, occupation, relationship, race, sex and native_country into one-hot encoded features:

In [8]:
data_cat = pd.concat([data_cat, pd.get_dummies(data_cat['workclass_encoded'], prefix='workclass_encoded')], axis=1)
data_cat = pd.concat([data_cat, pd.get_dummies(data_cat['education_encoded'], prefix='education_encoded')], axis=1)
data_cat = pd.concat([data_cat, pd.get_dummies(data_cat['marital_status_encoded'], prefix='marital_status_encoded')], axis=1)
data_cat = pd.concat([data_cat, pd.get_dummies(data_cat['occupation_encoded'], prefix='occupation_encoded')], axis=1)
data_cat = pd.concat([data_cat, pd.get_dummies(data_cat['relationship_encoded'], prefix='relationship_encoded')], axis=1)
data_cat = pd.concat([data_cat, pd.get_dummies(data_cat['race_encoded'], prefix='race_encoded')], axis=1)
data_cat = pd.concat([data_cat, pd.get_dummies(data_cat['native_country_encoded'], prefix='native_country_encoded')], axis=1)

data_cat = data_cat.drop('workclass_encoded', axis=1).drop('education_encoded', axis=1).drop('marital_status_encoded', axis=1)\
    .drop('occupation_encoded', axis=1).drop('relationship_encoded', axis=1).drop('race_encoded', axis=1)\
    .drop('native_country_encoded', axis=1)

data_cat.head(3)

Unnamed: 0,female,income,workclass_encoded_0,workclass_encoded_1,workclass_encoded_2,workclass_encoded_3,workclass_encoded_4,workclass_encoded_5,workclass_encoded_6,education_encoded_0,...,native_country_encoded_31,native_country_encoded_32,native_country_encoded_33,native_country_encoded_34,native_country_encoded_35,native_country_encoded_36,native_country_encoded_37,native_country_encoded_38,native_country_encoded_39,native_country_encoded_40
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


Begin with the experiment:

In [9]:
logging.info("features: categorical")

start_time = time.time()

accuracy, precision, recall = experiment(data_cat, repetitions=100, outcome_name=outcome_name_cat, \
                                         model=NeuralNetwork(num_features=97).to('cpu'))

results.loc[len(results.index)] = ['categorical', 'neural network', accuracy, precision, recall]

print("--- %s seconds ---" % (time.time() - start_time))

[KCalc. influence function: [=...........................................] 83 / 4309

  y = torch.nn.functional.log_softmax(y)








Membership Inference Accuracy: 0.97, Precision: 1.0, Recall: 0.94
--- 1041.0219056606293 seconds ---


# Why is a training sample always its own (most) harmful sample?

There may be a bug in the influence package which reverses helpful and harmful images due to a sign error: https://github.com/nimarb/pytorch_influence_functions/issues/11#issuecomment-751780387

# Results

The results of all variations of the membership inference experiment with influential instances. In each experiment, half the samples were picked randomly from the training data, while the other half were picked randomly from the control data not used for training. Both datasets originate from the same source dataset.

Accuracy is the percentage of samples whose membership (true or false) was correctly inferred. An algorithm guessing at random would achieve an accuracy of 50 percent.

Precision is the percentage of identified training samples that actually appear in the training data.

Recall is the percentage of training samples that was correctly identified.

You may want to delete the folder 'outdir' after the experiment. It was created automatically by pytorch influence functions.

In [10]:
results

Unnamed: 0,dataset,model,accuracy,precision,recall
0,continuous,neural network,0.5,,0.0
1,categorical,neural network,0.97,1.0,0.94


In [11]:
results.to_csv('results/1-5-ii-membership-inference-results.csv', index=False, na_rep='NaN')