# Experimente - Datensatz Herzproblem

## Settings und Imports

In [1]:
# suppress warnings
import warnings
warnings.filterwarnings('ignore')

#autoreload other packages when code changed
%load_ext autoreload
%autoreload 2

In [2]:
import torch
torch.manual_seed(42) #Reproduzierbarkeit
from torch import nn
from torch.utils.data import DataLoader

from opacus import PrivacyEngine
from opacus.accountants import RDPAccountant

import pandas as pd

In [3]:
from privacyflow.configs import path_configs
from privacyflow.datasets import heart_dataset
from privacyflow.preprocessing import heart_preprocess
from privacyflow.models import heart_models

In [4]:
#Check if GPU is available
if torch.cuda.is_available():
    print("GPU will be used")
    device = torch.device('cpu')
else:
    print("No GPU available")
    device = torch.device('cpu')

GPU will be used


## Data Prep

Der Heart-Datensatz ist eine einzelne .csv Datei. Diese wird mittels einer Preprocessing-Methode bereinigt und in Train-Val-Test gesplittet.
Diese können anschließend als PyTorch Dataset genutzt werden.

In [5]:
heart_preprocess.preprocess_heart_data()

In [6]:
train_dataset = heart_dataset.HeartDataset(mode="train")
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True
)

val_dataset = heart_dataset.HeartDataset(mode="val")
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=32,
    shuffle=False
)

test_dataset = heart_dataset.HeartDataset(mode="test")
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False
)

## Model - Base

Als Optimizer wird SGD mit Momentum genutzt, da dieser am ähnlichsten zum DPSGD ist.
Andere Optimizer, z.B. Adam, erziehlen in weniger Epochen eine vergleichbare Güte

In [7]:
model_base = heart_models.HeartModelBase(13,2)
model_base = model_base.to(device)

In [8]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_base.parameters(),lr=0.01)

In [9]:
for epoch in range(10):
    model_base.train()
    epoch_loss = 0.0
    for batch in train_dataloader:
        inputs,labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model_base(inputs)
        loss = criterion(outputs,labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    model_base.eval()
    num_correct = 0.0
    for batch in val_dataloader:
        inputs,labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model_base(inputs)
        _, predicted = torch.max(outputs,1)
        num_correct += (predicted == labels).sum()
    print(f"Epoch: {epoch+1:2}, Train Loss:{epoch_loss/len(train_dataloader):.5f}, Val Acc:{num_correct/len(val_dataset):.5f}")


Epoch:  1, Train Loss:0.44177, Val Acc:0.92473
Epoch:  2, Train Loss:0.25355, Val Acc:0.92473
Epoch:  3, Train Loss:0.18998, Val Acc:0.92473
Epoch:  4, Train Loss:0.15231, Val Acc:0.91398
Epoch:  5, Train Loss:0.17687, Val Acc:0.94624
Epoch:  6, Train Loss:0.10023, Val Acc:0.97849
Epoch:  7, Train Loss:0.03949, Val Acc:0.97849
Epoch:  8, Train Loss:0.04541, Val Acc:0.95699
Epoch:  9, Train Loss:0.11407, Val Acc:0.95699
Epoch: 10, Train Loss:0.07807, Val Acc:0.95699


In [10]:
num_correct = 0.0
model_base.eval()
for batch in test_dataloader:
    inputs,labels = batch
    inputs = inputs.to(device)
    labels = labels.to(device)

    outputs = model_base(inputs)
    _, predicted = torch.max(outputs,1)
    num_correct += (predicted == labels).sum()

accuracy_base_model = num_correct/len(test_dataset)
print(accuracy_base_model)

tensor(0.9126)


## Membership Inference Attacke

We need some test datapoints that must either be inside a dataset or not.
For this we get the original train and test data and collect datapoints from there, since we know if they are inside the data or not

In [11]:
train_data = pd.read_csv(path_configs.HEART_DATA_TRAIN)
val_data = pd.read_csv(path_configs.HEART_DATA_VAL)
test_data = pd.read_csv(path_configs.HEART_DATA_TRAIN)

In [12]:
dataset1 = pd.concat([train_data, val_data, test_data])
dataset2 = pd.concat([train_data.tail(-100), test_data])
dataset3 = pd.concat([train_data,val_data])
dataset4 = pd.concat([train_data.tail(-100), val_data])

In [13]:
torch_dataset1 = heart_dataset.HeartDataset(mode="custom", custom_df=dataset1)
dataloader1 = DataLoader(dataset=torch_dataset1,batch_size=32)

torch_dataset2 = heart_dataset.HeartDataset(mode="custom", custom_df=dataset2)
dataloader2 = DataLoader(dataset=torch_dataset2,batch_size=32)

torch_dataset3 = heart_dataset.HeartDataset(mode="custom", custom_df=dataset3)
dataloader3 = DataLoader(dataset=torch_dataset3,batch_size=32)

torch_dataset4 = heart_dataset.HeartDataset(mode="custom", custom_df=dataset4)
dataloader4 = DataLoader(dataset=torch_dataset4,batch_size=32)

Membership Inference Attack needs a bunch of shadow models, which are similar to the original model.
Thus we use the same model arch and also a smaller and bigger version.
Each of the models is trained twice, once with a specific datapoint and once without it.

In [14]:
#Train Shadow Models
shadow_models = []
for ds in [dataloader1,dataloader2,dataloader3,dataloader4]:
    for size in ['base','base','base','base','small','large']:
        if size == "small":
            shadow_model = heart_models.HeartModelSmall(13,2).to(device)
        elif size == "large":
            shadow_model = heart_models.HeartModelLarge(13,2).to(device)
        else:
            shadow_model = heart_models.HeartModelBase(13,2).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(shadow_model.parameters(), lr=0.01)

        for epoch in range(10):
            for batch in ds:
                inputs,labels = batch
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                outputs = shadow_model(inputs)
                loss = criterion(outputs,labels)
                loss.backward()
                optimizer.step()
        shadow_models.append(shadow_model)

In [15]:
#Construct Training Data for the Meta Classifier
#It consits of the input (original data) and the logits of the shadow model and target=was_used_in_training
dfs = []
for i,mod in enumerate(shadow_models):
    for mode in ["train", "val"]:
        label = int(i < 12) if mode =="test" else int(i < 6 or i >= 12) #Label if the data was included or not
        df_inp = test_data.copy() if mode == "test" else val_data.copy()
        ds = heart_dataset.HeartDataset(mode="custom", custom_df=df_inp)
        dl = DataLoader(dataset=ds,batch_size=1, shuffle=False)
        prop1, prop2 = [], []
        for inputs,labels in dl:
            inputs = inputs.to(device)
            preds = torch.softmax(mod(inputs),dim=1)
            prop1.append(float(preds[0][0]))
            prop2.append(float(preds[0][1]))
        df_inp['logit1'] = prop1
        df_inp['logit2'] = prop2
        df_inp['target'] = label
        dfs.append(df_inp)
meta_classifier_data_df = pd.concat(dfs).reset_index()

In [16]:
#Train Meta Classifier
mi_model = heart_models.HeartMIModel(15,2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mi_model.parameters(),lr=0.01)

for epoch in range(15):
    mi_model.train()
    for batch in train_dataloader:
        inputs,labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model_base(inputs)
        loss = criterion(outputs,labels)
        loss.backward()
        optimizer.step()


In [17]:
#Build Dataset for testing MI
df_mi_eval = pd.concat([train_data.head(100),val_data])
df_mi_ds = heart_dataset.HeartDataset(mode="custom", custom_df=df_mi_eval)
df_mi_dl = DataLoader(dataset=df_mi_ds,batch_size=1,shuffle=False)

In [18]:
def test_mi_attack(test_model) -> float:
    #get predictions from original model
    df_mi_model_data = df_mi_eval.copy()
    prop1, prop2 = [], []
    test_model.eval()
    for inputs,labels in df_mi_dl:
        inputs = inputs.to(device)
        preds = torch.softmax(test_model(inputs),dim=1)
        prop1.append(float(preds[0][0]))
        prop2.append(float(preds[0][1]))
    df_mi_model_data['logit1'] = prop1
    df_mi_model_data['logit2'] = prop2
    df_mi_model_data['target'] = [int(num<100) for num in range(len(df_mi_model_data.index))]

    mi_ds = heart_dataset.MembershipInferenceDataset(df_mi_model_data)
    mi_dl = DataLoader(dataset=mi_ds,batch_size=1,shuffle=False)

    #test with mi model
    num_correct_mi = 0.0
    mi_model.eval()
    for inputs,labels in mi_dl:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = mi_model(inputs)
        _, predicted = torch.max(outputs,1)
        num_correct_mi += (predicted == labels).sum()

    accuracy_mi = num_correct_mi/len(df_mi_model_data.index)
    print(f"Accuracy MI:{accuracy_mi}")
    return accuracy_mi

In [19]:
test_mi_attack(model_base)

Accuracy MI:0.5388600826263428


tensor(0.5389)

## Model - DPSGD

In [20]:
model_dpsgd = heart_models.HeartModelBase(13,2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_dpsgd.parameters(),lr=0.01)

In [21]:
privacy_engine = PrivacyEngine(
    secure_mode=False,#Kryptographischer Zufall wird hier nicht benötigt
    accountant="rdp", #Renyi Differential Privacy
)
model_dpsgd, optimizer, train_dataloader = privacy_engine.make_private(
    module=model_dpsgd,
    optimizer=optimizer,
    data_loader=train_dataloader,
    noise_multiplier=2.0, #Wie viel Rauschen wird hinzugefügt - Höher = weniger Rauschen
    max_grad_norm=1.0 #Gradienten größer als dieser Wert werden geclippt
)

epsilon = 3

In [22]:
for epoch in range(10):
    if privacy_engine.accountant.get_epsilon(delta=1e-5) > epsilon:
        break
    model_dpsgd.train()
    epoch_loss = 0.0
    for batch in train_dataloader:
        inputs,labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model_dpsgd(inputs)
        loss = criterion(outputs,labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    model_dpsgd.eval()
    num_correct = 0.0
    for batch in val_dataloader:
        inputs,labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model_dpsgd(inputs)
        _, predicted = torch.max(outputs,1)
        num_correct += (predicted == labels).sum()
    print(f"Epoch: {epoch+1:2}, Train Loss:{epoch_loss/len(train_dataloader):.5f}, Val Acc:{num_correct/len(val_dataset):.5f}")
    print(f"ε:{privacy_engine.accountant.get_epsilon(delta=1e-5)}")


Epoch:  1, Train Loss:0.63053, Val Acc:0.59140
ε:0.5077396867750541
Epoch:  2, Train Loss:0.53597, Val Acc:0.79570
ε:0.6786679248122859
Epoch:  3, Train Loss:0.51576, Val Acc:0.78495
ε:0.8184904120481029
Epoch:  4, Train Loss:0.49286, Val Acc:0.83871
ε:0.9382069497545129
Epoch:  5, Train Loss:0.58939, Val Acc:0.83871
ε:1.0460510394184226
Epoch:  6, Train Loss:0.56701, Val Acc:0.84946
ε:1.1442454385839282
Epoch:  7, Train Loss:0.56337, Val Acc:0.81720
ε:1.2348563129090557
Epoch:  8, Train Loss:0.63033, Val Acc:0.79570
ε:1.320323189685948
Epoch:  9, Train Loss:0.60279, Val Acc:0.81720
ε:1.402160842402668
Epoch: 10, Train Loss:0.67275, Val Acc:0.84946
ε:1.4781191411143204


In [23]:
num_correct = 0.0
model_dpsgd.eval()
for batch in test_dataloader:
    inputs,labels = batch
    inputs = inputs.to(device)
    labels = labels.to(device)

    outputs = model_dpsgd(inputs)
    _, predicted = torch.max(outputs,1)
    num_correct += (predicted == labels).sum()

accuracy_dpsgd = num_correct/len(test_dataset)
print(accuracy_dpsgd)

tensor(0.8155)


In [24]:
test_mi_attack(model_dpsgd)

Accuracy MI:0.5440414547920227


tensor(0.5440)