# Lab 2: Membership Inference Attacks

## Objectives
- Understand membership inference
- Implement shadow model attack
- Test confidence-based attacks
- Evaluate privacy leakage

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt

# Detect device (supports CUDA, Apple Silicon MPS, and CPU)
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(f'Device: {device}')

Device: mps


## Part 1: Train Target Model

In [2]:
# Simple model
class SimpleNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(784, 128),
            nn.ReLU(),
            nn.Linear(128, 10)
        )
    
    def forward(self, x):
        return self.fc(x.view(-1, 784))

# Create synthetic data
X_train = torch.randn(1000, 784)
y_train = torch.randint(0, 10, (1000,))
X_test = torch.randn(200, 784)
y_test = torch.randint(0, 10, (200,))

# Train target model
target_model = SimpleNet().to(device)
optimizer = torch.optim.Adam(target_model.parameters())
criterion = nn.CrossEntropyLoss()

for epoch in range(10):
    optimizer.zero_grad()
    outputs = target_model(X_train.to(device))
    loss = criterion(outputs, y_train.to(device))
    loss.backward()
    optimizer.step()

print('✓ Target model trained')

✓ Target model trained


## Part 2: Membership Inference Attack

In [3]:
def membership_inference(model, X, y):
    """Check if samples were in training set using true label confidence"""
    model.eval()
    with torch.no_grad():
        outputs = model(X.to(device))
        probs = torch.softmax(outputs, dim=1)
        # Use confidence for the true label instead of max confidence
        true_label_confidence = probs[range(len(y)), y].cpu().numpy()
    
    # High confidence on true label suggests membership
    threshold = 0.5
    predictions = (true_label_confidence > threshold)
    return predictions, true_label_confidence

# Test on training data (members)
member_preds, member_conf = membership_inference(target_model, X_train[:100], y_train[:100])

# Test on test data (non-members)
non_member_preds, non_member_conf = membership_inference(target_model, X_test, y_test)

print(f'Member confidence: {member_conf.mean():.3f}')
print(f'Non-member confidence: {non_member_conf.mean():.3f}')
print(f'Attack accuracy: {(member_preds.sum() + (1-non_member_preds).sum()) / (len(member_preds) + len(non_member_preds)):.2%}')

Member confidence: 0.212
Non-member confidence: 0.154
Attack accuracy: 66.67%


## Exercise: Improve Attack

Implement a more sophisticated membership inference attack using shadow models.

**Advanced Techniques to Explore:**
- Shadow model training (Shokri et al.)
- Loss-based membership inference
- Metric-based attacks

**Reference:** [Membership Inference Attacks Against Machine Learning Models](https://arxiv.org/pdf/1610.05820) - Shokri et al. (2017)

This paper introduces the foundational shadow model approach for more sophisticated membership inference attacks.

In [4]:
# Your code here
