# 1. Imports

In [66]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


# 2. Load Data

In [53]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 

# Merge the features and targets into a single dataframe
df = pd.concat([X, y], axis=1)

# 3. Preprocessing

In [None]:
# Drop rows with missing values
df = df.dropna()

In [60]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K.
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [102]:
# Split X and y
X = df.drop("income", axis=1)
y = df["income"]

In [103]:
# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [104]:
# Test-Train Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [105]:
# 4. Preprocessing Pipelines definieren
categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

numerical_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

# Fit + Transform auf Trainingsdaten
X_preprocessed = preprocessor.fit_transform(X_train)

# Nur transform auf Testdaten
X_test_preprocessed = preprocessor.transform(X_test)

In [106]:
X_test_preprocessed

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 133349 stored elements and shape (9525, 107)>

# 4. Generate Synthetic Data

In [85]:
import sdv
from sdv.metadata import Metadata
from sdv.single_table import CTGANSynthesizer


## 4.1 SDV Metadata erstellen

In [80]:
metadata = Metadata.detect_from_dataframe(
    data=df,
    table_name='adult')

In [82]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K.
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [83]:
metadata

{
    "tables": {
        "adult": {
            "columns": {
                "age": {
                    "sdtype": "numerical"
                },
                "workclass": {
                    "sdtype": "categorical"
                },
                "fnlwgt": {
                    "sdtype": "numerical"
                },
                "education": {
                    "sdtype": "categorical"
                },
                "education-num": {
                    "sdtype": "numerical"
                },
                "marital-status": {
                    "sdtype": "categorical"
                },
                "occupation": {
                    "sdtype": "categorical"
                },
                "relationship": {
                    "sdtype": "categorical"
                },
                "race": {
                    "sdtype": "categorical"
                },
                "sex": {
                    "sdtype": "categorical"
                },
           

In [84]:
metadata.save_to_json(filepath='synth_data/metadata/adult_metadata_v1.json')

## 4.2 Modelle trainieren

In [None]:
synthesizer = CTGANSynthesizer(metadata)
synthesizer.fit(df)

In [91]:
synthetic_data = synthesizer.sample(num_rows=len(df))

In [92]:
synthetic_data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,37,Private,200987,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,2,0,50,United-States,<=50K
1,60,Private,72678,7th-8th,3,Married-spouse-absent,Sales,Unmarried,White,Male,12,0,40,United-States,<=50K
2,29,Private,200629,Some-college,10,Never-married,Exec-managerial,Own-child,White,Female,6,0,37,United-States,<=50K.
3,33,Private,53841,Bachelors,13,Divorced,Exec-managerial,Not-in-family,White,Female,0,0,40,United-States,<=50K.
4,37,Private,90400,HS-grad,9,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47616,41,Self-emp-not-inc,202564,Some-college,10,Married-spouse-absent,Exec-managerial,Unmarried,White,Male,6,0,50,Germany,<=50K
47617,32,Private,71157,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
47618,33,Private,256359,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,Asian-Pac-Islander,Male,0,0,40,Philippines,>50K
47619,28,Private,159752,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K


In [93]:
# save synthetic data
synthetic_data.to_csv('synth_data/synthetic_data_adult_v1.csv', index=False)

In [113]:
# Split X and y
X_synth = synthetic_data.drop("income", axis=1)
y_synth = synthetic_data["income"]

In [114]:
# Test-Train Split
X_train_synth, X_test_synth, y_train_synth, y_test_synth = train_test_split(X_synth, y_synth, test_size=0.2, random_state=42)

In [115]:
# 4. Preprocessing Pipelines definieren
categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

numerical_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

# Fit + Transform auf Trainingsdaten
X_preprocessed_synth = preprocessor.fit_transform(X_train_synth)

# Nur transform auf Testdaten
X_test_preprocessed_synth = preprocessor.transform(X_test_synth)

In [None]:
# Black Box Interface
# nimmt ein Modell und Eingabedaten und gibt die Vorhersage-Confidence zurück
def query_model(model, x):
    return model.predict_proba(x)

# Schritt 2: Simuliere Referenz-Daten
def simulate_reference_data(model, X_reference, y_reference, train_indices):
    probs = query_model(model, X_reference)
    # Label: 1 = war im Trainingsset (Mitglied), 0 = war nicht im Trainingsset
    membership_labels = np.isin(np.arange(len(X_reference)), train_indices).astype(int)
    # Merkmale: z.B. höchste Wahrscheinlichkeit (Top-1 Confidence)
    top1_confidence = np.max(probs, axis=1)
    features = np.vstack([top1_confidence]).T
    return features, membership_labels

# Schritt 3: Training und Attacke
def membership_inference_attack(X, y):
    # Splitte Trainings- und Testdaten
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    # Trainiere Zielmodell (Opfermodell)
    from sklearn.ensemble import GradientBoostingClassifier
    target_model = GradientBoostingClassifier()
    target_model.fit(X_train, y_train)
    
    # Kombiniere Trainings- und Testdaten als Referenz-Set
    X_ref = np.vstack([X_train, X_test])
    y_ref = np.hstack([y_train, y_test])
    train_indices = np.arange(len(X_train))  # Indizes, die im Trainingsset waren

    # Generiere Features und Labels für Angreifer
    attack_features, membership_labels = simulate_reference_data(
        target_model, X_ref, y_ref, train_indices
    )

    # Trainiere Angriffsmodell (könnte auch ein einfacher Threshold sein)
    X_feat_train, X_feat_test, y_mem_train, y_mem_test = train_test_split(
        attack_features, membership_labels, test_size=0.3, random_state=42
    )
    attack_model = RandomForestClassifier()
    attack_model.fit(X_feat_train, y_mem_train)

    # Bewertung
    y_pred = attack_model.predict(X_feat_test)
    acc = accuracy_score(y_mem_test, y_pred)
    auc = roc_auc_score(y_mem_test, attack_model.predict_proba(X_feat_test)[:, 1])
    print(f"Angriff Accuracy: {acc:.4f} | AUC: {auc:.4f}")
    
    return acc, auc

In [112]:

from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
#X, y = data.data, data.target
X = X_preprocessed.toarray()
y = y_train

# Real
acc_real = []
auc_real = []
for i in range(0, 5):
    acc, auc = membership_inference_attack(X, y)
    acc_real.append(acc)
    auc_real.append(auc)
print(f"Real: Accuracy: {np.mean(acc_real):.4f} | AUC: {np.mean(auc_real):.4f}")

Angriff Accuracy: 0.5036 | AUC: 0.5021
Angriff Accuracy: 0.5041 | AUC: 0.5002
Angriff Accuracy: 0.4995 | AUC: 0.5002
Angriff Accuracy: 0.5037 | AUC: 0.5007
Angriff Accuracy: 0.4989 | AUC: 0.4999
Real: Accuracy: 0.5020 | AUC: 0.5006


In [117]:
X = X_preprocessed_synth.toarray()
y = y_train_synth

# Synthetic
acc_synth = []
auc_synth = []
for i in range(0, 5):
    acc, auc = membership_inference_attack(X, y)
    acc_synth.append(acc)
    auc_synth.append(auc)
print(f"Synth: Accuracy: {np.mean(acc_synth):.4f} | AUC: {np.mean(auc_synth):.4f}")

Angriff Accuracy: 0.5022 | AUC: 0.5053
Angriff Accuracy: 0.5022 | AUC: 0.5055
Angriff Accuracy: 0.5016 | AUC: 0.5057
Angriff Accuracy: 0.5022 | AUC: 0.5039
Angriff Accuracy: 0.5016 | AUC: 0.5058
Synth: Accuracy: 0.5020 | AUC: 0.5053


In [None]:
data = load_breast_cancer()
X, y = data.data, data.target


# Real
acc_real = []
auc_real = []
for i in range(0, 5):
    acc, auc = membership_inference_attack(X, y)
    acc_real.append(acc)
    auc_real.append(auc)
print(f"Real: Accuracy: {np.mean(acc_real):.4f} | AUC: {np.mean(auc_real):.4f}")

1. Split echter Datensatz in Use / NotUse
2. split Use in Train/Test and Train ML Modell
3. Use NotUse and Part of Train for MIA
4. MIA Attack tbd
5. Calc Metrics


--> Repeat for each Modell 5 Times (CV)

In [96]:
attack_model