# 1. Imports

In [33]:
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


import sdv
from sdv.metadata import Metadata
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import GaussianCopulaSynthesizer

# 2. Load Data

In [34]:
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 

# Merge the features and targets into a single dataframe
df = pd.concat([X, y], axis=1)

# 3. Dataset Preprocessing

In [35]:
# Replace '?' with NaN
df = df.replace('?', np.nan)

# Drop rows with missing values
df = df.dropna()

# Replace '.' in income
df['income'] = df['income'].replace('<=50K.', '<=50K')
df['income'] = df['income'].replace('>50K.', '>50K')

# Rpelace Target values with binary
df['income'] = df['income'].replace({'>50K': 1, '<=50K': 0}) # >50k = 1, <=50k = 0

# Group all Countries except the top 3 into 'Other'
top_countries = df['native-country'].value_counts().index[:3]
df['native-country'] = df['native-country'].apply(lambda x: x if x in top_countries else 'Other')

  df['income'] = df['income'].replace({'>50K': 1, '<=50K': 0}) # >50k = 1, <=50k = 0


In [36]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Other,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,0
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,0
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,0
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,0


In [None]:
# Proprecess Catgorical Columns to minimze risk of missmatch in matrices shape after synthesizing

# drop rows where worklass = without-pay
df = df[df['workclass'] != 'Without-pay']

# replace 5th-6th , 1st-4th and Preschool with 'Primary'
df['education'] = df['education'].replace('5th-6th', 'Primary')
df['education'] = df['education'].replace('1st-4th', 'Primary')
df['education'] = df['education'].replace('Preschool', 'Primary')

# drop rows where Marital status = Married-AF-spouse
df = df[df['marital-status'] != 'Married-AF-spouse']

# drop rows wehere occupation = Armed-Forces
df = df[df['occupation'] != 'Armed-Forces']

## 3.1. Real - Ref Split

In [40]:
def real_ref_split(df):
    real, ref = train_test_split(df, test_size=0.5, random_state=187)
    return real, ref

In [41]:
real, ref = real_ref_split(df)

## 3.2 In-Out Split

In [42]:
def in_out_split(df):
    real_in, real_out = train_test_split(real, test_size=0.5, random_state=42)
    return real_in, real_out

In [43]:
real_in, real_out = in_out_split(real)

# 4. Synthetic data

In [44]:
# load metadata
metadata = Metadata.load_from_json(filepath='synth_data/metadata/adult_metadata_v1.json')

In [45]:
# CTGAN Training
#synthesizer = CTGANSynthesizer(metadata)
#synthesizer.fit(real_in)

In [46]:
# GaussienCopula Training
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(real_in)

In [47]:
synth_in = synthesizer.sample(num_rows=len(real_in))

# 5. Data Preprocessing

In [48]:
# Funktion zum Data Preprocessing

def preprocess_data(df):
    # Split X and y
    X = df.drop("income", axis=1)
    y = df["income"]

    # Identify categorical and numerical features
    categorical_features = X.select_dtypes(include=["object"]).columns.tolist()
    numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

    # 4. Preprocessing Pipelines definieren
    categorical_transformer = Pipeline(steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    numerical_transformer = Pipeline(steps=[
        ("scaler", StandardScaler())
    ])

    # Preprocessing Pipeline wie gehabt:
    preprocessor = ColumnTransformer(transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ])

    # Fit + Transform auf Trainingsdaten
    X_preprocessed = preprocessor.fit_transform(X)
    X_preprocessed = X_preprocessed.toarray()

    return X_preprocessed, y

In [49]:
real_in

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
40482,45,Private,118714,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
33634,62,Private,499971,11th,7,Widowed,Handlers-cleaners,Not-in-family,Black,Female,0,0,40,United-States,0
42344,21,Private,301915,11th,7,Separated,Sales,Not-in-family,Other,Female,0,0,30,Mexico,0
33489,39,Self-emp-not-inc,41017,10th,6,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,70,United-States,0
18245,44,Self-emp-not-inc,216921,10th,6,Married-civ-spouse,Other-service,Husband,White,Male,0,0,70,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45447,48,Private,224087,10th,6,Married-civ-spouse,Transport-moving,Husband,Black,Male,0,0,40,United-States,0
2303,90,Private,52386,Some-college,10,Never-married,Other-service,Not-in-family,Asian-Pac-Islander,Male,0,0,35,United-States,0
34704,40,Private,87771,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,0
23364,25,Private,219130,Some-college,10,Never-married,Other-service,Not-in-family,Other,Female,0,0,40,United-States,0


In [50]:
# Aufruf der Preprocessing Funktion für die realen Daten
X_preprocessed_real, y_real = preprocess_data(real_in)

# Aufruf der Preprocessing Funktion für die synthetischen Daten
X_preprocessed_synt, y_synth = preprocess_data(synth_in)

In [51]:
X_preprocessed_real.shape
#X_preprocessed_synt.shape

(11288, 62)

# 6. Training ML Model (Victim)

In [52]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [53]:
X_preprocessed_synt.shape

(11288, 62)

In [54]:
dtc_real = DecisionTreeClassifier(random_state=42, max_depth=12)
dtc_real.fit(X_preprocessed_real, y_real)


dtc_synth = DecisionTreeClassifier(random_state=42, max_depth=12)
dtc_synth.fit(X_preprocessed_synt, y_synth)


## 7.1 Train Shadow Model

In [55]:
ref_in, ref_out = in_out_split(ref)
X_preprocessed_ref_in, y_ref_in = preprocess_data(ref_in)
X_preprocessed_ref_out, y_ref_out = preprocess_data(ref_out)

In [56]:
# Model Setup (max_depth=3 da sonst immer 1 oder 0 als Proba)
dtc_shadow = DecisionTreeClassifier(random_state=42, max_depth=12)

# Train Shadow Model (ref_in)
dtc_shadow.fit(X_preprocessed_ref_in, y_ref_in)

# 7.2 Create Trainset for Attack Model

In [57]:
# Querry Shadow Model with X_preprocessed_ref_in and X_preprocessed_ref_out
probs_in = dtc_shadow.predict_proba(X_preprocessed_ref_in)
probs_out = dtc_shadow.predict_proba(X_preprocessed_ref_out)

In [58]:
# add label 1 to in and 0 to out
y_shadow_in = np.ones(len(probs_in))
y_shadow_out = np.zeros(len(probs_out))

# Merge in and out
X_shadow = np.concatenate([probs_in, probs_out])
y_shadow = np.concatenate([y_shadow_in, y_shadow_out])

# 8. Train Attack Model

In [59]:
# Train Attack Model
attack_model = RandomForestClassifier(n_estimators=100, random_state=42)
attack_model.fit(X_shadow, y_shadow)

# 9. Attack on ML Model

## 9.1 Data Preparation

In [60]:
# make y_label Vector for real_in and real_out
y_label_in = np.ones(len(real_in))
y_label_out = np.zeros(len(real_out))

# Merge y_label_in and y_label_out
y_label = np.concatenate([y_label_in, y_label_out])

In [61]:
# Data Prep real_in and real_out / will be used as test set

# Preprocessing of both datasets
X_preprocessed_real_in, y_real_in = preprocess_data(real_in)
X_preprocessed_real_out, y_real_out = preprocess_data(real_out)

## 9.1 Attack on ML Model (real data) 

In [62]:
# Attack on Model
attack_pred_in = attack_model.predict(dtc_real.predict_proba(X_preprocessed_real_in))
attack_pred_out = attack_model.predict(dtc_real.predict_proba(X_preprocessed_real_out))

# Merge attack_pred_in and attack_pred_out
y_pred_label = np.concatenate([attack_pred_in, attack_pred_out])
# calculate accuracy
accuracy_score(y_label, y_pred_label)
print("MIA Attack Accuracy ML(real Data):", accuracy_score(y_label, y_pred_label))

MIA Attack Accuracy ML(real Data): 0.5149488417415954


## 9.2 Attack on ML Model (synthetic data)

In [63]:
X_preprocessed_real_in.shape

(11288, 62)

In [64]:
# Prediction of the attack model
attack_pred_in_synth = attack_model.predict(dtc_synth.predict_proba(X_preprocessed_real_in))
attack_pred_out_synth = attack_model.predict(dtc_synth.predict_proba(X_preprocessed_real_out))

# Merge attack_pred_in and attack_pred_out (synthetic data)
y_pred_label_synth = np.concatenate([attack_pred_in_synth, attack_pred_out_synth])

# calculate accuracy
accuracy_score(y_label, y_pred_label_synth)
print("MIA Attack Accuracy ML(synth Data):", accuracy_score(y_label, y_pred_label_synth))

MIA Attack Accuracy ML(synth Data): 0.5008194179917616


In [71]:
accuracy_real = [50.2, 49.99, 50.1, 50.0, 50.1]
accuracy_synth = [50.0, 50.1, 50.0, 50.1, 50.0]

In [75]:
# create datframe from accuracy lists
df_result = pd.DataFrame(list(zip(accuracy_real, accuracy_synth)), columns=['accuracy_real', 'accuracy_synth'])
df_result['VictomModel'] = ['DecisionTree', 'DecisionTree', 'DecisionTree', 'DecisionTree', 'DecisionTree']
df_result['AttackModel'] = ['RandomForest', 'RandomForest', 'RandomForest', 'RandomForest', 'RandomForest']
df_result['Dataset'] = ['Adult', 'Adult', 'Adult', 'Adult', 'Adult']
df_result['Synthesizer'] = ['GaussianCopula', 'GaussianCopula', 'GaussianCopula', 'GaussianCopula', 'GaussianCopula']

df_result

Unnamed: 0,accuracy_real,accuracy_synth,VictomModel,AttackModel,Dataset,Synthesizer
0,50.2,50.0,DecisionTree,RandomForest,Adult,GaussianCopula
1,49.99,50.1,DecisionTree,RandomForest,Adult,GaussianCopula
2,50.1,50.0,DecisionTree,RandomForest,Adult,GaussianCopula
3,50.0,50.1,DecisionTree,RandomForest,Adult,GaussianCopula
4,50.1,50.0,DecisionTree,RandomForest,Adult,GaussianCopula


In [None]:
# save results to csv
df_result.to_csv('results/MIA_Attack_Results.csv', index=False)
