# 1. Imports

In [1]:
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


import sdv
from sdv.metadata import Metadata
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import GaussianCopulaSynthesizer

# 2. Load Data

In [2]:
  # fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 

# Merge the features and targets into a single dataframe
df = pd.concat([X, y], axis=1)

# 3. Dataset Preprocessing

In [3]:
# Replace '?' with NaN
df = df.replace('?', np.nan)

# Drop rows with missing values
df = df.dropna()

# Replace '.' in income
df['income'] = df['income'].replace('<=50K.', '<=50K')
df['income'] = df['income'].replace('>50K.', '>50K')



# Group all Countries except the top 3 into 'Other'
top_countries = df['native-country'].value_counts().index[:3]
df['native-country'] = df['native-country'].apply(lambda x: x if x in top_countries else 'Other')



## 3.1. Real - Ref Split

In [4]:
def real_ref_split(df):
    real, ref = train_test_split(df, test_size=0.5, random_state=42)
    return real, ref

In [5]:
real, ref = real_ref_split(df)

## 3.2 In-Out Split

In [6]:
def in_out_split(df):
    real_in, real_out = train_test_split(real, test_size=0.5, random_state=42)
    return real_in, real_out

In [7]:
real_in, real_out = in_out_split(real)

In [8]:
'''
# add labels to the datasets for MIA Test set
real_in['label'] = 1
real_out['label'] = 0
'''

"\n# add labels to the datasets for MIA Test set\nreal_in['label'] = 1\nreal_out['label'] = 0\n"

## 3.3 Create MIA Testset

In [9]:

# merge the real_in and real_out datasets
#mia_test = pd.concat([real_in, real_out])

# 4. Synthetic data

In [10]:
# In / Out Label entfernen, da dies nicht für das Training benötigt wird
#real_in.drop('label', axis=1, inplace=True)

In [11]:
# load metadata
metadata = Metadata.load_from_json(filepath='synth_data/metadata/adult_metadata_v1.json')

In [12]:
# CTGAN Training
#synthesizer = CTGANSynthesizer(metadata)
#synthesizer.fit(real_in)

In [13]:
# GaussienCopula Training
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(real_in)

In [14]:
synth_in = synthesizer.sample(num_rows=len(real_in))

In [15]:
def inject_missing_categories_auto(real_data, synthetic_data):

    synthetic_data = synthetic_data.copy()

    # Automatische Erkennung kategorischer Spalten
    categorical_columns = real_data.select_dtypes(include=["object"]).columns.tolist()
    print(categorical_columns)

    for col in categorical_columns:
        real_cats = set(real_data[col].dropna().unique())
        synth_cats = set(synthetic_data[col].dropna().unique())

        missing = real_cats - synth_cats

        if missing:
            # Häufigste Kategorie in synthetischen Daten
            if synthetic_data[col].nunique() == 0:
                continue  # Überspringen, wenn Spalte leer
            most_common_cat = synthetic_data[col].value_counts().idxmax()

            # Beispiel-Record aus häufigster Kategorie
            base_record = synthetic_data[synthetic_data[col] == most_common_cat].iloc[0]

            for cat in missing:
                new_record = base_record.copy()
                new_record[col] = cat
                synthetic_data = pd.concat([synthetic_data, new_record.to_frame().T], ignore_index=True)

    return synthetic_data


In [16]:
#synth_in = inject_missing_categories_auto(real_in, synth_raw)

In [17]:
#synth_in = synth_raw.copy()

# 5. Data Preprocessing

In [18]:
# Funktion zum Data Preprocessing

def preprocess_data(df):
    # Split X and y
    X = df.drop("income", axis=1)
    y = df["income"]

    # Identify categorical and numerical features
    categorical_features = X.select_dtypes(include=["object"]).columns.tolist()
    numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

    # 4. Preprocessing Pipelines definieren
    categorical_transformer = Pipeline(steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    numerical_transformer = Pipeline(steps=[
        ("scaler", StandardScaler())
    ])

    # Preprocessing Pipeline wie gehabt:
    preprocessor = ColumnTransformer(transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ])

    # Fit + Transform auf Trainingsdaten
    X_preprocessed = preprocessor.fit_transform(X)
    X_preprocessed = X_preprocessed.toarray()

    return X_preprocessed, y

In [19]:
# Aufruf der Preprocessing Funktion für die realen Daten
X_preprocessed_real, y_real = preprocess_data(real_in)

# Aufruf der Preprocessing Funktion für die synthetischen Daten
X_preprocessed_synt, y_synth = preprocess_data(synth_in)

# 6. Training ML Model (Victim)

In [20]:
y_real = y_real.replace({'>50K': 1, '<=50K': 0})
y_synth = y_synth.replace({'>50K': 1, '<=50K': 0})

  y_real = y_real.replace({'>50K': 1, '<=50K': 0})
  y_synth = y_synth.replace({'>50K': 1, '<=50K': 0})


In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [22]:
dtc_real = DecisionTreeClassifier(random_state=42, max_depth=12)
dtc_real.fit(X_preprocessed_real, y_real)


dtc_synth = DecisionTreeClassifier(random_state=42, max_depth=12)
dtc_synth.fit(X_preprocessed_synt, y_synth)


In [23]:
'''
# Model Setup
dtc = DecisionTreeClassifier(random_state=42, max_depth=12)
clf = LogisticRegression(max_iter=1000, random_state=42)

log_reg = LogisticRegression(random_state=42)

# Train Victim Model (real data)
#dtc_real = train_victim(X_preprocessed_real, y_real, dtf)
dtc_real = train_victim(X_preprocessed_real, y_real, clf)


# Train Victim Model (synthetic data)
#dtc_synth = train_victim(X_preprocessed_synt, y_synth, dtc)
dtc_synth = train_victim(X_preprocessed_synt, y_synth, clf)
'''

'\n# Model Setup\ndtc = DecisionTreeClassifier(random_state=42, max_depth=12)\nclf = LogisticRegression(max_iter=1000, random_state=42)\n\nlog_reg = LogisticRegression(random_state=42)\n\n# Train Victim Model (real data)\n#dtc_real = train_victim(X_preprocessed_real, y_real, dtf)\ndtc_real = train_victim(X_preprocessed_real, y_real, clf)\n\n\n# Train Victim Model (synthetic data)\n#dtc_synth = train_victim(X_preprocessed_synt, y_synth, dtc)\ndtc_synth = train_victim(X_preprocessed_synt, y_synth, clf)\n'

## 7.1 Train Shadow Model

In [24]:
ref_in, ref_out = in_out_split(ref)
X_preprocessed_ref_in, y_ref_in = preprocess_data(ref_in)
X_preprocessed_ref_out, y_ref_out = preprocess_data(ref_out)

In [25]:
# Model Setup (max_depth=3 da sonst immer 1 oder 0 als Proba)
dtc_shadow = DecisionTreeClassifier(random_state=42, max_depth=12)

# Train Shadow Model (ref_in)
dtc_shadow.fit(X_preprocessed_ref_in, y_ref_in)

# 7.2 Create Trainset for Attack Model

In [26]:
# Querry Shadow Model with X_preprocessed_ref_in and X_preprocessed_ref_out
probs_in = dtc_shadow.predict_proba(X_preprocessed_ref_in)
probs_out = dtc_shadow.predict_proba(X_preprocessed_ref_out)

In [27]:
# add label 1 to in and 0 to out
y_shadow_in = np.ones(len(probs_in))
y_shadow_out = np.zeros(len(probs_out))

# Merge in and out
X_shadow = np.concatenate([probs_in, probs_out])
y_shadow = np.concatenate([y_shadow_in, y_shadow_out])

# 8. Train Attack Model

In [28]:
from sklearn.ensemble import RandomForestClassifier

attack_model = RandomForestClassifier(n_estimators=100, random_state=42)
attack_model.fit(X_shadow, y_shadow)

# 9. Attack on ML Model

## 9.1 Data Preparation

In [29]:
# make y_label Vector for real_in and real_out
y_label_in = np.ones(len(real_in))
y_label_out = np.zeros(len(real_out))

# Merge y_label_in and y_label_out
y_label = np.concatenate([y_label_in, y_label_out])

In [30]:
#real_out.drop('label', axis=1, inplace=True)

In [31]:
# Data Prep real_in and real_out / will be used as test set

# Preprocessing of both datasets
X_preprocessed_real_in, y_real_in = preprocess_data(real_in)
X_preprocessed_real_out, y_real_out = preprocess_data(real_out)

## 9.1 Attack on ML Model (real data) 

In [32]:
# Attack on Model
attack_pred_in = attack_model.predict(dtc_real.predict_proba(X_preprocessed_real_in))
attack_pred_out = attack_model.predict(dtc_real.predict_proba(X_preprocessed_real_out))

# Merge attack_pred_in and attack_pred_out
y_pred_label = np.concatenate([attack_pred_in, attack_pred_out])
# calculate accuracy
accuracy_score(y_label, y_pred_label)
print("MIA Attack Accuracy ML(real Data):", accuracy_score(y_label, y_pred_label))

MIA Attack Accuracy ML(real Data): 0.5162089248595816


## 9.2 Attack on ML Model (synthetic data)

In [33]:
# Prediction of the attack model
attack_pred_in_synth = attack_model.predict(dtc_synth.predict_proba(X_preprocessed_real_in))
attack_pred_out_synth = attack_model.predict(dtc_synth.predict_proba(X_preprocessed_real_out))

# Merge attack_pred_in and attack_pred_out (synthetic data)
y_pred_label_synth = np.concatenate([attack_pred_in_synth, attack_pred_out_synth])

# calculate accuracy
accuracy_score(y_label, y_pred_label_synth)
print("MIA Attack Accuracy ML(synth Data):", accuracy_score(y_label, y_pred_label_synth))

MIA Attack Accuracy ML(synth Data): 0.5061253372252443
