# DPGAN vs K-Anonymity on Adult Dataset
This notebook compares the performance of DNN classifiers trained on:
- Original data
- K-Anonymized data (using Mondrian algorithm)
- Synthetic data generated by a DP-GAN.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


2025-04-17 15:06:08.000897: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744902368.023161    3289 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744902368.029942    3289 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# ============ Data Preprocessing ============
def preprocess_data(df):
    df = df.copy().apply(LabelEncoder().fit_transform)
    X = df.drop(columns=["income"])
    y = df["income"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    return scaler.fit_transform(X_train), scaler.transform(X_test), y_train, y_test

In [3]:
# ============ DNN Model ============
def train_dnn(X_train, X_test, y_train, y_test):
    model = models.Sequential([
        layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(0.002),
                  loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0, validation_data=(X_test, y_test))
    y_pred = (model.predict(X_test) > 0.5).astype("int32")
    y_prob = model.predict(X_test).flatten()
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-score": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Misclassification Error": 1 - accuracy_score(y_test, y_pred)
    }

In [4]:
# ============ Mondrian K-Anonymity ============
class MondrianKAnonymity:
    def __init__(self, k): self.k = k
    def partition(self, data, quasi_identifiers):
        if len(data) < 2 * self.k: return [data]
        attr = self._max_spread_attribute(data, quasi_identifiers)
        median = np.median(data[attr])
        left = data[data[attr] <= median]
        right = data[data[attr] > median]
        if len(left) < self.k or len(right) < self.k: return [data]
        return self.partition(left, quasi_identifiers) + self.partition(right, quasi_identifiers)
    def _max_spread_attribute(self, data, quasi_identifiers):
        return max(quasi_identifiers, key=lambda attr: data[attr].max() - data[attr].min())
    def anonymize(self, data, quasi_identifiers):
        partitions = self.partition(data, quasi_identifiers)
        result = []
        for p in partitions:
            gen_vals = {q: f"[{p[q].min()}-{p[q].max()}]" for q in quasi_identifiers}
            for _, row in p.iterrows():
                row_copy = row.copy()
                for q in quasi_identifiers:
                    row_copy[q] = gen_vals[q]
                result.append(row_copy)
        return pd.DataFrame(result)

def convert_to_numeric(df, quasi_identifiers):
    for col in quasi_identifiers:
        df[col] = df[col].apply(lambda x: np.mean([float(i) for i in x.strip("[]").split("-")]))
    return df

In [5]:
# ============ Differentially Private GAN ============
class DPGAN(tf.keras.Model):
    def __init__(self, input_dim, latent_dim=100, clip_norm=1.0, noise_multiplier=1.1):
        super(DPGAN, self).__init__()
        self.latent_dim = latent_dim
        self.input_dim = input_dim
        self.clip_norm = clip_norm
        self.noise_multiplier = noise_multiplier
        self.generator = self.build_generator()
        self.discriminator = self.build_discriminator()

    def build_generator(self):
        return tf.keras.Sequential([
            layers.Dense(128, activation='relu', input_shape=(self.latent_dim,)),
            layers.Dense(256, activation='relu'),
            layers.Dense(self.input_dim, activation='sigmoid')
        ])

    def build_discriminator(self):
        return tf.keras.Sequential([
            layers.Dense(256, activation='relu', input_shape=(self.input_dim,)),
            layers.Dense(128, activation='relu'),
            layers.Dense(1)
        ])

    def _apply_dp(self, grads):
        clipped_grads = [tf.clip_by_norm(g, self.clip_norm) for g in grads]
        noised_grads = [g + tf.random.normal(tf.shape(g), stddev=self.noise_multiplier * self.clip_norm)
                        for g in clipped_grads]
        return noised_grads

    def train(self, real_data, epochs=200, batch_size=64):
        dataset = tf.data.Dataset.from_tensor_slices(real_data).shuffle(10000).batch(batch_size)
        optimizer_g = tf.keras.optimizers.Adam(1e-4)
        optimizer_d = tf.keras.optimizers.Adam(1e-4)

        for epoch in range(epochs):
            for real_batch in dataset:
                noise = tf.random.normal((batch_size, self.latent_dim))
                with tf.GradientTape() as tape_d:
                    fake = self.generator(noise)
                    logits_real = self.discriminator(real_batch)
                    logits_fake = self.discriminator(fake)
                    loss_d = tf.reduce_mean(logits_fake) - tf.reduce_mean(logits_real)
                grads_d = tape_d.gradient(loss_d, self.discriminator.trainable_variables)
                dp_grads_d = self._apply_dp(grads_d)
                optimizer_d.apply_gradients(zip(dp_grads_d, self.discriminator.trainable_variables))

                noise = tf.random.normal((batch_size, self.latent_dim))
                with tf.GradientTape() as tape_g:
                    fake = self.generator(noise)
                    logits_fake = self.discriminator(fake)
                    loss_g = -tf.reduce_mean(logits_fake)
                grads_g = tape_g.gradient(loss_g, self.generator.trainable_variables)
                dp_grads_g = self._apply_dp(grads_g)
                optimizer_g.apply_gradients(zip(dp_grads_g, self.generator.trainable_variables))

    def sample(self, num_samples):
        noise = tf.random.normal((num_samples, self.latent_dim))
        return self.generator(noise).numpy()

In [6]:
# ============ Main Execution ============
if __name__ == "__main__":
    # Load dataset
    columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
               "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
               "hours-per-week", "native-country", "income"]
    data = pd.read_csv("/kaggle/input/adultdata/adult.data", names=columns, sep=", ", engine="python")

    results = {}

    # 1. Original
    X_train, X_test, y_train, y_test = preprocess_data(data)
    results["Original DNN"] = train_dnn(X_train, X_test, y_train, y_test)

    # 2. Mondrian K-Anonymity
    k = 5
    qid = ["age", "education-num", "hours-per-week"]
    k_anon = MondrianKAnonymity(k)
    anon_data = k_anon.anonymize(data, qid)
    anon_data = convert_to_numeric(anon_data, qid)
    X_train_k, X_test_k, y_train_k, y_test_k = preprocess_data(anon_data)
    results["K-Anonymity DNN"] = train_dnn(X_train_k, X_test_k, y_train_k, y_test_k)

    # 3. DP-GAN
    df_enc = data.copy().apply(LabelEncoder().fit_transform)
    X_data = df_enc.drop(columns=["income"])
    y_data = df_enc["income"]
    dpgan = DPGAN(input_dim=X_data.shape[1])
    dpgan.train(X_data.values)

    synthetic = dpgan.sample(10000)
    synth_df = pd.DataFrame(synthetic, columns=X_data.columns)
    synth_df["income"] = np.random.choice([0, 1], size=len(synth_df))
    synth_df = synth_df[columns]
    X_train_dp, X_test_dp, y_train_dp, y_test_dp = preprocess_data(synth_df)
    results["DP-GAN DNN"] = train_dnn(X_train_dp, X_test_dp, y_train_dp, y_test_dp)

    # 결과 출력
    for model_name, metric in results.items():
        print(f"\n{model_name}:")
        for m, v in metric.items():
            print(f"  {m}: {v:.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1744902371.736518    3289 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
I0000 00:00:1744902373.726278    3320 service.cc:148] XLA service 0x7cf14000aa40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1744902373.726316    3320 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1744902373.880356    3320 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1744902374.298838    3320 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step

Original DNN:
  Accuracy: 0.8572
  Precision: 0.7486
  Recall: 0.6143
  F1-score: 0.6748
  AUC: 0.9098
  Misclassification Error: 0.1428

K-Anonymity DNN:
  Accuracy: 0.8435
  Precision: 0.7490
  Recall: 0.4951
  F1-score: 0.5961
  AUC: 0.8925
  Misclassification Error: 0.1565

DP-GAN DNN:
  Accuracy: 0.4865
  Precision: 0.4924
  Recall: 0.6445
  F1-score: 0.5583
  AUC: 0.4863
  Misclassification Error: 0.5135
