### **Data Preprocessing and Model Training**

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Mount Google Drive so that files under /content/drive become readable
from google.colab import drive

drive.mount('/content/drive')

# Load the dataset into a DataFrame and echo it for a quick visual check
dataset_path = "/content/drive/MyDrive/diabetes.csv"
data = pd.read_csv(dataset_path)
print(data)

def _df(data):
    df = pd.DataFrame(data)
    for c in range(df.shape[1]):
        mapping = {df.columns[c]: c}
        df = df.rename(columns=mapping)
    return df

# Preprocess data: separate the target column from the feature matrix
y = data["Outcome"].values
X = data.drop(columns=["Outcome"]).values

# Impute the features, then standardize features and target together.
# The target is stacked as the last column of the rebuilt frame.
imputer = KNNImputer()
X = imputer.fit_transform(X)
scaler = StandardScaler()
data = pd.DataFrame(scaler.fit_transform(np.column_stack((X, y))))

# Binarize the (now standardized) last column of the original data
original_outcome = np.where(data.iloc[:, -1] > 0, 1, 0)

# Split the original data into train and test sets (every column but the last
# is a feature)
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    data.drop(columns=[data.columns[-1]]),
    original_outcome,
    test_size=0.2,
    random_state=42,
)

# Train a model on the training set of the original data
clf_orig = LogisticRegression()
clf_orig.fit(X_train_orig, y_train_orig)

class Gan():
    """Small fully-connected GAN for tabular data, plus a privacy metric.

    The instance only stores the training table and the number of training
    iterations. The Keras models are built by the ``_generator``,
    ``_discriminator`` and ``_GAN`` factory methods and are passed back in
    by the caller to ``train`` and ``generate_data*``.
    """

    def __init__(self, data, n_epochs=200):
        """Store the training data and the training length.

        Args:
            data: 2-D table exposing ``.shape`` (for example a NumPy array
                or a pandas DataFrame), one row per record.
            n_epochs: Number of iterations run by ``train``.
        """
        self.data = data
        self.n_epochs = n_epochs

    def _noise_shape(self, num_samples=None):
        """Return the ``(rows, columns)`` shape of a noise batch.

        ``num_samples=None`` means one noise row per row of ``self.data``.
        """
        rows = self.data.shape[0] if num_samples is None else num_samples
        return (rows, self.data.shape[1])

    def _noise(self, num_samples=None):
        """Draw standard-normal noise from NumPy's global random state.

        Args:
            num_samples: Number of rows to draw; defaults to the number of
                rows in ``self.data``.

        Returns:
            Array of shape ``(rows, self.data.shape[1])``.
        """
        return np.random.normal(0, 1, self._noise_shape(num_samples))

    def _noise_with_seed(self, seed, num_samples=None):
        """Draw reproducible standard-normal noise.

        Note that this reseeds NumPy's *global* random state with ``seed``.

        Args:
            seed: Value passed to ``np.random.seed``.
            num_samples: Number of rows to draw; defaults to the number of
                rows in ``self.data``.

        Returns:
            Array of shape ``(rows, self.data.shape[1])``.
        """
        np.random.seed(seed)
        return np.random.normal(0, 1, self._noise_shape(num_samples))

    def _generator(self):
        """Build the generator network (uncompiled).

        Input and output width both equal the number of data columns.
        """
        n_columns = self.data.shape[1]
        model = tf.keras.Sequential(name="Generator_model")
        model.add(tf.keras.layers.Dense(15, activation='relu',
                                        kernel_initializer='he_uniform',
                                        input_dim=n_columns))
        model.add(tf.keras.layers.Dense(30, activation='relu'))
        model.add(tf.keras.layers.Dense(n_columns, activation='linear'))
        return model

    def _discriminator(self):
        """Build and compile the discriminator network.

        It takes one row of the table and outputs a single sigmoid score.
        """
        model = tf.keras.Sequential(name="Discriminator_model")
        model.add(tf.keras.layers.Dense(25, activation='relu',
                                        kernel_initializer='he_uniform',
                                        input_dim=self.data.shape[1]))
        model.add(tf.keras.layers.Dense(50, activation='relu'))
        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        return model

    def _GAN(self, generator, discriminator):
        """Stack generator and discriminator into the combined model.

        The discriminator is marked non-trainable before the combined model
        is compiled, so that the combined model is meant to update only the
        generator.
        """
        discriminator.trainable = False
        generator.trainable = True
        model = tf.keras.Sequential(name="GAN")
        model.add(generator)
        model.add(discriminator)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model

    def train(self, generator, discriminator, gan):
        """Run ``self.n_epochs`` alternating discriminator/generator steps.

        Each iteration uses the whole table as one batch and prints both
        losses.

        Args:
            generator: Model returned by ``_generator``.
            discriminator: Model returned by ``_discriminator``.
            gan: Combined model returned by ``_GAN``.

        Returns:
            The (now trained) ``generator`` object that was passed in.
        """
        n_rows = self.data.shape[0]
        # Real rows come first (label 1), generated rows second (label 0).
        labels = np.concatenate([np.ones(n_rows), np.zeros(n_rows)])
        # The generator step asks the discriminator to call fakes "real".
        real_targets = np.ones(n_rows)

        for epoch in range(self.n_epochs):
            generated_data = generator.predict(self._noise())
            X = np.concatenate([self.data, generated_data])
            discriminator.trainable = True
            d_loss, _ = discriminator.train_on_batch(X, labels)

            # Re-freeze before the generator step.
            # NOTE(review): whether Keras honours the `trainable` value from
            # compile time or the current one presumably depends on the Keras
            # version - confirm. Resetting it here keeps the generator update
            # from touching discriminator weights in either case.
            discriminator.trainable = False
            noise = self._noise()
            g_loss = gan.train_on_batch(noise, real_targets)

            print('>%d, d1=%.3f, d2=%.3f' % (epoch + 1, d_loss, g_loss))

        return generator

    def generate_data(self, generator, num_samples):
        """Generate ``num_samples`` synthetic rows from unseeded noise.

        The noise batch is sized by ``num_samples`` itself, so requests
        larger than the training table are honoured in full.

        Args:
            generator: Object exposing ``predict(noise)``.
            num_samples: Number of rows to generate; ``None`` generates one
                row per row of ``self.data``.

        Returns:
            Whatever ``generator.predict`` returns for the noise batch.
        """
        return generator.predict(self._noise(num_samples))

    def generate_data_with_seed(self, generator, num_samples, seed=1234):
        """Generate ``num_samples`` synthetic rows from seeded noise.

        Same as ``generate_data`` but reproducible; it reseeds NumPy's
        global random state with ``seed``.

        Args:
            generator: Object exposing ``predict(noise)``.
            num_samples: Number of rows to generate; ``None`` generates one
                row per row of ``self.data``.
            seed: Seed for the noise draw.

        Returns:
            Whatever ``generator.predict`` returns for the noise batch.
        """
        return generator.predict(self._noise_with_seed(seed, num_samples))

    @staticmethod
    def _nearest_distance(point, candidates, weights):
        """Return the smallest weighted distance from *point* to a candidate.

        Returns ``inf`` when there are no candidates; NaN distances are
        ignored.
        """
        if len(candidates) == 0:
            return np.inf
        distances = np.linalg.norm((candidates - point) * weights, axis=1)
        distances = distances[~np.isnan(distances)]
        return distances.min() if distances.size else np.inf

    def compute_identifiability(self, original_data, generated_data, weights):
        """Fraction of original rows that lie closer to a synthetic row
        than to any other (non-identical) original row.

        Distances are Euclidean norms of the column-wise weighted
        difference. Original rows that are exact duplicates of the row
        under consideration are skipped, exactly like the row itself.

        Args:
            original_data: 2-D array-like of original records.
            generated_data: 2-D array-like of synthetic records with the
                same number of columns.
            weights: 1-D array with one weight per column.

        Returns:
            A float in ``[0, 1]``.

        Raises:
            ZeroDivisionError: If ``original_data`` is empty.
        """
        original = np.asarray(original_data)
        synthetic = np.asarray(generated_data)

        num_identifiable = 0
        for record in original:
            # Drop the record itself together with any exact duplicates.
            others = original[~np.all(original == record, axis=1)]
            nearest_original = self._nearest_distance(record, others, weights)
            nearest_synthetic = self._nearest_distance(record, synthetic, weights)
            if nearest_synthetic < nearest_original:
                num_identifiable += 1

        return num_identifiable / len(original_data)


# Wrap the preprocessed table in a Gan instance, build the three networks
# from it, and run the training loop.
model = Gan(data=data)
generator = model._generator()
discriminator = model._discriminator()
gan_model = model._GAN(generator, discriminator)
trained_model = model.train(
    generator,
    discriminator,
    gan_model,
)



In [None]:
# Share of rows whose last column is non-negative
df_last = data.iloc[:, -1]
int((df_last >= 0).sum()) / len(df_last)

### **Dataset Generation**


In [None]:

# Define different sizes of datasets to be generated
dataset_sizes = [2 ** i for i in range(4, 11)]

identifiability_scores = {size: [] for size in dataset_sizes}
num_seeds = 20  # Number of seeds to use for each dataset size
max_seed_retries = 1000  # Upper bound on re-draws per seed before giving up
test_accuracy_diffs = []
# One distance weight per column of the rows handed to
# compute_identifiability (the first two columns count double).
weights = np.array([2, 2, 1, 1, 1, 1, 1, 1, 1])

# The baseline classifier and the test split do not change inside the loops,
# so its accuracy is computed once up front.
accuracy_orig = accuracy_score(y_test_orig, clf_orig.predict(X_test_orig))

for size in dataset_sizes:
    test_accuracies_orig = []
    test_accuracies_gen = []
    for seed in range(num_seeds):
        # Re-draw with a shifted seed until the binarized outcome contains
        # both classes; a classifier cannot be fitted on a single class.
        for _ in range(max_seed_retries):
            # Generate new data of the specified size
            new_data = model.generate_data_with_seed(generator, size, seed=seed)
            # Binarize the "Outcome" column in the generated data
            generated_outcome = np.where(new_data[:, -1] > 0, 1, 0)
            num_positive = generated_outcome.sum()
            if 0 < num_positive < len(generated_outcome):
                break
            seed = seed + 100
        else:
            # Fail loudly instead of spinning forever when the generator
            # only ever produces one class.
            raise RuntimeError(
                f"Could not generate both outcome classes for size {size} "
                f"after {max_seed_retries} attempts"
            )

        # Train a model on the generated data
        clf_gen = LogisticRegression()
        clf_gen.fit(new_data[:, :-1], generated_outcome)

        # Test the generated-data model on the test set of the original data
        accuracy_gen = accuracy_score(y_test_orig, clf_gen.predict(X_test_orig))

        test_accuracies_orig.append(accuracy_orig)
        test_accuracies_gen.append(accuracy_gen)

        # Compute identifiability
        identifiability_measure = model.compute_identifiability(data.values, new_data, weights)
        identifiability_scores[size].append(identifiability_measure)
        # `seed` here is the seed that was actually used (after any shifts).
        print(f"Seed {seed + 1}: ε-Identifiability Score: {identifiability_measure}")

    # Compute the per-seed difference in test accuracies for this size
    diff_accuracy = (abs(np.array(test_accuracies_orig) - np.array(test_accuracies_gen)))

    test_accuracy_diffs.append(diff_accuracy)

    print(f"Dataset size: {size}, Difference in test accuracies: {diff_accuracy}")


### **Evaluation of the quality**

In [None]:
# Box plot of the per-seed accuracy differences, one box per dataset size
fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(test_accuracy_diffs, labels=dataset_sizes)
ax.set_title('Variations in Quality with Different Dataset Sizes')
ax.set_xlabel('Dataset Size')
ax.set_ylabel('Difference in Test Accuracies')
ax.grid(True)
plt.show()

### **Evaluation of the privacy**

In [None]:
# Box plot of the identifiability scores, one box per dataset size
identifiability_scores_list = list(identifiability_scores.values())
fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(identifiability_scores_list, labels=dataset_sizes)
ax.set_title('Variations in Privacy with Different Dataset Sizes')
ax.set_xlabel('Dataset Size')
ax.set_ylabel('ε-Identifiability Score')
ax.grid(True)
plt.show()


### **Quality Vs Privacy**

In [None]:
import matplotlib.pyplot as plt

# Relies on identifiability_scores and test_accuracy_diffs from the
# dataset-generation cell.

# One marker shape per dataset size
marker_styles = ['o', 's', '^', 'D', 'v', '>', '<', 'p']

# Scatter the accuracy difference against the identifiability score,
# one series per dataset size
plt.figure(figsize=(10, 6))
for i, size in enumerate(dataset_sizes):
    plt.scatter(
        identifiability_scores[size],
        test_accuracy_diffs[i],
        label=f'Dataset Size: {size}',
        marker=marker_styles[i],
    )
plt.title('Variations of Quality vs. Identifiability Scores')
plt.xlabel('ε-Identifiability Score')
plt.ylabel('Difference in Test Accuracies')
plt.legend()
plt.grid(True)
plt.show()


### **Distribution of the generated data**

In [None]:
# Side-by-side scatter of the first two columns: original data on the left,
# the most recently generated data on the right.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
panels = [
    (data.iloc[:, 0], data.iloc[:, 1], "Original Data"),
    (new_data[:, 0], new_data[:, 1], "synthetic Data"),
]
for axis, (xs, ys, title) in zip(ax, panels):
    axis.scatter(xs, ys)
    axis.set_title(title)