In [None]:
#Date: Dec 11, 2024
#Author: Sonal Allana
#Purpose: To create DP models with different privacy budgets on each dataset

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow_privacy.privacy.analysis.compute_noise_from_budget_lib import compute_noise
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy
import time
import numpy as np
from numpy import savetxt
from numpy import loadtxt
import sklearn as sk
import os
import utilities

In [None]:
#Sub-folder name inserted into the output path ('../models/<base_folder>/<dataset>/nm<noise>/')
#that the training loop below builds when saving models and data splits
base_folder = "dp_nn"


In [None]:
#Options (1) adult (2) credit (3) compas (4) hospital
#This name also selects the per-dataset hyperparameters and the sensitive-attribute
#column positions used in the cells below, so it must be one of the four options.
dataset_name = "hospital"

#preprocess the dataset and return X and Y
# NOTE(review): the inspection cell below indexes X with .iloc for "hospital" and with
# [:, k] for the other datasets, so X is presumably a pandas DataFrame for "hospital"
# and a NumPy array otherwise -- confirm against utilities.preprocess.
X, Y = utilities.preprocess(dataset_name)


In [None]:
#Quick look at the preprocessed data: shape of X, shape of Y, then X and Y themselves
for _shown in (np.shape(X), np.shape(Y), X, Y):
    print(_shown)

#Positions in X of (sensitive attribute 1, sensitive attribute 2) for the datasets
#that are indexed array-style ...
_array_style_columns = {
    "adult": (3, 2),    #SEX, RACE
    "credit": (1, 4),   #SEX, AGE
    "compas": (0, 2),   #SEX, RACE
}
#... and for the dataset that is indexed through .iloc
_iloc_style_columns = {
    "hospital": (1, 0), #GENDER, RACE
}

#Print the distinct values of each sensitive column to verify that it is binary.
#A dataset name found in neither table prints nothing.
for _position in _array_style_columns.get(dataset_name, ()):
    print(np.unique(X[:,_position]))
for _position in _iloc_style_columns.get(dataset_name, ()):
    print(np.unique(X.iloc[:,_position]))

In [None]:
#Training hyperparameters
#Values shared by every dataset
epochs = 50
delta = 1e-6 # delta << 1/(size of training set)
min_noise = 1e-2

#Per-dataset values. Keeping them in one table makes the effective settings visible
#at a glance instead of being spread over defaults and later overrides.
_dataset_hyperparameters = {
    "adult":    {"learning_rate": 15e-5, "batch_size": 5,    "microbatches": 5,  "l2_norm_clip": 1e-5},
    "credit":   {"learning_rate": 15e-5, "batch_size": 48,   "microbatches": 12, "l2_norm_clip": 1e-5},
    "compas":   {"learning_rate": 15e-5, "batch_size": 1,    "microbatches": 1,  "l2_norm_clip": 1e-3},
    "hospital": {"learning_rate": 0.01,  "batch_size": 4096, "microbatches": 16, "l2_norm_clip": 1},
}

#Fail here with a clear message: previously an unrecognised name fell through every
#branch and left `microbatches` undefined, which only surfaced later as a NameError
#in the training loop.
if dataset_name not in _dataset_hyperparameters:
    raise ValueError(
        "Unsupported dataset_name {0!r}; expected one of {1}".format(
            dataset_name, sorted(_dataset_hyperparameters)))

_selected = _dataset_hyperparameters[dataset_name]
learning_rate = _selected["learning_rate"]
batch_size = _selected["batch_size"]
microbatches = _selected["microbatches"]
l2_norm_clip = _selected["l2_norm_clip"]

#Each batch must split into `microbatches` equally sized groups.
# NOTE(review): assumes utilities.create_dp_nn passes `microbatches` to the DP optimizer
# as its number of microbatches -- confirm.
if batch_size % microbatches != 0:
    raise ValueError(
        "batch_size ({0}) must be divisible by microbatches ({1})".format(
            batch_size, microbatches))

In [None]:
#Column positions in X of (sensitive attribute 1, sensitive attribute 2) for each dataset.
#They are copied, in this order, into columns 0 and 1 of Z_train / Z_test.
sensitive_columns = {
    "adult": (3, 2),     #sex, race
    "credit": (1, 4),    #sex, age
    "compas": (0, 2),    #sex, race
    "hospital": (1, 0),  #gender, race
}

#The noise multiplier is identical for every iteration, so it is set once before the loop.
#nm=14.68 for eps=5.006, nm = 65.84 for eps = 0.972, nm = 500 for eps 0.1, nm = 4000 for eps = 0.01
noise_multiplier = 4000

#Train, evaluate and save five DP models, each on its own train/test split.
#X_train and noise_multiplier remain defined afterwards and are read by the
#privacy-accounting cell that follows.
for i in range(1,6):
    #create a new split for each iteration
    if dataset_name == "hospital": #extraction of balanced sets
        X_train, X_test, Y_train, Y_test = utilities.getTrainTestSets(X,Y)
    else:
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

    #Extract the sensitive attributes for attack later
    #(a dataset name missing from sensitive_columns leaves Z all zeros)
    Z_train = np.zeros([X_train.shape[0],2])
    Z_test = np.zeros([X_test.shape[0],2])
    for z_col, x_col in enumerate(sensitive_columns.get(dataset_name, ())):
        Z_train[:,z_col] = X_train[:,x_col]
        Z_test[:,z_col] = X_test[:,x_col]

    print(noise_multiplier)

    #Instantiate network
    model_dp = utilities.create_dp_nn(dataset_name, X_train[0].shape, noise_multiplier, l2_norm_clip, microbatches, learning_rate)

    # Train network
    start_time = time.time()
    r = model_dp.fit(X_train,
                    Y_train,
                    validation_data=(X_test, Y_test),
                    epochs=epochs,
                    batch_size=batch_size
                   )
    end_time = time.time()
    time_elapsed = (end_time - start_time)
    print("Training time for iter ",i,": ",time_elapsed)

    #Evaluate the model
    #score is indexed as [loss, accuracy]
    score = model_dp.evaluate(X_test,Y_test,verbose=0)
    model_loss = score[0]
    model_acc = score[1]
    print("Test loss: ",model_loss,", Test accuracy: ",model_acc)

    fmodel = "model_dp_iter{0}.keras".format(i)
    basepath = '../models/{0}/{1}/nm{2:.0f}/'.format(base_folder,dataset_name,noise_multiplier)
    basepath += "iter{0}/".format(i)

    #makedirs creates every missing parent directory as well; os.mkdir raised
    #FileNotFoundError when '../models/<base_folder>/<dataset>/' did not exist yet.
    os.makedirs(basepath, exist_ok=True)

    #Save the model together with the exact split (and sensitive attributes) it was trained on
    model_dp.save(basepath + fmodel)
    savetxt(basepath + 'X_train.csv',X_train,delimiter=',')
    savetxt(basepath + 'X_test.csv',X_test,delimiter=',')
    savetxt(basepath + 'Y_train.csv',Y_train,delimiter=',')
    savetxt(basepath + 'Y_test.csv',Y_test,delimiter=',')
    savetxt(basepath + 'Z_train.csv',Z_train,delimiter=',')
    savetxt(basepath + 'Z_test.csv',Z_test,delimiter=',')

In [None]:
#https://www.tensorflow.org/responsible_ai/privacy/api_docs/python/tf_privacy/compute_dp_sgd_privacy
#Report the privacy guarantee for the training configuration used above.
#Reads X_train, batch_size, noise_multiplier, epochs and delta left behind by the
#earlier cells, so the training loop must have been run first.
n = X_train.shape[0]

# NOTE(review): the optional `used_microbatching` argument is left at its default here.
# For configurations where batch_size == microbatches (microbatch size 1) the default
# may not match the training setup -- confirm against the TF Privacy documentation.
privacy_statement = compute_dp_sgd_privacy.compute_dp_sgd_privacy_statement(number_of_examples=n,
                                              batch_size=batch_size,
                                              noise_multiplier=noise_multiplier,
                                              num_epochs=epochs,
                                              delta=delta)

#The statement is a multi-line string; printing it renders the line breaks, whereas
#leaving it as the cell's last expression shows the repr with escaped "\n".
print(privacy_statement)