In [None]:
#Date: Dec 11, 2024
#Author: Sonal Allana
#Purpose: To create NN models on each synthetic dataset

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
import time
import numpy as np
from numpy import savetxt
from numpy import loadtxt
import sklearn as sk
import os
import utilities

In [None]:
# Sub-folder under ../models/ that receives the models and data splits
# written by the training loop in this notebook.
base_folder = "syn_nn"

# Generator that produced the synthetic training data.
# Options: (1) ctgan (2) gausscopula (3) tvae
syndataType = "gausscopula"

In [None]:
# Dataset to train on.
# Options: (1) adult (2) credit (3) compas (4) hospital
dataset_name = "hospital"

# Load the synthetic version of the chosen dataset for the selected generator.
# The helper returns the feature matrix X and the labels Y.
# Disabled alternative loader kept for reference: utilities.preprocess_diabetes()
X, Y = utilities.loadSynDataset(
    dataset_name,
    syndataType,
)


In [None]:
# Sanity check of the loaded data: shapes, raw features and distinct labels.
print(np.shape(X))
print(np.shape(Y))
print(X)
print(np.unique(Y))

# Columns expected to be binary, per dataset, listed as
# (sensitive attribute 1, sensitive attribute 2).
_binary_check_columns = {
    "adult": (3, 2),     # SEX, RACE
    "credit": (1, 4),    # SEX, AGE
    "compas": (0, 2),    # SEX, RACE
    "hospital": (1, 0),  # GENDER, RACE
}

# Print the distinct values of each sensitive column so they can be verified
# by eye; a dataset name that is not in the table prints nothing.
for _column in _binary_check_columns.get(dataset_name, ()):
    print(np.unique(X[:, _column]))
    

In [None]:
#Training hyperparameters
learning_rate = 15e-5   #lr = 15e-1  gives good result for eps 0.3
epochs = 50
batch_size = 48

In [None]:
# Column indices of the two sensitive attributes for each dataset, given as
# (attribute 1, attribute 2). A dataset that is not listed keeps all-zero
# Z_train / Z_test arrays.
SENSITIVE_COLUMNS = {
    "adult": (3, 2),     # sex, race
    "credit": (1, 4),    # sex, age
    "compas": (0, 2),    # sex, race
    "hospital": (1, 0),  # gender, race
}


def _extract_sensitive(features, columns):
    """Copy two sensitive-attribute columns of `features` into a new array.

    Parameters
    ----------
    features : 2-D array indexable as ``features[:, k]``.
    columns : pair of column indices (attribute 1, attribute 2), or None.

    Returns
    -------
    Array of shape (features.shape[0], 2) created with ``np.zeros``; it is
    left as zeros when `columns` is None.
    """
    sensitive = np.zeros([features.shape[0], 2])
    if columns is not None:
        first_col, second_col = columns
        sensitive[:, 0] = features[:, first_col]
        sensitive[:, 1] = features[:, second_col]
    return sensitive


# Train one model per iteration (iterations 1..5), each on a fresh split of
# the synthetic data, and persist the model together with the data it used.
for i in range(1,6):
    # Create a new split for each iteration. Only the training half is kept:
    # the test half of the synthetic split is discarded because the test set
    # is replaced by the original (non-synthetic) one loaded below.
    # NOTE(review): no random seed is set, so the split differs between runs;
    # the arrays actually used are written to CSV at the end of the iteration.
    if dataset_name == "hospital":
        # Project helper; presumably a dataset-specific split - confirm in utilities.
        X_train, _, Y_train, _ = utilities.getTrainTestSets(X,Y)
    else:
        X_train, _, Y_train, _ = train_test_split(X, Y, test_size=0.33)

    # For synthetic data, train set should be synthetic and test set should be
    # original (non-synthetic): load the test set saved for the same iteration
    # under the baseline models folder.
    bpath = '../models/baseline_nn/{0}/iter{1}/'.format(dataset_name,i)
    X_test = loadtxt(bpath + 'X_test.csv',delimiter=',')
    Y_test = loadtxt(bpath + 'Y_test.csv',delimiter=',')

    # Extract the sensitive attributes for attack later
    sensitive_cols = SENSITIVE_COLUMNS.get(dataset_name)
    Z_train = _extract_sensitive(X_train, sensitive_cols)
    Z_test = _extract_sensitive(X_test, sensitive_cols)

    # Instantiate network
    model = utilities.create_nn(dataset_name, X_train[0].shape)

    # Train network; the original test set is also passed as validation data.
    start_time = time.time()
    r = model.fit(X_train,
                  Y_train,
                  validation_data=(X_test, Y_test),
                  epochs=epochs,
                  batch_size=batch_size
                 )
    end_time = time.time()
    time_elapsed = (end_time - start_time)
    print("Training time for iter ",i,": ",time_elapsed)

    # Evaluate the model on the original test set; the first two entries of
    # the returned score are reported as loss and accuracy.
    score = model.evaluate(X_test,Y_test,verbose=0)
    model_loss = score[0]
    model_acc = score[1]
    print("Test loss: ",model_loss,", Test accuracy: ",model_acc)

    fmodel = "model_wodp_iter{0}.keras".format(i)
    basepath = "../models/{0}/{1}/{2}/iter{3}/".format(base_folder,dataset_name,syndataType,i)

    # makedirs creates any missing intermediate folders (os.mkdir would raise
    # FileNotFoundError if e.g. the dataset or generator folder did not exist
    # yet), and exist_ok=True makes re-running the cell safe.
    os.makedirs(basepath, exist_ok=True)

    # Persist the trained model and every array used in this iteration.
    model.save(basepath + fmodel)
    for csv_stem, csv_array in (
        ('X_train', X_train),
        ('X_test', X_test),
        ('Y_train', Y_train),
        ('Y_test', Y_test),
        ('Z_train', Z_train),
        ('Z_test', Z_test),
    ):
        savetxt(basepath + csv_stem + '.csv',csv_array,delimiter=',')