In [None]:
#Date: Dec 11, 2024
#Author: Sonal Allana
#Purpose: To create baseline NN models on each dataset

In [1]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
import time
import numpy as np
from numpy import savetxt
from numpy import loadtxt
import sklearn as sk
import os
import utilities

2024-10-09 11:07:38.151203: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#Name of the model family; used as a path component when the training cell saves models and data splits
base_folder = "baseline_nn"


In [3]:
#Dataset to run on. Options (1) adult (2) credit (3) compas (4) hospital
#NOTE: later cells branch on this exact string, so the spelling must match one of the options above
dataset_name = "hospital"

#Preprocess the chosen dataset and return the features X and the labels Y
#NOTE(review): for "hospital" X appears to be a pandas DataFrame (the inspection cell indexes it with .iloc),
#while the other datasets are sliced like numpy arrays - confirm in utilities.preprocess
X, Y = utilities.preprocess(dataset_name)


[0 1]
[1 0]
Index(['race', 'gender', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'number_diagnoses',
       ...
       'glyburide-metformin_Down', 'glyburide-metformin_No',
       'glyburide-metformin_Steady', 'glyburide-metformin_Up',
       'glipizide-metformin_Steady', 'glimepiride-pioglitazone_Steady',
       'metformin-rosiglitazone_No', 'metformin-pioglitazone_Steady',
       'change_No', 'diabetesMed_Yes'],
      dtype='object', length=121)


In [4]:
#Sanity check of what preprocess returned: dimensions, the features themselves and the distinct label values
for part in (X, Y):
    print(np.shape(part))
print(X)
print(np.unique(Y))

#Column positions of the two sensitive attributes per dataset, in the order they are printed
sensitive_positions = {
    "adult": (3, 2),     #SEX (sensitive attribute 1), RACE (sensitive attribute 2)
    "credit": (1, 4),    #SEX (sensitive attribute 1), AGE (sensitive attribute 2)
    "compas": (0, 2),    #SEX (sensitive attribute 1), RACE (sensitive attribute 2)
    "hospital": (1, 0),  #GENDER (sensitive attribute 1), RACE (sensitive attribute 2)
}

#Verify each sensitive attribute is binary: every printed array should hold exactly two values
for position in sensitive_positions.get(dataset_name, ()):
    #the hospital features are indexed positionally through .iloc, the other datasets with plain array slicing
    column = X.iloc[:,position] if dataset_name == "hospital" else X[:,position]
    print(np.unique(column))

(97287, 120)
(97287,)
       race  gender  time_in_hospital  num_lab_procedures  num_procedures  \
0       1.0     0.0               1.0                41.0             0.0   
1       1.0     0.0               3.0                59.0             0.0   
2       0.0     0.0               2.0                11.0             5.0   
3       1.0     1.0               2.0                44.0             1.0   
4       1.0     1.0               1.0                51.0             0.0   
...     ...     ...               ...                 ...             ...   
97282   1.0     0.0               3.0                41.0             5.0   
97283   0.0     1.0               4.0                64.0             0.0   
97284   1.0     0.0               3.0                46.0             2.0   
97285   1.0     0.0               2.0                 1.0             0.0   
97286   1.0     1.0               5.0                10.0             0.0   

       num_medications  number_outpatient  number_eme

In [None]:
#Training hyperparameters
#NOTE(review): the training cell visible in this notebook hands only epochs and batch_size to model.fit and never
#references learning_rate; presumably the optimizer is configured inside utilities.create_nn - confirm that this
#value is actually the one in effect
learning_rate = 1e-3 #using the default keras lr instead of 15e-5 from Blanco-Justicia et al., 2022
epochs = 50 #number of passes over the training split per model
batch_size = 48 #number of samples per gradient update

In [None]:
#Train, evaluate and persist five baseline models, each on a freshly drawn train/test split.
#For every iteration the model, the split and the sensitive attributes of the split are written to
#../models/<base_folder>/<dataset_name>/iter<i>/

#Column positions of the two sensitive attributes per dataset: (column stored in Z[:,0], column stored in Z[:,1])
SENSITIVE_COLUMNS = {
    "adult": (3, 2),     #sex, race
    "credit": (1, 4),    #sex, age
    "compas": (0, 2),    #sex, race
    "hospital": (1, 0),  #gender, race
}
#Legacy alias: this cell used to test for "diabetes" although the rest of the notebook calls the dataset
#"hospital", which left Z_train/Z_test filled with zeros for the hospital run
SENSITIVE_COLUMNS["diabetes"] = SENSITIVE_COLUMNS["hospital"]

#Fail before any training instead of silently saving all-zero sensitive attributes
if dataset_name not in SENSITIVE_COLUMNS:
    raise ValueError("Unknown dataset_name '{0}'; expected one of {1}".format(dataset_name, sorted(SENSITIVE_COLUMNS)))
sensitive_cols = list(SENSITIVE_COLUMNS[dataset_name])

for i in range(1,6):
    #create a new split for each iteration
    if dataset_name == "hospital":
        X_train, X_test, Y_train, Y_test = utilities.getTrainTestSets(X,Y)
    else:
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

    #Extract the sensitive attributes for attack later, as float arrays of shape (n_samples, 2)
    #np.asarray makes the positional slicing work whether the split is a numpy array or a DataFrame
    Z_train = np.asarray(X_train)[:, sensitive_cols].astype(float)
    Z_test = np.asarray(X_test)[:, sensitive_cols].astype(float)

    # Instantiate network
    #per-sample input shape; equal to X_train[0].shape for numpy arrays and also valid for a DataFrame
    input_shape = np.shape(X_train)[1:]
    model = utilities.create_nn(dataset_name, input_shape)

    # Train network
    #the held-out test split is also passed as validation data, so per-epoch validation metrics are test metrics
    start_time = time.time()
    r = model.fit(X_train,
                    Y_train,
                    validation_data=(X_test, Y_test),
                    epochs=epochs,
                    batch_size=batch_size,
                   )
    end_time = time.time()
    time_elapsed = (end_time - start_time)
    print("Training time for iter ",i,": ",time_elapsed)

    #Evaluate the model
    #assumes create_nn compiles the model with accuracy as its first metric, so score is [loss, accuracy] - TODO confirm
    score = model.evaluate(X_test,Y_test,verbose=0)
    model_loss = score[0]
    model_acc = score[1]
    print("Test loss: ",model_loss,", Test accuracy: ",model_acc)

    fmodel = "model_wodp_iter{0}.keras".format(i)
    basepath = "../models/{0}/{1}/iter{2}/".format(base_folder,dataset_name, i)

    #makedirs also creates missing parent folders (os.mkdir fails on a nested path whose parents do not exist)
    #and exist_ok makes re-running the cell safe
    os.makedirs(basepath, exist_ok=True)

    #Persist the model together with the exact split it was trained and evaluated on
    model.save(basepath + fmodel)
    savetxt(basepath + 'X_train.csv',X_train,delimiter=',')
    savetxt(basepath + 'X_test.csv',X_test,delimiter=',')
    savetxt(basepath + 'Y_train.csv',Y_train,delimiter=',')
    savetxt(basepath + 'Y_test.csv',Y_test,delimiter=',')
    savetxt(basepath + 'Z_train.csv',Z_train,delimiter=',')
    savetxt(basepath + 'Z_test.csv',Z_test,delimiter=',')