## Import statements

In [59]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pandas as pd
import numpy as np
import tensorflow as tf

import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold

# Privacy Package
from privacy.analysis.rdp_accountant import compute_rdp
from privacy.analysis.rdp_accountant import get_privacy_spent
from privacy.optimizers.dp_optimizer import DPGradientDescentGaussianOptimizer

from tabulate import tabulate

## Function definitions

In [40]:
def compute_epsilon(steps, noise_multiplier, batch_size,
                    num_examples=60000, target_delta=1e-5):
    """Return the epsilon spent after `steps` noisy training steps.

    Args:
        steps: Number of optimizer steps taken.
        noise_multiplier: Noise multiplier passed to the DP optimizer.
            A value of 0.0 means no noise is added, so no finite privacy
            guarantee exists and infinity is returned.
        batch_size: Number of examples per training batch.
        num_examples: Size of the training set the batches are drawn from.
            Defaults to 60000, the value that used to be hard-coded here;
            pass the real training-set size for other datasets.
        target_delta: Delta of the (epsilon, delta) guarantee forwarded to
            `get_privacy_spent`. Defaults to 1e-5.

    Returns:
        The first element of `get_privacy_spent`'s result (the epsilon),
        or `float('inf')` when `noise_multiplier` is 0.0.

    Raises:
        ValueError: If `num_examples` is not positive.
    """
    if noise_multiplier == 0.0:
        return float('inf')
    if num_examples <= 0:
        raise ValueError(
            'num_examples must be positive, got {}'.format(num_examples))

    # Renyi orders to evaluate: 1.1 .. 10.9 in steps of 0.1, then 12 .. 63.
    orders = [1 + x / 10. for x in range(1, 100)] + list(range(12, 64))
    # Fraction of the dataset that lands in one batch.
    sampling_probability = batch_size / num_examples
    rdp = compute_rdp(q=sampling_probability,
                      noise_multiplier=noise_multiplier,
                      steps=steps,
                      orders=orders)
    return get_privacy_spent(orders, rdp, target_delta=target_delta)[0]

In [15]:
def data_processing_df(datafile, random_state=None):
    """Load a CSV, shuffle its rows and split it into predictors and target.

    Args:
        datafile: Path or file-like object handed straight to
            `pandas.read_csv`. It must contain an 'action_taken_name'
            column, which is used as the target.
        random_state: Optional seed (or RandomState) forwarded to
            `DataFrame.sample` so the shuffle can be reproduced. With the
            default of None the shuffle is left to pandas' default
            randomness, as before.

    Returns:
        A tuple `(n_cols, predictors_df, target_df)`: the number of
        predictor columns, the predictor DataFrame and the target Series,
        all in the same shuffled row order with a fresh 0..n-1 index.
    """
    data = pd.read_csv(datafile)
    # Shuffle all rows, then renumber so positions and labels agree again.
    data = data.sample(frac=1, axis=0, random_state=random_state)
    data = data.reset_index(drop=True)

    target_df = data['action_taken_name']

    predictors_df = data.drop(['action_taken_name'], axis=1)
    # The first remaining column is dropped by position, not by name --
    # presumably a saved row-index column; TODO confirm against the CSV.
    predictors_df = predictors_df.drop(predictors_df.columns[0], axis=1)

    n_cols = predictors_df.shape[1]

    return n_cols, predictors_df, target_df

In [12]:
def auc(y_true, y_pred):
    """Keras-compatible AUC metric built on `tf.metrics.auc`.

    Args:
        y_true: Tensor of ground-truth labels supplied by Keras.
        y_pred: Tensor of model predictions supplied by Keras.

    Returns:
        The second element of the pair returned by `tf.metrics.auc`.
    """
    metric_tensor = tf.metrics.auc(y_true, y_pred)[1]
    # The metric is backed by TF local variables; initialize them in the
    # Keras session before the tensor is evaluated.
    session = keras.backend.get_session()
    session.run(tf.local_variables_initializer())
    return metric_tensor

In [63]:
def run_model(datafile='nc_sc_ga_va_clean_v0.csv', kfold_n_splits=5,
              kfold_n_repeats=1, epochs=10, batch_size=256,
              validation_split=0.2, learning_rate=0.15, l2_norm_clip=1,
              microbatches=1):
    """Train and evaluate the DP classifier over a sweep of noise levels.

    For every noise multiplier a fresh model is trained and evaluated on
    each (repeated) stratified k-fold split, and a table of the per-fold
    loss, accuracy and AUC plus their means is printed.

    Args:
        datafile: CSV file passed to `data_processing_df`.
        kfold_n_splits: Number of folds per repetition.
        kfold_n_repeats: Number of times the k-fold split is repeated.
        epochs: Training epochs per fold.
        batch_size: Batch size for training and evaluation.
        validation_split: Fraction of each training fold that Keras holds
            out for validation during `fit`.
        learning_rate: Learning rate of the DP optimizer.
        l2_norm_clip: Gradient clipping norm of the DP optimizer.
        microbatches: Number of microbatches of the DP optimizer.

    Returns:
        None. Results are printed.
    """
    # Noise levels to sweep; 0 disables the noise (non-private baseline).
    # The original note maps these roughly to epsilons of
    # [infinity, 2, 1, 0.5, 0.25, 0.125, 0.2, 0.188], but that lists eight
    # values for seven multipliers -- rely on the epsilon printed per run.
    noise_multipliers = [0, 0.84262, 1.145, 1.61203, 2.357066, 4, 10]

    # Seed NumPy *before* loading the data: the shuffle performed while
    # loading draws from NumPy's global RNG when no seed is given, so
    # seeding afterwards left the row order unreproducible.
    seed = 2019
    np.random.seed(seed)

    # Process datafile
    n_cols, X, Y = data_processing_df(datafile)
    # Convert once instead of once per fold.
    features = X.values
    labels = Y.values

    # Create kfold object
    kfold = RepeatedStratifiedKFold(n_splits=kfold_n_splits,
                                    n_repeats=kfold_n_repeats,
                                    random_state=seed)

    header = ['Loss', 'Accuracy', 'AUC']

    # Train model based on different noise multipliers (different epsilons)
    for nm in noise_multipliers:
        # NOTE(review): the step count (and compute_epsilon's default
        # dataset size) assume 60000 training examples rather than the
        # size of the loaded data -- confirm before quoting this epsilon.
        eps = compute_epsilon(epochs * 60000 // batch_size, nm, batch_size)
        print('Noise multiplier: {} | epsilon (assuming 60000 examples): {}'
              .format(nm, eps))

        fold_results = []
        for train_indices, test_indices in kfold.split(X, Y):

            # Define private optimizer
            optimizer = DPGradientDescentGaussianOptimizer(
                l2_norm_clip=l2_norm_clip,
                noise_multiplier=nm,
                num_microbatches=microbatches,
                learning_rate=learning_rate,
                unroll_microbatches=True)

            # Define layers
            # NOTE(review): the second hidden layer uses softmax, which is
            # unusual for a hidden layer -- kept as-is, confirm intent.
            model = keras.models.Sequential([
                keras.layers.Dense(32, activation=tf.nn.relu,
                                   input_shape=(n_cols,)),
                keras.layers.Dense(25, activation=tf.nn.softmax),
                keras.layers.Dense(20, activation=tf.nn.relu),
                keras.layers.Dense(2, activation=tf.nn.softmax)
            ])

            # Compile model
            model.compile(optimizer=optimizer,
                          loss='categorical_crossentropy',
                          metrics=['accuracy', auc])

            # Split training and testing data based on k-fold split.
            # num_classes is pinned to the 2-unit output layer so the
            # one-hot width never depends on which labels a fold contains.
            x_train = features[train_indices]
            y_train = to_categorical(labels[train_indices], num_classes=2)
            x_test = features[test_indices]
            y_test = to_categorical(labels[test_indices], num_classes=2)

            # Train model
            model.fit(x_train, y_train, validation_split=validation_split,
                      epochs=epochs, batch_size=batch_size, verbose=0,
                      shuffle=True)

            # Evaluate model: returns [loss, accuracy, auc]
            results = model.evaluate(x_test, y_test, batch_size=batch_size,
                                     verbose=0)
            fold_results.append(list(results))

        # One row per evaluated split (n_splits * n_repeats of them),
        # followed by the column-wise mean.
        row_index = ['Split {}'.format(i) for i in range(len(fold_results))]
        row_index.append('Mean')
        table = fold_results + [np.mean(fold_results, axis=0).tolist()]
        print(tabulate(table, headers=header, showindex=row_index,
                       floatfmt=".4f"))
        print()

## Run model