## private_hmda_NC_cnn_v10
### Neural Network using North Carolina HMDA LAR Data with Privacy

#### Changes:
* Calculate disparate impact (DI) with dataset v1

In [46]:
# Import General Libraries 
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Deep Learning Libraries
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from sklearn.utils import class_weight
from sklearn import preprocessing
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Privacy Package
from privacy.analysis.rdp_accountant import compute_rdp
from privacy.analysis.rdp_accountant import get_privacy_spent
from privacy.optimizers.dp_optimizer import DPGradientDescentGaussianOptimizer

In [68]:
# Hyperparameters, grouped by the part of the run that reads them.

# Differentially private optimizer settings.
l2_norm_clip = 1.0       # handed to the DP optimizer as l2_norm_clip
noise_multiplier = 0.5   # handed to the DP optimizer and read by compute_epsilon
microbatches = 1         # handed to the DP optimizer as num_microbatches
learning_rate = 0.15

# Training-loop settings.
batch_size = 1
epochs = 2

# Not read by any cell shown in this notebook.
dpsgd = True
model_dir = None

In [69]:
def compute_epsilon(steps, num_examples=60000, target_delta=1e-5):
  """Computes epsilon value for given hyperparameters.

  Feeds the notebook-level `noise_multiplier` and `batch_size` to the RDP
  accountant (`compute_rdp` / `get_privacy_spent`).

  Args:
    steps: number of optimizer steps to account for.
    num_examples: size of the training set; the sampling probability is
      batch_size / num_examples. The default, 60000, is the value that used
      to be hard-coded here (the MNIST training-set size, per the comment it
      replaces). Pass the actual number of training rows to get an epsilon
      that describes this dataset.
    target_delta: delta at which epsilon is reported. The default, 1e-5, is
      the value that used to be hard-coded; a common rule of thumb is to
      keep delta below 1 / num_examples.

  Returns:
    The first element of get_privacy_spent's result (epsilon), or
    float('inf') when noise_multiplier is 0.

  Raises:
    ValueError: if num_examples is not positive.
  """
  if noise_multiplier == 0.0:
    return float('inf')
  if num_examples <= 0:
    raise ValueError(
        'num_examples must be positive, got {!r}'.format(num_examples))
  orders = [1 + x / 10. for x in range(1, 100)] + list(range(12, 64))
  # True division: the notebook imports `division` from __future__.
  sampling_probability = batch_size / num_examples
  rdp = compute_rdp(q=sampling_probability,
                    noise_multiplier=noise_multiplier,
                    steps=steps,
                    orders=orders)
  return get_privacy_spent(orders, rdp, target_delta=target_delta)[0]

In [70]:
def data_processing(version):
    """Load the cleaned HMDA CSV and split it into model inputs and labels.

    `version` is accepted but not used: the file name below is fixed.

    Returns:
        (predictors, target) where `target` is to_categorical applied to the
        'action_taken_name' column and `predictors` is the frame without
        that column and without its first remaining column.
    """
    frame = pd.read_csv('hmda_nc_cleaned_v2.csv')

    labels = to_categorical(frame['action_taken_name'])

    inputs = frame.drop(['action_taken_name'], axis=1)
    # The first remaining column is discarded too (presumably a row index
    # saved with the CSV -- confirm against the file).
    leading_column = inputs.columns[0]
    inputs = inputs.drop(leading_column, axis=1)

    return inputs, labels

In [71]:
# DP gradient-descent optimizer, configured from the parameter cell.
optimizer = DPGradientDescentGaussianOptimizer(
    learning_rate=learning_rate,
    l2_norm_clip=l2_norm_clip,
    noise_multiplier=noise_multiplier,
    num_microbatches=microbatches,
    unroll_microbatches=True,
)

In [72]:
# Load predictors and one-hot targets. The argument is spelled out because no
# earlier cell defines `i`; the call used to pick up a loop index left behind
# by a later cell and failed with NameError on a fresh kernel.
# data_processing does not read its argument; 2 matches the file it loads.
X, Y = data_processing(2)
n_cols = X.shape[1]

# Three hidden ReLU layers, then a 2-way softmax over the one-hot target.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(n_cols,)),
    tf.keras.layers.Dense(25, activation='relu'),
    tf.keras.layers.Dense(20, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax'),
])

model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Hold out 20% for testing; fit() then holds out a further 20% of the
# remainder for validation.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=0)
model.fit(x_train, y_train, validation_split=0.2, epochs=epochs, batch_size=batch_size, verbose=1)

scores = model.evaluate(x_test, y_test, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

# NOTE(review): 60000 mirrors the dataset size used inside compute_epsilon,
# not the number of rows fitted here, so the reported epsilon does not
# describe this run exactly. Step count and accountant dataset size should be
# updated together.
eps = compute_epsilon(epochs * 60000 // batch_size)
print('For delta=1e-5, the current epsilon is: %.2f' % eps)

Train on 178272 samples, validate on 44568 samples
Epoch 1/2
Epoch 2/2
acc: 20.91%
For delta=1e-5, the current epsilon is: 2.77


In [73]:
# Collapse the one-hot test labels to a single 0/1 column: 1 where the second
# entry of a row equals 1, otherwise 0.
y_true = y_test
y_true_one_col = np.asarray([1 if row[1] == 1 else 0 for row in y_true])

from sklearn.metrics import confusion_matrix

# Raw model outputs for the test rows (the final layer emits two values per row).
y_pred = model.predict(x_test)

In [74]:
# Predicted class index per test row. Derived from the model rather than from
# the previous value of `y_pred`, so the cell can be re-run: applying argmax
# over axis 1 to the already-collapsed 1-D array would raise an axis error.
y_pred = np.argmax(model.predict(x_test), axis=1)

In [75]:
# Confusion matrix over the whole test set, framed by separator rules.
rule = '-' * 30
print(rule)
print('Overall confusion matrix:')
overall_cm = confusion_matrix(y_true_one_col, y_pred)
print(overall_cm)
print(rule)
print()

------------------------------
Overall confusion matrix:
[[ 9658   297]
 [43767  1989]]
------------------------------



In [76]:
def conf_mat(var):
    """Confusion matrix of the model on the test rows where `var` equals 1.

    Reads the notebook-level `x_test`, `y_test` and `model`.

    Args:
        var: name of a column of `x_test`; rows whose value is 1 are kept.

    Returns:
        The array produced by sklearn's confusion_matrix for labels 0 and 1.
        It is always 2x2, including when the selected rows contain only one
        of the two classes.
    """
    # Build the row mask once, as a plain boolean array, so the DataFrame
    # `x_test` and the array `y_test` are filtered by exactly the same rows.
    selected = (x_test[var] == 1).values
    predictors = x_test[selected]
    target = y_test[selected]

    # One-hot rows -> 0/1: 1 where the second entry equals 1, otherwise 0.
    y_true_one_col = (target[:, 1] == 1).astype(int)

    y_pred = np.argmax(model.predict(predictors), axis=1)

    # Pinning the labels keeps the layout fixed; without it a subgroup in
    # which only one class occurs would yield a smaller matrix.
    return confusion_matrix(y_true_one_col, y_pred, labels=[0, 1])

In [77]:
# Indicator columns to break the results down by (rows are selected where the
# column equals 1). Kept as consecutive pairs: the DI cell reads them two at
# a time.
features = [
    'applicant_race_name_1_0',
    'applicant_race_name_1_1',
    'applicant_ethnicity_name_Hispanic or Latino',
    'applicant_ethnicity_name_Not Hispanic or Latino',
    'applicant_sex_name_Female',
    'applicant_sex_name_Male',
]

divider = '-' * 30
print(divider)
print('Confusion matrix by features:')
for column in features:
    # Blank line, then the column name, then its confusion matrix.
    print('\n' + column)
    print(conf_mat(column))
print(divider)

------------------------------
Confusion matrix by features:

applicant_race_name_1_0
[[2845  170]
 [6963  698]]

applicant_race_name_1_1
[[ 6813   127]
 [36804  1291]]

applicant_ethnicity_name_Hispanic or Latino
[[ 515    6]
 [2174   36]]

applicant_ethnicity_name_Not Hispanic or Latino
[[ 9143   291]
 [41593  1953]]

applicant_sex_name_Female
[[ 3614   132]
 [14147   672]]

applicant_sex_name_Male
[[ 6044   165]
 [29620  1317]]
------------------------------


In [78]:
# cm stands for confusion matrix

def calc_di2(prot_cm, unprot_cm):
    """Ratio of the column-1 share of two confusion matrices.

    For each matrix the share is (sum of column 1) / (sum of all cells). If
    the matrices come from sklearn's confusion_matrix, column 1 holds the
    rows predicted as class 1, so this is the predicted-positive rate.

    Args:
        prot_cm: confusion matrix of the protected group.
        unprot_cm: confusion matrix of the unprotected group.

    Returns:
        Share of `prot_cm` divided by share of `unprot_cm`.
    """
    def _column_one_share(cm):
        # Cells in column 1 over all cells.
        return np.sum(cm, axis=0)[1] / np.sum(cm)

    return _column_one_share(prot_cm) / _column_one_share(unprot_cm)

In [79]:
def calc_di(prot_var_name, unprot_var_name):
    """Disparate impact of the model's predictions between two test groups.

    A group is the set of `x_test` rows whose indicator column equals 1. For
    each group the predicted-positive rate is the fraction of its rows for
    which the model's argmax is non-zero; the result is the protected
    group's rate divided by the unprotected group's rate.

    Reads the notebook-level `x_test` and `model`.

    Args:
        prot_var_name: indicator column of the protected group.
        unprot_var_name: indicator column of the unprotected group.

    Returns:
        protected rate / unprotected rate. A zero unprotected rate follows
        numpy's float division (inf or nan) rather than raising.

    Raises:
        ValueError: if either group has no rows in `x_test`.
    """
    def _predicted_positive_rate(var_name):
        # One mask serves as both the row selector and the group size, so the
        # denominator always counts the rows that were actually predicted.
        # Looking the size up with value_counts()[1] fails with KeyError on an
        # empty group and depends on how the counts happen to be indexed.
        in_group = (x_test[var_name] == 1).values
        group_total = in_group.sum()
        if group_total == 0:
            raise ValueError(
                'no test rows with {!r} == 1; disparate impact is undefined'
                .format(var_name))
        predicted = np.argmax(model.predict(x_test[in_group]), axis=1)
        return np.count_nonzero(predicted) / group_total

    prot_rate = _predicted_positive_rate(prot_var_name)
    unprot_rate = _predicted_positive_rate(unprot_var_name)
    return prot_rate / unprot_rate

In [80]:
# Disparate impact for each consecutive (protected, unprotected) pair of
# entries in `features`.
edge = '-' * 30
print(edge)
print('DI by features:')
for i in range(0, len(features), 2):
    print()
    protected_name = features[i]
    unprotected_name = features[i+1]
    print('DI between \'{:s}\' and \'{:s}\''.format(protected_name, unprotected_name))
    print(calc_di(protected_name, unprotected_name))
print(edge)

------------------------------
DI by features:

DI between 'applicant_race_name_1_0' and 'applicant_race_name_1_1'
2.5821715765982622

DI between 'applicant_ethnicity_name_Hispanic or Latino' and 'applicant_ethnicity_name_Not Hispanic or Latino'
0.36309200954773574

DI between 'applicant_sex_name_Female' and 'applicant_sex_name_Male'
1.0854877980964137
------------------------------
