In [5]:
'''Import the required packages, load and preprocess the data, and run the DNN grid search.'''

import tensorflow as tf
import tensorflow.feature_column as fc
import numpy as np
import os
import sys

import matplotlib.pyplot as plt
from IPython.display import clear_output
from sklearn.impute import SimpleImputer

from sklearn.metrics import recall_score


import pandas as pd
import copy
import random

# Seed Python's built-in `random` module.
random.seed(10)

#######################################################


# Read the loan table (first CSV column is the index) and parse the issue
# date so rows can be ordered in time.
data = pd.read_csv('input_file_2.csv', sep=',', index_col=0)
data['issue_d'] = pd.to_datetime(data['issue_d'])

# Time-based split at the 75th percentile of the issue date: rows strictly
# before the cutoff are used for training, rows at or after it for testing.
# The two comparisons are kept separate so that rows with a missing date
# (which compare False both ways) stay out of both sets.
issue_dates = data['issue_d']
is_train = issue_dates < issue_dates.quantile(0.75)
is_test = issue_dates >= issue_dates.quantile(0.75)

# The issue date is only needed for the split, so it is dropped right after.
train_df = data.loc[is_train].drop('issue_d', axis=1)
test_df = data.loc[is_test].drop('issue_d', axis=1)

# Columns to impute and scale: everything except the label and the
# categorical features.
to_drop_categorical = ['home_ownership', 'verification_status', 'purpose', 'application_type']
all_cols = list(train_df.columns)
for excluded in ['charged_off'] + to_drop_categorical:
    # list.remove raises ValueError if an expected column is absent.
    all_cols.remove(excluded)

# Mean imputation. Both splits are filled with means taken from the
# training set; the test set's own statistics are not used.
train_numeric = train_df[all_cols]
train_df[all_cols] = train_numeric.fillna(train_numeric.mean())
test_numeric = test_df[all_cols]
test_df[all_cols] = test_numeric.fillna(train_df[all_cols].mean())

# Standardise the numeric columns: fit the scaler on the training set and
# apply the same fitted transform to the test set.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(copy=False)
scaled_train = scaler.fit_transform(train_df[all_cols])
train_df[all_cols] = scaled_train
scaled_test = scaler.transform(test_df[all_cols])
test_df[all_cols] = scaled_test

# Balance classes for training and testing by randomly undersampling the
# label-0 rows down to the number of label-1 rows.
#
# * `random_state` is passed explicitly: pandas draws from NumPy's RNG, so
#   the `random.seed(10)` call at the top of the file does not make
#   `DataFrame.sample` reproducible.
# * `n=min(...)` replaces `frac=ones/zeros`: it selects the same number of
#   rows when label-0 rows are the majority, but no longer raises when
#   label-1 rows outnumber label-0 rows (frac > 1) or when there are no
#   label-0 rows at all (division by zero).
train_dat_1s = train_df[train_df['charged_off'] == 1]
train_dat_0s = train_df[train_df['charged_off'] == 0]
keep_0s = train_dat_0s.sample(
    n=min(len(train_dat_1s), len(train_dat_0s)), random_state=10)
train_df = pd.concat([keep_0s, train_dat_1s], axis=0)

test_dat_1s = test_df[test_df['charged_off'] == 1]
test_dat_0s = test_df[test_df['charged_off'] == 0]
keep_0s = test_dat_0s.sample(
    n=min(len(test_dat_1s), len(test_dat_0s)), random_state=10)
test_df = pd.concat([keep_0s, test_dat_1s], axis=0)



def easy_input_function(df, label_key, num_epochs, shuffle, batch_size):
    """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        df: DataFrame holding the feature columns and the label column.
        label_key: Name of the label column in ``df``.
        num_epochs: Number of passes over the data (passed to ``repeat``).
        shuffle: Whether to shuffle the rows before batching.
        batch_size: Number of rows per batch.

    Returns:
        A dataset yielding ``(dict of column name -> values, label)`` batches.
    """
    label = df[label_key]
    # Keep the label out of the feature dict so it can never be picked up as
    # a model input by accident.
    features = {name: column for name, column in df.items() if name != label_key}
    ds = tf.data.Dataset.from_tensor_slices((features, label))

    if shuffle:
        # The buffer must span the whole frame for a uniform shuffle. The
        # frames built in this file are one class concatenated after the
        # other, so a buffer smaller than the data (previously a fixed
        # 10000, below the 20000 batch size used for training) produced
        # batches made almost entirely of a single class.
        ds = ds.shuffle(buffer_size=max(len(df), 1))

    ds = ds.batch(batch_size).repeat(num_epochs)

    return ds

###################################################################
import functools

# Zero-argument input functions for the estimator: each partial pre-binds the
# DataFrame, label column and batching options of easy_input_function.
train_inpf = functools.partial(
    easy_input_function,
    train_df,
    label_key='charged_off',
    num_epochs=5,
    shuffle=True,
    batch_size=20000,  # author's notes: 300000, 230934 (presumably other sizes tried)
)
test_inpf = functools.partial(
    easy_input_function,
    test_df,
    label_key='charged_off',
    num_epochs=1,
    shuffle=False,
    batch_size=200000,  # author's note: 200000
)
###################################################################

# Numeric feature columns, one per numeric input, created in list order.
numeric_feature_names = [
    'loan_amnt',
    'term',
    'installment',
    'emp_length',
    'dti',
    'earliest_cr_line',
    'open_acc',
    'pub_rec',
    'revol_util',
    'total_acc',
    'mort_acc',
    'pub_rec_bankruptcies',
    'log_annual_inc',
    'fico_score',
    'log_revol_bal',
]

my_numeric_columns = [fc.numeric_column(name) for name in numeric_feature_names]

# Keep the individual module-level names bound to the same column objects.
(loan_amnt,
 term,
 installment,
 emp_length,
 dti,
 earliest_cr_line,
 open_acc,
 pub_rec,
 revol_util,
 total_acc,
 mort_acc,
 pub_rec_bankruptcies,
 log_annual_inc,
 fico_score,
 log_revol_bal) = my_numeric_columns

##############################################

#RETRAIN MODEL ON ALL THESE CATEGORICAL COLUMNS AS WELL


def metric_auc(labels, predictions):
    """Return the area under the precision-recall curve as an eval metric.

    Args:
        labels: Ground-truth label tensor.
        predictions: Estimator predictions dict; its 'logistic' entry is
            used as the score.

    Returns:
        Dict mapping 'auc_precision_recall' to the streaming AUC metric
        (200 thresholds, PR curve, careful interpolation).
    """
    # The streaming metric lives under tf.compat.v1 so the function resolves
    # on TF 2.x as well, where `tf.metrics.auc` is no longer available.
    return {
        'auc_precision_recall': tf.compat.v1.metrics.auc(
            labels=labels, predictions=predictions['logistic'], num_thresholds=200,
            curve='PR', summation_method='careful_interpolation')
    }

def metric_recall_0(labels, predictions):
    """Return recall for class 0 as an eval metric.

    Args:
        labels: Ground-truth label tensor (0 or 1).
        predictions: Estimator predictions dict; its 'class_ids' entry
            (the predicted class) is used.

    Returns:
        Dict mapping 'recall_0' to the streaming recall metric, where a
        "positive" is a row whose label is 0.
    """
    # The recall metric casts both inputs to bool and measures recall of the
    # True class. Feeding it the raw labels and the 'logistic' probabilities
    # (non-zero, hence True after the cast) measured class-1 recall and
    # reported ~1.0; the `name` argument does not select a class. Comparing
    # both tensors to 0 makes class 0 the True class.
    return {
        'recall_0': tf.compat.v1.metrics.recall(
            labels=tf.equal(labels, 0),
            predictions=tf.equal(predictions['class_ids'], 0),
            name='0')
    }

def metric_recall_1(labels, predictions):
    """Return recall for class 1 as an eval metric.

    Args:
        labels: Ground-truth label tensor (0 or 1).
        predictions: Estimator predictions dict; its 'class_ids' entry
            (the predicted class) is used.

    Returns:
        Dict mapping 'recall_1' to the streaming recall metric.
    """
    # The recall metric casts predictions to bool, so it needs hard class
    # decisions: the 'logistic' probabilities are non-zero and would all
    # become True, reporting a recall of ~1.0 regardless of the model.
    return {
        'recall_1': tf.compat.v1.metrics.recall(
            labels=labels, predictions=predictions['class_ids'], name='1')
    }

########################################################

#NOW FOR CATEGORICAL COLUMNS...

print('Now encoding categorical columns')

# Unused example of a vocabulary-list encoding (not executed):
#   relationship = fc.categorical_column_with_vocabulary_list(
#       'relationship',
#       ['Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', 'Other-relative'])
#
#   print(fc.input_layer(feature_batch, [age, fc.indicator_column(relationship)]))

# Hash-bucket encodings for the four categorical features, all with the same
# number of buckets. NOTE(review): the DNN grid search in this file is built
# from my_numeric_columns only, so these columns are defined but not fed to
# the model here.
hash_bucket_count = 100000

home_ownership = tf.feature_column.categorical_column_with_hash_bucket(
    key='home_ownership', hash_bucket_size=hash_bucket_count)
verification_status = tf.feature_column.categorical_column_with_hash_bucket(
    key='verification_status', hash_bucket_size=hash_bucket_count)
purpose = tf.feature_column.categorical_column_with_hash_bucket(
    key='purpose', hash_bucket_size=hash_bucket_count)
application_type = tf.feature_column.categorical_column_with_hash_bucket(
    key='application_type', hash_bucket_size=hash_bucket_count)



print('actual DNN Grid Search with non-linearity')

# Evaluation metrics for every (layer_1, layer_2) combination. They are kept
# because clear_output() wipes the cell output on each iteration; without
# this, only the last configuration's numbers survived, with no header
# saying which configuration they belonged to.
grid_results = {}

for layer_1 in [5, 10, 15, 20, 30]:
    for layer_2 in [1, 3, 5, 10]:

        # Two hidden layers with tanh activations and 20% dropout, trained
        # on the numeric feature columns only.
        model_l2 = tf.estimator.DNNClassifier(
            hidden_units=[layer_1, layer_2],
            feature_columns=my_numeric_columns,
            activation_fn=tf.nn.tanh,
            dropout=0.2,
            optimizer="Adam")

        # Optional extra metric; tf.contrib is only available on TF 1.x:
        #model_l2 = tf.contrib.estimator.add_metrics(model_l2, metric_auc)

        model_l2.train(train_inpf)

        results = model_l2.evaluate(test_inpf)
        grid_results[(layer_1, layer_2)] = results

        # Clear the training/evaluation log noise first, then print the
        # header so it is not erased together with the logs.
        clear_output()
        print('TEST RESULTS ', layer_1, layer_2)
        for key in sorted(results):
            print('%s: %0.2f' % (key, results[key]))

# Final summary of every configuration, since the per-iteration output above
# only shows the most recent one.
print('GRID SEARCH SUMMARY')
for (units_1, units_2), metrics in sorted(grid_results.items()):
    summary = ', '.join('%s=%0.2f' % (name, metrics[name]) for name in sorted(metrics))
    print('hidden_units=[%d, %d]: %s' % (units_1, units_2, summary))

##########################################

accuracy: 0.52
accuracy_baseline: 0.50
auc: 0.54
auc_precision_recall: 0.53
average_loss: 0.72
global_step: 60.00
label/mean: 0.50
loss: 0.72
precision: 0.52
prediction/mean: 0.51
recall: 0.55
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\ASUSN5~1\\AppData\\Local\\Temp\\tmpaors_lad', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_t