In [4]:
import tensorflow as tf
import tensorflow.feature_column as fc
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
from IPython.display import clear_output
from sklearn.impute import SimpleImputer
from sklearn.metrics import recall_score
import pandas as pd
import copy
import random
import functools
random.seed(10)

# Load and preprocess data
data = pd.read_csv('input_file_2.csv', sep=',', index_col=0)
data['issue_d'] = pd.to_datetime(data['issue_d'])

# Split data into training and test sets
train_df = data.loc[data['issue_d'] < data['issue_d'].quantile(0.75)]
test_df = data.loc[data['issue_d'] >= data['issue_d'].quantile(0.75)]

# Drop the 'issue_d' column
train_df = train_df.drop('issue_d', axis=1)
test_df = test_df.drop('issue_d', axis=1)

# Define columns to scale
all_cols = list(train_df.columns)
all_cols.remove('charged_off')
to_drop_categorical = ['home_ownership', 'verification_status', 'purpose', 'application_type']
for i in to_drop_categorical:
    all_cols.remove(i)

# Fill null values by mean imputation
train_df[all_cols] = train_df[all_cols].fillna(train_df[all_cols].mean())
test_df[all_cols] = test_df[all_cols].fillna(train_df[all_cols].mean())

# Scale values of numerical columns
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(copy=False)
train_df[all_cols] = scaler.fit_transform(train_df[all_cols])
test_df[all_cols] = scaler.transform(test_df[all_cols])

# Balance classes for training and testing
train_dat_1s = train_df[train_df['charged_off'] == 1]
train_dat_0s = train_df[train_df['charged_off'] == 0]
keep_0s = train_dat_0s.sample(frac=train_dat_1s.shape[0]/train_dat_0s.shape[0])
train_df = pd.concat([keep_0s,train_dat_1s],axis=0)

test_dat_1s = test_df[test_df['charged_off'] == 1]
test_dat_0s = test_df[test_df['charged_off'] == 0]
keep_0s = test_dat_0s.sample(frac=test_dat_1s.shape[0]/test_dat_0s.shape[0])
test_df = pd.concat([keep_0s,test_dat_1s],axis=0)

# Define input function
def easy_input_function(df, label_key, num_epochs, shuffle, batch_size):
  label = df[label_key]
  ds = tf.data.Dataset.from_tensor_slices((dict(df),label))
  if shuffle:
    ds = ds.shuffle(10000)
  ds = ds.batch(batch_size).repeat(num_epochs)
  return ds

# Define training and test input functions
train_inpf = functools.partial(easy_input_function, train_df, label_key='charged_off',  num_epochs=5, shuffle=True, batch_size=20000)
test_inpf = functools.partial(easy_input_function, test_df, label_key='charged_off', num_epochs=1, shuffle=False, batch_size=200000)

# Define all numeric columns
loan_amnt = fc.numeric_column('loan_amnt')
term = fc.numeric_column('term')
installment = fc.numeric_column('installment')
emp_length = fc.numeric_column('emp_length')
dti = fc.numeric_column('dti')
earliest_cr_line = fc.numeric_column('earliest_cr_line')
open_acc = fc.numeric_column('open_acc')
pub_rec = fc.numeric_column('pub_rec')
revol_util = fc.numeric_column('revol_util')
total_acc = fc.numeric_column('total_acc')
mort_acc = fc.numeric_column('mort_acc')
pub_rec_bankruptcies = fc.numeric_column('pub_rec_bankruptcies')
log_annual_inc = fc.numeric_column('log_annual_inc')
fico_score = fc.numeric_column('fico_score')
log_revol_bal = fc.numeric_column('log_revol_bal')

my_numeric_columns = [loan_amnt, term, installment, emp_length, dti, earliest_cr_line, open_acc, pub_rec, revol_util, total_acc, mort_acc, pub_rec_bankruptcies, log_annual_inc, fico_score, log_revol_bal]

# Define metrics
def metric_auc(labels, predictions):
    return {
        'auc_precision_recall': tf.metrics.AUC(
            labels=labels, predictions=predictions['logistic'], num_thresholds=200,
            curve='PR', summation_method='careful_interpolation')
    }

# Train model on all numeric columns
classifier = tf.estimator.LinearClassifier(feature_columns=my_numeric_columns)
#classifier = classifier.add_metrics(metric_auc)
classifier.train(train_inpf)
result = classifier.evaluate(test_inpf)

# Print results
clear_output()
for key,value in sorted(result.items()):
  print('%s: %s' % (key, value))



accuracy: 0.6339758
accuracy_baseline: 0.5
auc: 0.684344
auc_precision_recall: 0.6638764
average_loss: 0.6440559
global_step: 60
label/mean: 0.5
loss: 0.6440559
precision: 0.62323457
prediction/mean: 0.5219454
recall: 0.6775562
