# CREDIT RISK MODEL USING NEURAL NETWORKS

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
import datetime

In [2]:
data = pd.read_csv('/home/praveen/Documents/Machine Learning/Tensorflow/tensorflow_tryouts/lending-club-data.csv', low_memory=False)

In [3]:
len(data)

122607

In [4]:
list(data.columns)

['id',
 'member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'is_inc_v',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'url',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'last_credit_pull_d',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'policy_code',
 'not_compliant',
 'status',
 'inactive_loans',
 'bad_loans',
 'emp_length_num',
 'grade_num',
 'sub_grade_num',
 'delinq_2yrs_zero',
 'pub_rec

In [5]:
data['loan_status'].unique()

array(['Fully Paid', 'Charged Off', 'Default',
       'Does not meet the credit policy.  Status:Charged Off',
       'Does not meet the credit policy.  Status:Fully Paid'], dtype=object)

In [6]:
data['safe_loans'] = data['bad_loans'].apply(lambda x : +1 if x==0 else 0)

In [7]:
data

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none,safe_loans
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,1,1,1,0,8.143500,20141201T000000,1,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,1,1,1,1,2.393200,20161201T000000,1,1,1,0
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1,1,1,0,8.259550,20141201T000000,1,1,1,1
3,1076863,1277178,10000,10000,10000,36 months,13.49,339.31,C,C1,...,1,1,1,0,8.275850,20141201T000000,0,1,1,1
4,1075269,1311441,5000,5000,5000,36 months,7.90,156.46,A,A4,...,1,1,1,0,5.215330,20141201T000000,1,1,1,1
5,1072053,1288686,3000,3000,3000,36 months,18.64,109.43,E,E1,...,1,1,1,0,2.735750,20141201T000000,1,1,1,1
6,1071795,1306957,5600,5600,5600,60 months,21.28,152.39,F,F2,...,1,1,1,0,4.571700,20161201T000000,1,1,1,0
7,1071570,1306721,5375,5375,5350,60 months,12.69,121.45,B,B5,...,1,1,1,1,9.716000,20161201T000000,1,1,1,0
8,1070078,1305201,6500,6500,6500,60 months,14.65,153.45,C,C3,...,1,1,1,0,2.557500,20161201T000000,1,1,1,1
9,1069908,1305008,12000,12000,12000,36 months,12.69,402.54,B,B5,...,1,1,1,0,6.440640,20141201T000000,1,1,1,1


In [8]:
print sum(data['safe_loans']==0)
print sum(data['safe_loans']==1)

23150
99457


In [9]:
col_subset = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length',                # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
            'safe_loans'      
           ]

In [10]:
data_clean  = data[col_subset].dropna()

In [11]:
training_set, test_set = train_test_split(data_clean, test_size = 0.2)

In [12]:
print sum(training_set['safe_loans']==0)
print sum(training_set['safe_loans']==1)

18450
79635


In [13]:
print sum(test_set['safe_loans']==0)
print sum(test_set['safe_loans']==1)

4700
19822


In [14]:
def input_fn(df):
    # Creates a dictionary mapping from each continuous feature column name (k) to
    # the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values)
                       for k in CONTINUOUS_COLUMNS}
    # Creates a dictionary mapping from each categorical feature column name (k)
    # to the values of that column stored in a tf.SparseTensor.
    categorical_cols = {k: tf.SparseTensor(
        indices=[[i, 0] for i in range(df[k].size)],
        values=df[k].values,
        shape=[df[k].size, 1])
                        for k in CATEGORICAL_COLUMNS}
    # Merges the two dictionaries into one.
    feature_cols = dict(continuous_cols.items() + categorical_cols.items())
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label

def train_input_fn():
    return input_fn(training_set)

def eval_input_fn():
    return input_fn(test_set)

In [15]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length',                # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

LABEL_COLUMN = 'safe_loans'              # prediction target (y) (+1 means safe, -1 is risky)

In [16]:
tensor_features = {}
CATEGORICAL_COLUMNS = []
CONTINUOUS_COLUMNS = []

for i in features:
    if (data[i].dtype == 'int64') or (data[i].dtype == 'float64'):
        CONTINUOUS_COLUMNS.append(i) 
        tensor_features[i] = tf.contrib.layers.real_valued_column(i)
    else:
        CATEGORICAL_COLUMNS.append(i)
        tensor_features[i] = tf.contrib.layers.sparse_column_with_keys(
                              column_name=i, keys=list(data[i].unique()))

In [17]:
CATEGORICAL_COLUMNS

['grade', 'sub_grade', 'emp_length', 'home_ownership', 'purpose', 'term']

In [18]:
CONTINUOUS_COLUMNS

['short_emp',
 'dti',
 'last_delinq_none',
 'last_major_derog_none',
 'revol_util',
 'total_rec_late_fee']

In [19]:
wide_columns = [tensor_features['short_emp'],
                tensor_features['dti'], 
                tensor_features['last_delinq_none'], 
                tensor_features['last_major_derog_none'], 
                tensor_features['total_rec_late_fee']]

In [20]:
deep_columns = [tf.contrib.layers.embedding_column(tensor_features['grade'], dimension=8),
                tf.contrib.layers.embedding_column(tensor_features['sub_grade'], dimension=8),
                tf.contrib.layers.embedding_column(tensor_features['emp_length'], dimension=8),
                tf.contrib.layers.embedding_column(tensor_features['purpose'], dimension=8),
                tf.contrib.layers.embedding_column(tensor_features['home_ownership'], dimension=8),
                tf.contrib.layers.embedding_column(tensor_features['term'], dimension=8)
               ]

In [21]:
feature_columns = wide_columns + deep_columns

In [22]:
import tempfile

In [31]:
model_dir = tempfile.mkdtemp()
model = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                            hidden_units=[10, 30],
                                            n_classes=2,
                                            model_dir=model_dir)

In [32]:
print datetime.datetime.now()
model.fit(input_fn=train_input_fn, steps=2000)
print datetime.datetime.now()



2016-09-04 21:06:38.802439
2016-09-04 21:27:06.615079


In [33]:
results = model.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print "%s: %s" % (key, results[key])



accuracy: 0.811394
eval_auc: 0.704114
loss: 0.446315


In [34]:
pred_labels = model.predict(input_fn=eval_input_fn)



In [35]:
print sum(pred_labels==0)
print sum(pred_labels==1)

417
24105


In [36]:
true_labels = np.array(test_set['safe_loans']).astype(int)
pred_labels = np.array(pred_labels).astype(int)

In [37]:
true_positive = sum((pred_labels == 1) & (true_labels == 1)) 
true_negative = sum((pred_labels == 0) & (true_labels == 0))
false_positive = sum((pred_labels == 1) & (true_labels == 0))
false_negative = sum((pred_labels == 0) & (true_labels == 1))

In [38]:
print 'True Positive: ',true_positive
print 'False Positive: ',false_positive
print 'True Negative: ',true_negative
print 'False Negative: ',false_negative

True Positive:  19651
False Positive:  4454
True Negative:  246
False Negative:  171


# Predicting a bigger dataset

In [58]:
data = pd.read_csv('/home/praveen/Documents/Data/Loan/loan.csv', low_memory=False)

In [59]:
data['safe_loans'] = data['loan_status'].apply(lambda x : 0 
                                               if x in ["Charged Off ",
                                                        "Default",
                                                        "Does not meet the credit policy. Status:Charged Off",
                                                        "In Grace Period", 
                                                        "Default Receiver", 
                                                        "Late (16-30 days)",
                                                        "Late (31-120 days)"] 
                                               else +1)

In [60]:
data['last_delinq_none'] = data['mths_since_last_delinq'].apply(lambda x : 0 if x > 0 else +1)

In [61]:
data['last_major_derog_none'] = data['mths_since_last_major_derog'].apply(lambda x : 0 if x > 0 else +1)

In [62]:
data['short_emp'] = data['emp_length'].apply(lambda x : +1 if x == '< 1 year' else 0)

In [63]:
data_clean  = data[col_subset].dropna()

In [66]:
training_set, test_set = train_test_split(data_clean, test_size = 0.2)

In [67]:
print sum(test_big_set['safe_loans']==0)
print sum(test_big_set['safe_loans']==1)

4623
19899


In [73]:
results = model.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print "%s: %s" % (key, results[key])



accuracy: 0.964572
eval_auc: 0.66799
loss: 0.271914


In [68]:
pred_labels = model.predict(input_fn=eval_input_fn)



In [69]:
print sum(pred_labels==0)
print sum(pred_labels==1)

2677
174699


In [70]:
true_labels = np.array(test_set['safe_loans']).astype(int)
pred_labels = np.array(pred_labels).astype(int)

In [71]:
true_positive = sum((pred_labels == 1) & (true_labels == 1)) 
true_negative = sum((pred_labels == 0) & (true_labels == 0))
false_positive = sum((pred_labels == 1) & (true_labels == 0))
false_negative = sum((pred_labels == 0) & (true_labels == 1))

In [72]:
print 'True Positive: ',true_positive
print 'False Positive: ',false_positive
print 'True Negative: ',true_negative
print 'False Negative: ',false_negative

True Positive:  170676
False Positive:  4023
True Negative:  416
False Negative:  2261
