   # CREDIT RISK MODEL USING TENSORFLOW

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split

In [2]:
data = pd.read_csv('/Users/Shared/TensorFlow/tensorflow_tryouts/lending-club-data.csv', low_memory=False)

In [5]:
print data.columns

Index([u'id', u'member_id', u'loan_amnt', u'funded_amnt', u'funded_amnt_inv',
       u'term', u'int_rate', u'installment', u'grade', u'sub_grade',
       u'emp_title', u'emp_length', u'home_ownership', u'annual_inc',
       u'is_inc_v', u'issue_d', u'loan_status', u'pymnt_plan', u'url', u'desc',
       u'purpose', u'title', u'zip_code', u'addr_state', u'dti',
       u'delinq_2yrs', u'earliest_cr_line', u'inq_last_6mths',
       u'mths_since_last_delinq', u'mths_since_last_record', u'open_acc',
       u'pub_rec', u'revol_bal', u'revol_util', u'total_acc',
       u'initial_list_status', u'out_prncp', u'out_prncp_inv', u'total_pymnt',
       u'total_pymnt_inv', u'total_rec_prncp', u'total_rec_int',
       u'total_rec_late_fee', u'recoveries', u'collection_recovery_fee',
       u'last_pymnt_d', u'last_pymnt_amnt', u'next_pymnt_d',
       u'last_credit_pull_d', u'collections_12_mths_ex_med',
       u'mths_since_last_major_derog', u'policy_code', u'not_compliant',
       u'status', u'inactiv

In [4]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

target = 'safe_loans'                   # prediction target (y) (+1 means safe, -1 is risky)

In [5]:
data[target] = data['bad_loans'].apply(lambda x : +1 if x==0 else -1)

In [8]:
data['short_emp'] = data['short_emp'].astype('str')
data['last_delinq_none'] = data['last_delinq_none'].astype('str')
data['last_major_derog_none'] = data['last_major_derog_none'].astype('str')

In [9]:
print data.dtypes

id                               int64
member_id                        int64
loan_amnt                        int64
funded_amnt                      int64
funded_amnt_inv                  int64
term                            object
int_rate                       float64
installment                    float64
grade                           object
sub_grade                       object
emp_title                       object
emp_length                      object
home_ownership                  object
annual_inc                     float64
is_inc_v                        object
issue_d                         object
loan_status                     object
pymnt_plan                      object
url                             object
desc                            object
purpose                         object
title                           object
zip_code                        object
addr_state                      object
dti                            float64
delinq_2yrs              

In [10]:
data[features]

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee
0,B,B2,0,11,RENT,27.65,credit_card,36 months,1,1,83.70,0.00
1,C,C4,1,1,RENT,1.00,car,60 months,1,1,9.40,0.00
2,C,C5,0,11,RENT,8.72,small_business,36 months,1,1,98.50,0.00
3,C,C1,0,11,RENT,20.00,other,36 months,0,1,21.00,16.97
4,A,A4,0,4,RENT,11.20,wedding,36 months,1,1,28.30,0.00
5,E,E1,0,10,RENT,5.35,car,36 months,1,1,87.50,0.00
6,F,F2,0,5,OWN,5.55,small_business,60 months,1,1,32.60,0.00
7,B,B5,1,1,RENT,18.08,other,60 months,1,1,36.50,0.00
8,C,C3,0,6,OWN,16.12,debt_consolidation,60 months,1,1,20.60,0.00
9,B,B5,0,11,OWN,10.78,debt_consolidation,36 months,1,1,67.10,0.00


In [6]:
CATEGORICAL_COLUMNS = ["grade", "sub_grade", "home_ownership",
                       "purpose", "term"]
CONTINUOUS_COLUMNS = ["emp_length_num", "dti", "revol_util", "total_rec_late_fee", "short_emp", "last_delinq_none", "last_major_derog_none"]

LABEL_COLUMN = [target]

In [7]:
training_set, test_set = train_test_split(data, test_size = 0.2)

In [8]:
print len(training_set)
print len(test_set)

98085
24522


In [9]:
def input_fn(df):
    # Creates a dictionary mapping from each continuous feature column name (k) to
    # the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values)
                       for k in CONTINUOUS_COLUMNS}
    # Creates a dictionary mapping from each categorical feature column name (k)
    # to the values of that column stored in a tf.SparseTensor.
    categorical_cols = {k: tf.SparseTensor(
        indices=[[i, 0] for i in range(df[k].size)],
        values=df[k].values,
        shape=[df[k].size, 1])
                        for k in CATEGORICAL_COLUMNS}
    # Merges the two dictionaries into one.
    feature_cols = dict(continuous_cols.items() + categorical_cols.items())
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label

def train_input_fn():
    return input_fn(training_set)

def eval_input_fn():
    return input_fn(test_set)

In [10]:
grade_keys = list(data['grade'].unique())
sub_grade_keys = list(data['sub_grade'].unique())
home_ownership_keys = list(data['home_ownership'].unique())
purpose_keys = list(data['purpose'].unique())
term_keys = list(data['term'].unique())

In [47]:
print "grade_keys: ",grade_keys
print "sub_grade_keys: ",sub_grade_keys
print "home_ownership_keys: ",home_ownership_keys
print "purpose_keys: ",purpose_keys
print "term_keys: ",term_keys

grade_keys:  ['B', 'C', 'A', 'E', 'F', 'D', 'G']
sub_grade_keys:  ['B2', 'C4', 'C5', 'C1', 'A4', 'E1', 'F2', 'B5', 'C3', 'B1', 'D1', 'A1', 'B3', 'B4', 'C2', 'D2', 'A3', 'A5', 'D5', 'A2', 'E4', 'D3', 'D4', 'F3', 'E3', 'F1', 'E5', 'G4', 'E2', 'G2', 'F5', 'F4', 'G5', 'G1', 'G3']
home_ownership_keys:  ['RENT', 'OWN', 'MORTGAGE', 'OTHER']
purpose_keys:  ['credit_card', 'car', 'small_business', 'other', 'wedding', 'debt_consolidation', 'home_improvement', 'major_purchase', 'medical', 'moving', 'vacation', 'house']
term_keys:  [' 36 months', ' 60 months']


In [12]:
grade = tf.contrib.layers.sparse_column_with_keys(
  column_name="grade", keys=grade_keys)
sub_grade = tf.contrib.layers.sparse_column_with_keys(
  column_name="sub_grade", keys=sub_grade_keys)
home_ownership = tf.contrib.layers.sparse_column_with_keys(
  column_name="home_ownership", keys=home_ownership_keys)
purpose = tf.contrib.layers.sparse_column_with_keys(
  column_name="purpose", keys=purpose_keys)
term = tf.contrib.layers.sparse_column_with_keys(
  column_name="term", keys=term_keys)




In [13]:
emp_length_num = tf.contrib.layers.real_valued_column("emp_length_num")
dti = tf.contrib.layers.real_valued_column("dti")
revol_util = tf.contrib.layers.real_valued_column("revol_util")
total_rec_late_fee = tf.contrib.layers.real_valued_column("total_rec_late_fee")
short_emp = tf.contrib.layers.real_valued_column("short_emp")
last_delinq_none = tf.contrib.layers.real_valued_column("last_delinq_none")
last_major_derog_none = tf.contrib.layers.real_valued_column("last_major_derog_none")

In [14]:
import tempfile

In [15]:
model_dir = tempfile.mkdtemp()
model = tf.contrib.learn.LinearClassifier(feature_columns=[grade, sub_grade, short_emp, emp_length_num, 
                                                           home_ownership, dti, purpose, term, last_delinq_none, 
                                                           last_major_derog_none, revol_util, total_rec_late_fee],
  model_dir=model_dir)

In [16]:
model.fit(input_fn=train_input_fn, steps=200)



LinearClassifier()

In [17]:
results = model.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print "%s: %s" % (key, results[key])



accuracy: 0.675353
eval_auc: 0.0
loss: 0.203671


In [24]:
model_reg_dir = tempfile.mkdtemp()
model_reg = tf.contrib.learn.LinearClassifier(feature_columns=[grade, sub_grade, short_emp, emp_length_num, 
                                                           home_ownership, dti, purpose, term, last_delinq_none, 
                                                           last_major_derog_none, revol_util, total_rec_late_fee],
                                          optimizer=tf.train.FtrlOptimizer(
                                                        learning_rate=0.1,
                                                        l1_regularization_strength=5,
                                                        l2_regularization_strength=5),
                                        model_dir=model_dir)

In [25]:
model_reg.fit(input_fn=train_input_fn, steps=200)



LinearClassifier()

In [26]:
results = model_reg.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print "%s: %s" % (key, results[key])



accuracy: 0.675353
eval_auc: 0.0
loss: 0.203671


In [28]:
model.linear_weights_

{'linear/dti_weight': array([[-0.03805282]], dtype=float32),
 'linear/emp_length_num_weight': array([[ 0.02238518]], dtype=float32),
 'linear/grade_weights': array([[ 0.45563677],
        [ 0.15242699],
        [ 1.0327388 ],
        [-0.37879944],
        [-0.67924559],
        [-0.19041438],
        [-0.27120754]], dtype=float32),
 'linear/home_ownership_weights': array([[-0.05935721],
        [ 0.06286756],
        [ 0.31478202],
        [-0.02970558]], dtype=float32),
 'linear/last_delinq_none_weight': array([[-0.10200381]], dtype=float32),
 'linear/last_major_derog_none_weight': array([[ 0.04009672]], dtype=float32),
 'linear/purpose_weights': array([[ 0.35529289],
        [ 0.15436907],
        [-0.71492136],
        [-0.09633061],
        [ 0.12145796],
        [ 0.15583244],
        [ 0.05556543],
        [ 0.13654606],
        [-0.05691161],
        [ 0.02876095],
        [ 0.00216244],
        [ 0.03379225]], dtype=float32),
 'linear/revol_util_weight': array([[-0.00319041]],

# ADDING MORE FEATURES

In [29]:
features = ['grade',                     # grade of the loan (categorical)
            'sub_grade_num',             # sub-grade of the loan as a number from 0 to 1
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'payment_inc_ratio',         # ratio of the monthly payment to income
            'delinq_2yrs',               # number of delinquincies 
            'delinq_2yrs_zero',          # no delinquincies in last 2 years
            'inq_last_6mths',            # number of creditor inquiries in last 6 months
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'open_acc',                  # number of open credit accounts
            'pub_rec',                   # number of derogatory public records
            'pub_rec_zero',              # no derogatory public records
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
            'int_rate',                  # interest rate of the loan
            'total_rec_int',             # interest received to date
            'annual_inc',                # annual income of borrower
            'funded_amnt',               # amount committed to the loan
            'funded_amnt_inv',           # amount committed by investors for the loan
            'installment',               # monthly payment owed by the borrower
           ]

In [39]:
tensor_features = {}
CATEGORICAL_COLUMNS = []
CONTINUOUS_COLUMNS = []

for i in features:
    if (data[i].dtype == 'int64') or (data[i].dtype == 'float64'):
        CONTINUOUS_COLUMNS.append(i) 
        tensor_features[i] = tf.contrib.layers.real_valued_column(i)
    else:
        CATEGORICAL_COLUMNS.append(i)
        tensor_features[i] = tf.contrib.layers.sparse_column_with_keys(
                              column_name=i, keys=list(data[i].unique()))

In [37]:
print tensor_features['grade']

_SparseColumn(column_name='grade', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=('B', 'C', 'A', 'E', 'F', 'D', 'G'), num_oov_buckets=0, vocab_size=7, default_value=-1), weight_column=None, combiner='sum', dtype=tf.string)


In [38]:
print tensor_features['short_emp']

_RealValuedColumn(column_name='short_emp', dimension=1, default_value=None, dtype=tf.float32)


In [40]:
CATEGORICAL_COLUMNS

['grade', 'home_ownership', 'purpose']

In [41]:
CONTINUOUS_COLUMNS

['sub_grade_num',
 'short_emp',
 'emp_length_num',
 'dti',
 'payment_inc_ratio',
 'delinq_2yrs',
 'delinq_2yrs_zero',
 'inq_last_6mths',
 'last_delinq_none',
 'last_major_derog_none',
 'open_acc',
 'pub_rec',
 'pub_rec_zero',
 'revol_util',
 'total_rec_late_fee',
 'int_rate',
 'total_rec_int',
 'annual_inc',
 'funded_amnt',
 'funded_amnt_inv',
 'installment']

In [43]:
model_dir = tempfile.mkdtemp()
model = tf.contrib.learn.LinearClassifier(feature_columns=[tensor_features['grade'],                     # grade of the loan (categorical)
            tensor_features['sub_grade_num'],             # sub-grade of the loan as a number from 0 to 1
            tensor_features['short_emp'],                 # one year or less of employment
            tensor_features['emp_length_num'],            # number of years of employment
            tensor_features['home_ownership'],            # home_ownership status: own, mortgage or rent
            tensor_features['dti'],                       # debt to income ratio
            tensor_features['purpose'],                   # the purpose of the loan
            tensor_features['payment_inc_ratio'],         # ratio of the monthly payment to income
            tensor_features['delinq_2yrs'],               # number of delinquincies 
            tensor_features['delinq_2yrs_zero'],          # no delinquincies in last 2 years
            tensor_features['inq_last_6mths'],            # number of creditor inquiries in last 6 months
            tensor_features['last_delinq_none'],          # has borrower had a delinquincy
            tensor_features['last_major_derog_none'],     # has borrower had 90 day or worse rating
            tensor_features['open_acc'],                  # number of open credit accounts
            tensor_features['pub_rec'],                   # number of derogatory public records
            tensor_features['pub_rec_zero'],              # no derogatory public records
            tensor_features['revol_util'],                # percent of available credit being used
            tensor_features['total_rec_late_fee'],        # total late fees received to day
            tensor_features['int_rate'],                  # interest rate of the loan
            tensor_features['total_rec_int'],             # interest received to date
            tensor_features['annual_inc'],                # annual income of borrower
            tensor_features['funded_amnt'],               # amount committed to the loan
            tensor_features['funded_amnt_inv'],           # amount committed by investors for the loan
            tensor_features['installment']],               # monthly payment owed by the borrower
  model_dir=model_dir)

In [49]:
model.fit(input_fn=train_input_fn, steps=200)

ERROR:tensorflow:Model diverged with loss = NaN.


NanLossDuringTrainingError: NaN loss during training.

In [50]:
results = model.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print "%s: %s" % (key, results[key])



accuracy: 0.813147
eval_auc: 0.0
loss: nan
