In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [2]:
# Location of the loan CSV; replace it with the correct path for your data.
loan_stats_source = 'https://www.dropbox.com/s/0so14yudedjmm5m/LoanStats3d.csv?dl=1'

# header=1 takes the column names from the second line of the file,
# skipinitialspace strips blanks that follow each delimiter, and
# low_memory=False makes pandas read the file in one pass instead of chunks.
y2015 = pd.read_csv(
    loan_stats_source,
    header=1,
    skipinitialspace=True,
    low_memory=False,
)

In [3]:
# Keep the rows whose id is not an empty string. The explicit .copy() makes
# df an independent frame, so the column assignments below do not write into
# a view of y2015 (which would raise SettingWithCopyWarning).
df = y2015.loc[y2015["id"] != ""].copy()

# Convert ID and Interest Rate to numeric; values that cannot be parsed
# become NaN (errors='coerce'). The trailing '%' is stripped from int_rate first.
df["id"] = pd.to_numeric(df["id"], errors="coerce")
df["int_rate"] = pd.to_numeric(df["int_rate"].str.strip("%"), errors="coerce")

# Drop other columns with many unique values. The `columns=` keyword is used
# because pandas >= 2.0 no longer accepts the axis as a positional argument.
df = df.drop(columns=[
    "url", "emp_title", "zip_code", "earliest_cr_line", "revol_util",
    "sub_grade", "addr_state", "desc",
])

# Discard the last two rows.
# NOTE(review): presumably these are non-data footer lines of the CSV -- confirm.
df = df.iloc[:-2]

# The previous unassigned pd.get_dummies(df) call was removed: its result was
# thrown away, so it only cost time and memory.
df.head(1)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,emp_length,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,68009401.0,72868139.0,16000.0,16000.0,16000.0,60 months,14.85,379.39,C,10+ years,...,0.0,2.0,78.9,0.0,0.0,2.0,298100.0,31329.0,281300.0,13400.0


In [4]:
# For every object-dtype column of df, print the column name followed by
# the number of distinct (non-null) values it holds.
categorical = df.select_dtypes(include=['object'])
for col_name, col_values in categorical.items():
    print(col_name)
    print(col_values.nunique())

term
2
grade
7
emp_length
11
home_ownership
4
verification_status
3
issue_d
12
loan_status
7
pymnt_plan
1
purpose
14
title
27
initial_list_status
2
last_pymnt_d
25
next_pymnt_d
4
last_credit_pull_d
26
application_type
2
verification_status_joint
3


In [5]:
# Baseline model on the full feature set.
# A fixed random_state makes the forest (and therefore the cross-validation
# scores) reproducible between runs.
rfc = ensemble.RandomForestClassifier(random_state=42)

# Features are every column except the target. `columns=` is used because
# pandas >= 2.0 rejects the axis as a positional argument.
X = df.drop(columns="loan_status", errors="ignore")
Y = df["loan_status"]

# One-hot encode the remaining text columns, then discard every feature
# column that contains at least one missing value.
X = pd.get_dummies(X)
X = X.dropna(axis=1)

In [6]:
# Please uncomment. This step runs for a long time.
#cross_val_score(rfc, X, Y, cv=10)

### So here's your task. Get rid of as much data as possible without dropping below an average of 90% accuracy in a 10-fold cross validation.

You'll want to do a few things in this process. First, dive into the data that we have and see which features are most important. This can be the raw features or the generated dummies. You may want to use PCA or correlation matrices.

Can you do it without using anything related to payment amount or outstanding principal? How do you know?

#### Answer: No, I could not. The variance of the cross-validation scores is too high (see below).

In [7]:
# Pairwise correlation matrix of the numeric (and boolean) columns of df.
# The columns are selected explicitly because DataFrame.corr() no longer
# skips text columns by default on pandas >= 2.0 and would raise on them.
c = df.select_dtypes(include=["number", "bool"]).corr()

# Turn the row labels into a regular "index" column so that a row can be
# looked up by the name of the column it describes.
c = c.reset_index()
c.head(1)

Unnamed: 0,index,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,id,1.0,0.99754,-0.008288,-0.008288,-0.008554,-0.053402,-0.012919,0.015055,0.005692,...,-0.000612,0.008886,-0.010366,-0.034963,0.004595,-0.000991,0.015609,0.012864,0.017944,0.02087


In [8]:
# Drop some columns that are highly correlated with total_pymnt and out_prncp columns.
def drop_correlated_columns(frame, corr, reference, banner, threshold=0.4,
                            protected=("out_prncp", "total_pymnt")):
    """Return `frame` without the columns that correlate strongly with `reference`.

    Parameters
    ----------
    frame : DataFrame whose columns are examined.
    corr : correlation table with an "index" column naming the row's variable
        and one column per variable (a correlation matrix after reset_index()).
    reference : name of the column in `corr` to compare against.
    banner : line printed before each column that gets dropped.
    threshold : a column is dropped when its correlation with `reference`
        is strictly greater than this value. Only positive correlations are
        considered. NOTE(review): strong negative correlations are kept --
        confirm this is intended.
    protected : column names that are never dropped; they are reported with
        a "*********" prefix instead.

    Returns
    -------
    A new DataFrame; `frame` itself is not modified.
    """
    to_drop = []
    for col in frame.columns:
        if col in protected:
            print("*********" + col)
            continue
        # Row of the correlation table for this column, kept only if it
        # exceeds the threshold. Columns absent from `corr` give an empty result.
        match = corr.loc[
            (corr["index"] == col) & (corr[reference] > threshold),
            [reference],
        ]
        if not match.empty:
            print(banner)
            print("key " + col)
            print(match)
            to_drop.append(col)
    # Single drop by keyword: pandas >= 2.0 rejects a positional axis argument.
    return frame.drop(columns=to_drop)


# First prune against total_pymnt, then prune what is left against out_prncp.
df = drop_correlated_columns(
    df, c, "total_pymnt", "=============total_pymnt======================="
)
df = drop_correlated_columns(
    df, c, "out_prncp", "======out_prncp================================"
)

key loan_amnt
   total_pymnt
2     0.706184
key funded_amnt
   total_pymnt
3     0.706184
key funded_amnt_inv
   total_pymnt
4     0.706259
key installment
   total_pymnt
6     0.732579
*********out_prncp
*********total_pymnt
key total_pymnt_inv
    total_pymnt
20     0.999997
key total_rec_prncp
    total_pymnt
21     0.964876
key total_rec_int
    total_pymnt
22     0.460466
key last_pymnt_amnt
    total_pymnt
26     0.742504
*********out_prncp
key out_prncp_inv
    out_prncp
18   0.999997
*********total_pymnt


In [9]:
# Model on the current (reduced) contents of df.
# A fixed random_state makes the forest (and therefore the cross-validation
# scores) reproducible between runs.
rfc = ensemble.RandomForestClassifier(random_state=42)

# Features are every column except the target. `columns=` is used because
# pandas >= 2.0 rejects the axis as a positional argument.
X2 = df.drop(columns="loan_status", errors="ignore")
Y2 = df["loan_status"]

# One-hot encode the remaining text columns, then discard every feature
# column that contains at least one missing value.
X2 = pd.get_dummies(X2)
X2 = X2.dropna(axis=1)

In [10]:
# Please uncomment. This step runs for a long time.
#cross_val_score(rfc, X2, Y2, cv=10)

# END