In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

pd.options.display.max_columns = 999

Here we'll use data from Lending Club to predict the state of a loan given some information about it. You can find the dataset [here](https://www.lendingclub.com/info/download-data.action). We'll use 2015 data. ([Thinkful mirror](https://www.dropbox.com/s/m7z42lubaiory33/LoanStats3d.csv?dl=0))

In [2]:
# Read the data file:
y2015 = pd.read_csv('LoanStats3d.csv',
    skipinitialspace=True,
    header=1
)

In [3]:
y2015.shape

(421097, 111)

## Data Cleaning

In [4]:
# Convert ID and Interest Rate to numeric.
y2015['id'] = pd.to_numeric(y2015['id'], errors='coerce')
y2015['int_rate'] = pd.to_numeric(y2015['int_rate'].str.strip('%'), errors='coerce')

# Drop other columns with many unique variables
y2015.drop(['url', 'emp_title', 'zip_code', 'earliest_cr_line', 'revol_util',
            'sub_grade', 'addr_state', 'desc'], 1, inplace=True)

# Remove two summary rows at the end that don't actually contain data.
y2015 = y2015[:-2]

X = y2015.drop('loan_status', 1)
Y = y2015['loan_status']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

## DRILL: Third Attempt

So here's your task. Get rid of as much data as possible without dropping below an average of 90% accuracy in a 10-fold cross validation.

You'll want to do a few things in this process. First, dive into the data that we have and see which features are most important. This can be the raw features or the generated dummies. You may want to use PCA or correlation matrices.

Can you do it without using anything related to payment amount or outstanding principal? How do you know?

In [5]:
# Set the dataframe index:
X = X.set_index('id')

print('shape of X:',X.shape)

corr_matrix = X.corr().abs()

shape of X: (421095, 200)


In [6]:
# Return column and row indexes of variables within a desired range of correlation:
correlated_variables = np.where((corr_matrix<0.65) & (corr_matrix>0.60))

# Return tuples with desired column names:
correlated_variables=[(corr_matrix.index[x], corr_matrix.columns[y]) for x,y in zip(
    *correlated_variables) if x!=y and x<y]

# Preview the first 10 tuples of columns with correlations within desired range:
correlated_variables[:10]

[('loan_amnt', 'out_prncp'),
 ('loan_amnt', 'out_prncp_inv'),
 ('funded_amnt', 'out_prncp'),
 ('funded_amnt', 'out_prncp_inv'),
 ('funded_amnt_inv', 'out_prncp'),
 ('funded_amnt_inv', 'out_prncp_inv'),
 ('int_rate', 'grade_A'),
 ('installment', 'total_rec_prncp'),
 ('delinq_2yrs', 'num_tl_90g_dpd_24m'),
 ('open_acc', 'num_bc_sats')]

In [7]:
# Create a set of the important variable names:
important_features = set()
important_features.update(set([x[0] for x in correlated_variables]))
important_features.update(set([x[1] for x in correlated_variables]))
len(important_features)

21

In [8]:
# Filter only within our desired correlation range:
X2 = X[list(important_features)]

In [9]:
X2.shape

(421095, 21)

In [10]:
pca = PCA(n_components=1)

x_pca = pca.fit_transform(X)
                                      
print('% of total variance in the dataset explained by PCA:',
    pca.explained_variance_ratio_)

% of total variance in the dataset explained by PCA: [ 0.99933842]


In [11]:
rfc = ensemble.RandomForestClassifier()
cv_score = cross_val_score(rfc, X2, Y, cv=10)
cv_score

array([ 0.92928214,  0.96760941,  0.96466481,  0.96150649,  0.96048445,
        0.96048445,  0.96019853,  0.96005605,  0.96033914,  0.96038569])

#### The average accuracy is higher than 90%

In [12]:
cv_score.mean()

0.95850111553526651

In [13]:
X2.head()

Unnamed: 0_level_0,num_op_rev_tl,installment,num_bc_tl,grade_A,delinq_2yrs,out_prncp,num_sats,funded_amnt_inv,loan_amnt,num_bc_sats,pub_rec_bankruptcies,int_rate,total_acc,out_prncp_inv,mo_sin_rcnt_tl,num_tl_90g_dpd_24m,total_rec_prncp,pub_rec,open_acc,mo_sin_rcnt_rev_tl_op,funded_amnt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
68009401.0,9.0,379.39,8.0,0,0.0,13668.88,11.0,16000.0,16000.0,6.0,0.0,14.85,19.0,13668.88,6.0,0.0,2331.12,2.0,11.0,11.0,16000.0
68354783.0,4.0,298.58,3.0,1,0.0,6635.69,7.0,9600.0,9600.0,3.0,0.0,7.49,9.0,6635.69,9.0,0.0,2964.31,0.0,7.0,9.0,9600.0
68466916.0,5.0,777.55,6.0,1,0.0,0.0,9.0,25000.0,25000.0,5.0,0.0,7.49,19.0,0.0,13.0,0.0,25000.0,0.0,9.0,13.0,25000.0
68466961.0,13.0,858.05,11.0,1,0.0,19263.77,16.0,28000.0,28000.0,9.0,0.0,6.49,24.0,19263.77,19.0,0.0,8736.23,0.0,16.0,19.0,28000.0
68495092.0,17.0,320.99,2.0,0,0.0,0.0,18.0,8650.0,8650.0,2.0,1.0,19.89,19.0,0.0,0.0,0.0,8650.0,1.0,18.0,0.0,8650.0
