# DRILL: Random Forest, Third Attempt

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
import seaborn as sns
%matplotlib inline

LOAN_STATS_URL = 'https://www.dropbox.com/s/0so14yudedjmm5m/LoanStats3d.csv?dl=1'

# header=1 takes the column names from the second line of the file;
# skipinitialspace=True ignores whitespace that follows each delimiter.
df = pd.read_csv(LOAN_STATS_URL, header=1, skipinitialspace=True)
df.head()

In [None]:
# Show the full size first, then keep only the rows from position 300000 onward
# (presumably to keep the later model fits manageable -- confirm with the author).
print(df.shape)
df = df.iloc[300000:]

df.shape

## First Attempt

In [None]:
# List every object-dtype column next to its number of distinct values,
# so high-cardinality columns can be spotted before building dummies.
categorical = df.select_dtypes(include=['object'])
for name, n_unique in categorical.nunique().items():
    print(name, n_unique)

In [None]:
# Convert ID and interest rate to numeric. errors='coerce' turns anything that
# cannot be parsed into NaN instead of raising.
df['id'] = pd.to_numeric(df['id'], errors='coerce')
df['int_rate'] = pd.to_numeric(df['int_rate'].str.strip('%'), errors='coerce')

# Drop other columns with many unique values.
# `axis` must be passed by keyword: the positional form `drop(labels, 1)` was
# deprecated in pandas 1.3 and removed in pandas 2.0.
df.drop(['url', 'emp_title', 'zip_code', 'earliest_cr_line', 'revol_util',
         'sub_grade', 'addr_state', 'desc'], axis=1, inplace=True)

In [None]:
# Display the last rows of the frame to check how the file ends.
df.tail()

In [None]:
# Discard the final two rows (presumably non-data footer lines from the CSV
# export -- confirm against the tail shown above), then re-check the end.
df = df.iloc[:-2]
df.tail()

## Second Attempt

In [None]:
# Baseline: random forest on every remaining feature, 10-fold cross-validated.
rfc = ensemble.RandomForestClassifier()

# `axis` is passed by keyword; the positional form `drop(label, 1)` was
# removed in pandas 2.0.
X = df.drop('loan_status', axis=1)
Y = df['loan_status']

# One-hot encode the remaining text columns, then discard every column that
# still contains a missing value (the forest cannot be fit on NaN here).
X = pd.get_dummies(X)
X = X.dropna(axis=1)

cross_val_score(rfc, X, Y, cv=10)

## Third Attempt

Get rid of as much data as possible without dropping below an average of 90% accuracy in a 10-fold cross validation.

First, dive into the data that we have and see which features are most important. This can be the raw features or the generated dummies. You may want to use PCA or correlation matrices.

In [None]:
print(df.columns)

# Remove the two identifier columns in a single call. `axis` is passed by
# keyword because the positional form `drop(label, 1)` was removed in pandas 2.0.
df = df.drop(['id', 'member_id'], axis=1)

df.head()

In [None]:
# Show the dtype of every column to decide which ones still need conversion.
df.dtypes

In [None]:
# Make term a number. Values other than the two mapped strings become NaN.
df['term'] = df['term'].map({' 60 months': 60, ' 36 months': 36})

# Make emp_length a number: keep the first run of digits in each value.
# The pattern is a raw string so that `\d` is not treated as a (invalid) string
# escape, and expand=False makes extract() return a Series rather than a
# one-column DataFrame.
df['emp_length'] = df['emp_length'].str.extract(r'(\d+)', expand=False)
# Rows with no digits in emp_length yield NaN above and are removed here.
df = df.dropna(subset=['emp_length'])
df['emp_length'] = df['emp_length'].astype(int)

# Make grade, issue_d, last_pymnt_d, next_pymnt_d, last_credit_pull_d a string
# df['grade'] = df['grade'].astype(str)
# df.issue_d = df.issue_d.astype(str)
# df.last_pymnt_d = df.last_pymnt_d.astype(str)
# df = df.dropna(subset=['next_pymnt_d'])
# df.next_pymnt_d = df.next_pymnt_d.astype(str)
# df = df.dropna(subset=['last_credit_pull_d'])
# df.next_pymnt_d = df.last_credit_pull_d.astype(str)

# Make verification_status a 0/1 flag: both "verified" variants map to 1.
df['verification_status'] = df['verification_status'].map(
    {'Source Verified': 1, 'Not Verified': 0, 'Verified': 1})

# Drop pymnt_plan, application_type and verification_status_joint (only 1 value
# each) and title (purely qualitative and inconsistent).
# `axis` is passed by keyword; the positional form `drop(label, 1)` was removed
# in pandas 2.0.
df = df.drop(['pymnt_plan', 'application_type', 'verification_status_joint', 'title'],
             axis=1)

# Make home_ownership, purpose, loan_status, initial_list_status into dummies later?

In [None]:
# Bucket the column names by the name of their dtype, preserving column order
# inside each bucket.
dtype_dict = {}
for col_name, col_dtype in df.dtypes.items():
    dtype_dict.setdefault(col_dtype.name, []).append(col_name)

# Keep the float64 columns, then the int64 columns, then the target.
all_num_cols = dtype_dict['float64'] + dtype_dict['int64'] + ['loan_status']

df = df[all_num_cols]
df.head()

In [None]:
# Correlation heatmap of the numeric features on a 20x10 inch figure.
# loan_status is a string column, and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0, so restrict the frame to numeric columns first.
plt.figure(figsize=(20,10))
sns.heatmap(df.select_dtypes(include='number').corr())
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Numbers of principal components to evaluate, from most to fewest.
pca_num_components = [10, 3, 2, 1]

def get_pca_cross_val_score(num_components):
    """Cross-validate a random forest on a PCA projection of the features.

    Reads the notebook-level ``df``. Every column except ``loan_status`` is
    one-hot encoded, columns containing missing values are discarded, and the
    result is projected onto ``num_components`` principal components. A
    ``RandomForestClassifier`` is then scored with 10-fold cross-validation,
    and the importance of each principal component is printed and plotted.

    Args:
        num_components: number of principal components to keep.

    Returns:
        None. The scores and the ranking are printed; the chart is shown.
    """
    # `axis` is passed by keyword; the positional form was removed in pandas 2.0.
    X = df.drop('loan_status', axis=1)
    X = pd.get_dummies(X)
    old_num_cols = len(X.columns)
    X = X.dropna(axis=1)
    # Report a non-negative count: columns before minus columns after.
    print('Dropped', old_num_cols - len(X.columns), 'columns')

    pca = PCA(n_components=num_components)
    principal_components = pca.fit_transform(X)

    rfc = ensemble.RandomForestClassifier()
    Y = df['loan_status']
    X = principal_components

    print(num_components, cross_val_score(rfc, X, Y, cv=10))

    # cross_val_score fits clones of the estimator and leaves `rfc` itself
    # unfitted, so fit it here before reading feature_importances_.
    rfc.fit(X, Y)
    importances = rfc.feature_importances_
    # Spread of each importance across the individual trees, for the error bars.
    std = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking. The "features" here are principal components,
    # not the original columns.
    print("Feature ranking:")
    for rank, component in enumerate(indices, start=1):
        print("%d. feature %d (%f)" % (rank, component, importances[component]))

    num_features = X.shape[1]
    plt.figure(figsize=(20,10))
    plt.title("Feature importances")
    plt.bar(range(num_features), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(num_features), indices)
    plt.xlim([-1, num_features])
    plt.show()

# Evaluate the model once for each candidate number of principal components.
for num_components in pca_num_components:
    get_pca_cross_val_score(num_components)

Can you do it without using anything related to payment amount or outstanding principal? How do you know?

I can do this without anything related to payment amount or outstanding principal – I ran PCA again without those columns, and got extremely similar cross validation scores for the same number of components.

TODO: DO FEATURE IMPORTANCE TO SEE IF PAYMENT AMOUNT OR OUTSTANDING PRINCIPAL RANKS HIGH
TRY TO REMOVE AND SEE HOW MUCH EFFECT THEY HAVE ON MODEL

In [None]:
def get_pca_cross_val_score_wo_pymnt_or_principal(num_components):
    """Cross-validate a PCA + random forest model without payment/principal columns.

    Reads the notebook-level ``df``. The target ``loan_status`` and the columns
    ``total_pymnt``, ``out_prncp``, ``total_pymnt_inv`` and ``out_prncp_inv``
    are removed; the rest is one-hot encoded, columns containing missing values
    are discarded, and the result is projected onto ``num_components``
    principal components before a 10-fold cross-validated random forest fit.

    Args:
        num_components: number of principal components to keep.

    Returns:
        None. The cross-validation scores are printed.
    """
    # Target plus the payment-amount / outstanding-principal columns to exclude.
    excluded = ['loan_status', 'total_pymnt', 'out_prncp',
                'total_pymnt_inv', 'out_prncp_inv']
    # `axis` is passed by keyword; the positional form was removed in pandas 2.0.
    X = df.drop(excluded, axis=1)
    X = pd.get_dummies(X)
    old_num_cols = len(X.columns)
    X = X.dropna(axis=1)
    # Report a non-negative count: columns before minus columns after.
    print('Dropped', old_num_cols - len(X.columns), 'columns')

    pca = PCA(n_components=num_components)
    principal_components = pca.fit_transform(X)

    rfc = ensemble.RandomForestClassifier()
    Y = df['loan_status']
    X = principal_components

    print(num_components, cross_val_score(rfc, X, Y, cv=10))

# Evaluate the reduced-feature variant for each candidate number of components.
# The function defined in this cell must be the one called here; calling
# get_pca_cross_val_score would re-run the model with the payment and
# principal columns still included.
for num_components in pca_num_components:
    get_pca_cross_val_score_wo_pymnt_or_principal(num_components)

It seems like 2 is the minimum number of components needed for PCA to still get > 90% cross validation scores across 10 folds. 

In [None]:
# FEEDBACK NOTES:

# REMINDER: correlation matrices do not work for categorical (so no point in doing these correlation matrices)
# May just show that I need to do PCA to get rid of unnecessary features that are highly correlated with others

# you CAN test for multicollinearity, but doesn't matter for random forest
# random forest is NOT affected by multicollinearity 
# (will simply vote for whatever the 2 correlated features say target should be)

# To test effectiveness on categorical target:

# T test will help you see:
# - distribution of data for different categories is significantly different? T-test
# - difference in categories' averages? (using standard deviation as well)

# effect of category on category - chi square (REVIEW)
# effect of category on continuous - t-test / z-test

# random forest (ensemble model) – each decision tree votes on response / target