In [1]:
%matplotlib inline
import os
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math

In [2]:
# Load the raw loan file and prepare the target `y` and the feature frame
# `data_X`.
#
# The working directory defaults to the original hard-coded location, but it
# can be overridden with the LENDINGCLUB_DATA_DIR environment variable so the
# notebook is runnable on other machines.
data_dir = os.environ.get('LENDINGCLUB_DATA_DIR',
                          'C:/Users/evanm_000/Desktop/Work/LendingClub')
os.chdir(data_dir)

data = pd.read_csv('loan.csv', low_memory=False)

# Columns discarded before any further processing (single drop instead of
# several separate in-place drops).
unused_cols = ['id', 'member_id', 'emp_title', 'url', 'desc', 'title',
               'zip_code', 'addr_state', 'initial_list_status']
data.drop(unused_cols, axis=1, inplace=True)

# Strip the " months" suffix so `term` becomes a plain integer.
data['term'] = data['term'].str.replace(" months", "").astype(int)

import datetime as dt  # no longer needed here; kept so the `dt` alias stays bound

# `issue_d` is parsed with the "%b-%Y" format; only the year and the month are
# kept. pd.to_datetime is vectorised, unlike a row-wise strptime.
issue_date = pd.to_datetime(data['issue_d'], format="%b-%Y")
data['issue_year'] = issue_date.dt.year
data['issue_month'] = issue_date.dt.month
data.drop(['issue_d'], axis=1, inplace=True)

data['pymnt_plan'] = data['pymnt_plan'].replace(['y', 'n'], [1, 0])

# Treat cells whose whole text is 'n/a' or 'nan' as missing. The pattern is
# anchored: an unanchored regex replaces every string cell that merely
# *contains* those characters (any text containing "finance", for example).
data = data.replace(r'^\s*(?:n/a|nan)\s*$', np.nan, regex=True)

# Year of the earliest credit line, taken from characters 4-7 of the string.
# Anything that is not numeric (including the empty slice produced by missing
# values) becomes NaN. The previous `replace('', np.nan, regex=True)` used an
# empty pattern, which matches every string and wiped the whole column.
data['earliest_cr_line'] = data['earliest_cr_line'].apply(str)
data['earliest_cr_line_year'] = pd.to_numeric(
    data['earliest_cr_line'].str[4:8], errors='coerce'
).astype(float)

# Creating a variable that looks at whether a loan is in default or not

data['default'] = 0
data.loc[data['loan_status'] == 'Default', 'default'] = 1

y = data['default']

# `drop` without inplace returns a new frame. Previously `data_X` was only an
# alias of `data`, so the in-place drop also removed these columns from `data`.
dropping_vars = ['default', 'loan_status', 'term', 'emp_length', 'sub_grade']
data_X = data.drop(dropping_vars, axis=1)

In [3]:
#http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FactorAnalysis.html

#### Dummy variables for grade, verification_status, purpose and application_type

dummy_grade = pd.get_dummies(data_X['grade'])
dummy_verify = pd.get_dummies(data_X['verification_status'])
dummy_purpose = pd.get_dummies(data_X['purpose'])
dummy_app_type = pd.get_dummies(data_X['application_type'])

dummies = pd.concat([dummy_grade, dummy_verify, dummy_purpose, dummy_app_type], axis=1)

# Continuous columns; missing values are filled with the column mean.
cont_vars = ['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'total_pymnt', 'total_rec_late_fee', 'recoveries', 'out_prncp']
data_X_cont = data_X[cont_vars]
data_X_cont = data_X_cont.fillna(data_X_cont.mean())

X_df = pd.concat([data_X_cont, dummies], axis=1)

# DataFrame.as_matrix() was removed in pandas 1.0; `.values` exists in every
# pandas version. The explicit float cast keeps the result a numeric array even
# when get_dummies returns boolean columns, which would otherwise make the
# mixed frame an object array.
X = X_df.values.astype(float)

In [7]:
# Will want to check out maxabs_scale and MaxAbsScaler as they work well with sparse data

from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)

from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.covariance import ShrunkCovariance, LedoitWolf
# sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20; both names now live in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV, cross_val_score

# Candidate numbers of components: every second value from 0 up to (but not
# including) the number of columns in X_df.
n_components = np.arange(0, len(X_df.columns), 2)

def compute_scores(X):
    """Cross-validate PCA and FactorAnalysis over the module-level grid.

    For every value in ``n_components`` the value is assigned to one PCA and
    one FactorAnalysis estimator, and the mean of ``cross_val_score`` on ``X``
    is recorded for each. No ``scoring`` argument is passed.

    Returns
    -------
    (pca_scores, fa_scores) : tuple of lists
        One mean score per entry of ``n_components``, in the same order.
    """
    pca_model, fa_model = PCA(), FactorAnalysis()
    pca_scores = []
    fa_scores = []

    for k in n_components:
        # Same two estimator objects are reused; only the size changes.
        pca_model.n_components = k
        fa_model.n_components = k

        pca_cv = cross_val_score(pca_model, X)
        pca_scores.append(np.mean(pca_cv))

        fa_cv = cross_val_score(fa_model, X)
        fa_scores.append(np.mean(fa_cv))

    return pca_scores, fa_scores

def shrunk_cov_score(X):
    """Mean cross-validated score of the best ShrunkCovariance on ``X``.

    A grid search over 30 log-spaced shrinkage values between 1e-2 and 1
    selects the estimator, which is then scored with ``cross_val_score``.
    """
    param_grid = {'shrinkage': np.logspace(-2, 0, 30)}
    search = GridSearchCV(ShrunkCovariance(), param_grid)
    search.fit(X)
    best = search.best_estimator_
    return np.mean(cross_val_score(best, X))

def lw_score(X):
    """Mean cross-validated score of a default LedoitWolf estimator on ``X``."""
    fold_scores = cross_val_score(LedoitWolf(), X)
    return np.mean(fold_scores)

In [None]:
# Score PCA / FactorAnalysis over the component grid and plot the curves next
# to the covariance-estimator baselines.
# NOTE(review): the loop target is still named `X`, so after this cell the
# module-level `X` refers to the standardised matrix, as it did before.
# Consider renaming it if later cells need the unscaled matrix.
for X, title in [(X_std, 'Standardised Lending Club Data')]:
    pca_scores, fa_scores = compute_scores(X)
    n_components_pca = n_components[np.argmax(pca_scores)]
    n_components_fa = n_components[np.argmax(fa_scores)]

    # Component count chosen by PCA's own 'mle' option, for comparison.
    pca = PCA(n_components='mle')
    pca.fit(X)
    n_components_pca_mle = pca.n_components_

    print("best n_components by PCA CV = %d" % n_components_pca)
    print("best n_components by FactorAnalysis CV = %d" % n_components_fa)
    print("best n_components by PCA MLE = %d" % n_components_pca_mle)

    plt.figure()
    plt.plot(n_components, pca_scores, 'b', label='PCA scores')
    plt.plot(n_components, fa_scores, 'r', label='FA scores')
    plt.axvline(n_components_pca, color='b',
                label='PCA CV: %d' % n_components_pca, linestyle='--')
    plt.axvline(n_components_fa, color='r',
                label='FactorAnalysis CV: %d' % n_components_fa, linestyle='--')
    plt.axvline(n_components_pca_mle, color='k',
                label='PCA MLE: %d' % n_components_pca_mle, linestyle='--')

    # compare with other covariance estimators
    plt.axhline(shrunk_cov_score(X), color='violet',
                label='Shrunk Covariance MLE', linestyle='-.')
    # The label has no format specifier, so the previous
    # `'LedoitWolf MLE' % n_components_pca_mle` raised
    # "TypeError: not all arguments converted during string formatting".
    plt.axhline(lw_score(X), color='orange',
                label='LedoitWolf MLE', linestyle='-.')

    plt.xlabel('nb of components')
    plt.ylabel('CV scores')
    plt.legend(loc='lower right')
    plt.title(title)
    axes = plt.gca()
    # Limits are (bottom, top); the previous [0, -50] drew the axis upside down.
    axes.set_ylim([-50, 0])

plt.show()