# CS 109A - Intro to Data Science: Project (WIP)
## Predicting Loan Outcomes
## Group: Andrew Greene and David Modjeska
### Harvard University, Fall 2016

In [1]:
import itertools as it
import matplotlib
import matplotlib.cm as cmx
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import os.path as op
import pandas as pd
import re
import scipy as sp
import sklearn.preprocessing as Preprocessing

from itertools import combinations
from sklearn.cross_validation import KFold as kfold
from sklearn.cross_validation import train_test_split as sk_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.lda import LDA
from sklearn.linear_model import LinearRegression as Lin_Reg
from sklearn.linear_model import LogisticRegression as Log_Reg
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.qda import QDA

%matplotlib inline

## Step 1: Clean Data

In [2]:
# helper function to select the columns of interest from the data set
def Select_Data(data):
    """Pick the columns used for modelling and derive two income ratios.

    Args:
        data: raw loan DataFrame; it must contain every source column
            listed in `source_columns` below.

    Returns:
        A new DataFrame with the selected columns under legible names, plus
        `ipr` and `rir`. The `installment` and `revol_bal` source columns
        are used for the ratios and then left out of the result.
    """
    # source columns in result order: plain features, then ratios, then text
    source_columns = [
        "loan_status", "annual_inc", "earliest_cr_line", "delinq_2yrs",
        "emp_length", "home_ownership", "inq_last_6mths", "loan_amnt",
        "purpose", "open_acc", "total_acc", "term", "installment",
        "revol_bal", "sub_grade", "issue_d",
        "dti", "revol_util",
        "desc",
    ]
    subset = data[source_columns].copy()

    # ipr: monthly installment divided by monthly income
    # rir: revolving balance divided by monthly income
    income_per_month = (data["annual_inc"] / 12)
    subset["ipr"] = data["installment"] / income_per_month
    subset["rir"] = data["revol_bal"] / income_per_month

    # the two raw inputs to the ratios are no longer needed
    subset = subset.drop(["installment", "revol_bal"], axis = 1)

    # positional rename: one legible name per remaining column
    subset.columns = [
        "loan_status", "annual_income", "earliest_credit", "delinq_2_yrs",
        "employ_length", "home_owner", "inquiry_6_mos", "loan_amount",
        "loan_purpose", "open_accounts", "total_acccounts", "loan_term",
        "loan_subgrade", "issue_date", "dti", "revol_util", "description",
        "ipr", "rir",
    ]

    return subset

In [3]:
# helper function to filter the data set down to rows of interest
def Filter_Data(data_select):
    """Keep resolved loans issued after 2012-01-01, then drop the issue date.

    Args:
        data_select: DataFrame with at least the columns `loan_status` and
            `issue_date` (as produced by Select_Data).

    Returns:
        A new DataFrame holding only rows whose status is "Fully Paid" or
        "Charged Off" and whose issue date is strictly later than 2012-01-01.
        The previous row labels are kept as a leading `index` column (the
        result of `reset_index()`), and `issue_date` is removed.
    """
    # set flags for resolved loans
    status_indexes = data_select["loan_status"].isin(["Fully Paid", "Charged Off"])

    # set flags for date range of interest (strict comparison: a loan dated
    # exactly 2012-01-01 is excluded)
    earliest_date = pd.to_datetime("2012-01-01")
    issue_dates = pd.to_datetime(data_select["issue_date"])
    date_indexes = (issue_dates > earliest_date)

    # filter rows per flags of interest; .loc replaces the deprecated .ix indexer
    data_filter = data_select.loc[status_indexes & date_indexes, :].reset_index()
    data_filter = data_filter.drop("issue_date", axis = 1)

    return data_filter

In [4]:
# helper function to clean data - recoding, retyping, pruning, and censoring
def Clean_Data(data_filter):
    """Recode, retype, prune, and censor the filtered loan columns.

    Args:
        data_filter: DataFrame with the columns `loan_status`,
            `loan_subgrade`, `earliest_credit`, `loan_term`, `employ_length`,
            `delinq_2_yrs`, and `inquiry_6_mos`.

    Returns:
        A cleaned copy; the input frame is not modified.
    """
    data_clean = data_filter.copy()

    # recode loan status as boolean: fully paid = True
    data_clean["loan_status"] = data_clean["loan_status"] == "Fully Paid"

    # recode loan subgrades from 1 (best, "A1") to 35 (worst, "G5").
    # Both parts are kept as Series sharing the frame's index so that the
    # addition is row-wise; adding a DataFrame to a Series would instead align
    # the Series labels against the DataFrame's column labels.
    num_grades = 5
    subgrade_text = data_clean["loan_subgrade"]
    grade = (subgrade_text.str[0].map(ord) - ord('A')) * num_grades
    sub_grade = subgrade_text.str[1].astype('int')
    data_clean["loan_subgrade"] = grade + sub_grade

    # convert earliest credit date to datetime
    data_clean["earliest_credit"] = pd.to_datetime(data_clean["earliest_credit"])

    # prune extra text in loan term
    data_clean["loan_term"] = data_clean["loan_term"].str.replace(" months", "")

    # prune extra text in employment length, and right-censor.
    # Only patterns without regex metacharacters are passed to str.replace, so
    # the result does not depend on whether the installed pandas treats the
    # pattern as a regular expression or as a literal string.
    employ_length = data_clean["employ_length"]
    employ_length = employ_length.str.replace(" years", "").str.replace(" year", "")
    employ_length = employ_length.str.replace("< 1", "0")
    employ_length = employ_length.replace("10+", "10")  # exact-value match
    data_clean["employ_length"] = employ_length

    # right-censor delinquencies and inquiries
    data_clean["delinq_2_yrs"] = data_clean["delinq_2_yrs"].clip(0, 2)
    data_clean["inquiry_6_mos"] = data_clean["inquiry_6_mos"].clip(0, 3)

    return data_clean

In [5]:
# helper function to clean data - filtering nuisance NaNs (not structural NaNs)
def Clean_Data2(data_clean, max_nuisance_fraction = 0.01):
    """Drop rows that have nulls in columns where nulls are rare.

    A column counts as a "nuisance" column when the fraction of nulls in it
    is greater than zero and below `max_nuisance_fraction`. Columns with a
    larger share of nulls are treated as structurally sparse and are ignored
    when deciding which rows to drop.

    Args:
        data_clean: DataFrame to filter.
        max_nuisance_fraction: exclusive upper bound on a column's null
            fraction for it to count as a nuisance column (default 0.01).

    Returns:
        A new DataFrame with every row that is non-null in all nuisance
        columns. Each surviving row appears exactly once, and all rows are
        kept when there are no nuisance columns.
    """
    # fraction of nulls by column (mean of a boolean frame, so no integer division)
    col_nan_pct = data_clean.isnull().mean()

    # flag columns that have some nuisance nulls
    cols_with_nans = (col_nan_pct > 0.0) & (col_nan_pct < max_nuisance_fraction)
    nuisance_columns = list(col_nan_pct[cols_with_nans].index)

    # nothing to filter on: keep every row
    if len(nuisance_columns) == 0:
        return data_clean.copy()

    # a row is kept only if it is non-null in every nuisance column
    rows_without_nans_flags = data_clean[nuisance_columns].notnull().all(axis = 1)

    # filter the data set to rows with no nuisance nulls
    data_clean2 = data_clean.loc[rows_without_nans_flags, :]

    return data_clean2

In [6]:
# helper function to pre-process each data subset to get around memory limits
def Prep_Data_Part(index, num_parts, file_prefix, data_all):
    """Pre-process one slice of the raw data and save it as a CSV part file.

    The part file is named `file_prefix + str(index) + ".csv"`. Nothing is
    done if that file already exists, if the slice is empty, or if no rows
    survive filtering.

    Args:
        index: zero-based number of the part to process.
        num_parts: total number of parts the data set is divided into.
        file_prefix: path prefix of the part files.
        data_all: the full raw DataFrame.

    Returns:
        None. The result is written to disk.
    """
    import os  # local import: the file only imports os.path (as `op`)

    filename = file_prefix + str(index) + ".csv"

    # skip work if this part was already pre-processed
    if op.isfile(filename):
        return

    n = data_all.shape[0]

    # integer division keeps the bounds integral on Python 2 and Python 3
    part_size = n // num_parts
    start_row = index * part_size

    # the last part also takes the n % num_parts rows left over by the
    # integer division, which would otherwise never be processed
    if index == num_parts - 1:
        stop_row = n
    else:
        stop_row = start_row + part_size

    # positional slice of the row range for this part
    data_part = data_all.iloc[start_row:stop_row, :]
    if data_part.shape[0] == 0:
        return

    data_select = Select_Data(data_part)
    data_filter = Filter_Data(data_select)
    if data_filter.shape[0] == 0:
        return

    data_clean = Clean_Data(data_filter)
    data_clean2 = Clean_Data2(data_clean)

    # make sure the target directory exists before writing
    part_dir = op.dirname(filename)
    if part_dir and not op.isdir(part_dir):
        os.makedirs(part_dir)

    data_clean2.to_csv(filename, index = False)

In [7]:
# helper function to pre-process full data set and save new file, 
# or to read pre-processed file if it already exists
def Preprocess_Full_Dataset():
    """Return the pre-processed full data set, building and caching it if needed.

    If "loan_clean_data.csv" exists it is read and returned. Otherwise
    "loan.csv" is read, pre-processed in `num_parts` slices (each saved as a
    part file), the part files are concatenated, and the result is saved to
    "loan_clean_data.csv" before being returned.

    Returns:
        The pre-processed DataFrame.
    """
    file_prefix = "./data_parts/loan_clean_part"
    full_clean_data_file = "loan_clean_data.csv"
    num_parts = 30

    # read pre-processed full data file if it is already there
    if op.isfile(full_clean_data_file):
        return pd.read_csv(full_clean_data_file)

    # pre-process and save part files
    data_raw = pd.read_csv("loan.csv")
    for part in range(num_parts):
        Prep_Data_Part(part, num_parts, file_prefix, data_raw)

    # read the part files that exist and concatenate them in a single call
    # (appending to a growing frame inside the loop re-copies all earlier rows
    # on every iteration)
    part_files = [file_prefix + str(part) + ".csv" for part in range(num_parts)]
    data_parts = [pd.read_csv(file_part) for file_part in part_files
                  if op.isfile(file_part)]
    if data_parts:
        data = pd.concat(data_parts, axis = 0)
    else:
        data = pd.DataFrame({})

    # save full file
    # NOTE(review): reset_index() without drop=True keeps the per-part row
    # numbers as an extra column in the saved file - confirm this is intended
    data = data.reset_index()
    data.to_csv(full_clean_data_file, index = False)

    return data

In [None]:
# helper function to pre-process sampled data set and save new file, 
# or to read pre-processed file if it already exists
def Preprocess_Sample_Dataset():
    """Return a pre-processed random sample of the data, caching it on disk.

    If the sample file ("loan_clean_data_10pct.csv" for the 10% sample)
    exists it is read and returned. Otherwise "loan.csv" is read, a random
    `sample_percent` share of its rows is drawn and pre-processed, and the
    result is saved to the sample file before being returned.

    Returns:
        The pre-processed sample DataFrame.
    """
    sample_percent = 10
    sample_clean_data_file = "loan_clean_data_" + str(sample_percent) + "pct.csv"

    # read pre-processed sample data file if it is already there
    if op.isfile(sample_clean_data_file):
        return pd.read_csv(sample_clean_data_file)

    # pre-process sample data set and save result as new file
    data_raw = pd.read_csv("loan.csv")
    data_sample, data_other = sk_split(data_raw, train_size = sample_percent / 100.0)
    data_select = Select_Data(data_sample)
    data_filter = Filter_Data(data_select)
    data_clean = Clean_Data(data_filter)
    data_clean2 = Clean_Data2(data_clean)
    data_clean2.to_csv(sample_clean_data_file, index = False)

    # return the frame built here; the name `data` is not defined in this function
    return data_clean2

In [None]:
# pick the data set to analyse: the full pre-processed file when load_full
# is True, otherwise the pre-processed sample

load_full = False

data = Preprocess_Full_Dataset() if load_full else Preprocess_Sample_Dataset()

  if self.run_code(code, result):


In [None]:
# set column data types
data["loan_status"] = data["loan_status"].astype('bool')
data["loan_subgrade"] = data["loan_subgrade"].astype('int')

# convert descriptions to text but leave missing ones as NaN: a plain
# astype('str') can turn NaN into the literal string "nan", which null checks
# on this column would no longer detect
description = data["description"]
data["description"] = description.where(description.isnull(), description.astype('str'))

In [None]:
# summarize nulls/NaNs in data columns, listing only the columns that contain any
# (print is called with a single argument so the cell runs on Python 2 and 3)
null_counts = data.isnull().sum()

print("")
print("COUNT OF NULLS IN DATA SET BY COLUMN:\n")
print(null_counts[null_counts > 0])
print("")

## Step 2: Explore Data

In [None]:
# print dataset shape, and widen pandas dataframe display so that every
# column is shown
n, p = data.shape
pd.set_option('display.max_columns', p)

# single-argument print calls produce the same text on Python 2 and Python 3
print("")
print("The shape of the data is %s" % (data.shape,))
print("")

In [None]:
# preview the first five rows of the data set, after a blank line
print("")
data.head(5)

In [None]:
# show summary statistics for the data set, after a blank line
print("")
data.describe()

In [None]:
# extract loan description and status, and create n-grams from description

# extract and pre-process loan description and loan_status for NLP
# NOTE(review): the pattern below is a regular expression and relies on
# str.replace treating it as one; newer pandas versions need regex=True - confirm
# against the installed version
data_nlp = data[["description", "loan_status"]].copy()
data_nlp["description"] = data_nlp["description"].str.replace("Borrower.* > ", "")

# keep only rows that have a description (.loc replaces the deprecated .ix)
data_nlp = data_nlp.loc[data_nlp["description"].notnull(), :]

# create n-grams (word trigrams) from loan description
# TO DO: stem words
vectorizer = CountVectorizer(stop_words = 'english', ngram_range = (3, 3))
desc_matrix = vectorizer.fit_transform(data_nlp['description'].values)
n, p = desc_matrix.shape

# print descriptive information about n-grams
# get_feature_names() returns a plain list, so it is wrapped in an array
# before being reshaped into a single column
feature_names = np.array(vectorizer.get_feature_names()).reshape(-1, 1)
print("Number of descriptions and terms: %s %s" % (n, p))
print("")
print("Sample description terms:\n%s" % feature_names[:10, 0])
print("")

# shared settings for the model-scoring cells
score_accum = 0
num_iters = 50

In [None]:
# plot histogram of term frequencies: the total count of each n-gram summed
# over all descriptions (pyplot's histogram function is plt.hist)
term_frequencies = np.asarray(desc_matrix.sum(axis = 0)).ravel()
plt.hist(term_frequencies, bins = 50)
plt.xlabel("term frequency")
plt.ylabel("number of terms")
plt.title("Distribution of description term frequencies");

In [None]:
# compute most frequent description terms associated with each loan status

# helper: total count of every term over the rows of a term matrix, returned
# as a table of (term, term_frequencies) sorted with the most frequent first
def _Term_Frequency_Table(term_matrix, terms):
    # sum over rows (axis 0) to get one count per term, as a flat array
    frequencies = np.asarray(term_matrix.sum(axis = 0)).ravel()
    table = pd.DataFrame({"term": terms, "term_frequencies": frequencies},
                         columns = ["term", "term_frequencies"])
    return table.sort_values("term_frequencies", ascending = False)

# split term matrix into defaulted vs. fully repaid, using a plain boolean
# array so the rows are selected by position
is_repaid = data_nlp["loan_status"].values.astype('bool')
default_term_matrix = desc_matrix[~is_repaid]
repaid_term_matrix = desc_matrix[is_repaid]

# combine term frequencies with feature names, sorted by term frequency
terms = np.asarray(vectorizer.get_feature_names())
default_term_dict = _Term_Frequency_Table(default_term_matrix, terms)
repaid_term_dict = _Term_Frequency_Table(repaid_term_matrix, terms)

In [None]:
# show the ten most frequent terms for defaulted loans
# (single-argument print call runs on Python 2 and Python 3)
print("Most Frequent Terms in Descriptions of Defaulted Loans:")
default_term_dict.head(10)

In [None]:
# show the ten most frequent terms for fully repaid loans
# (single-argument print call runs on Python 2 and Python 3)
print("Most Frequent Terms in Descriptions of Fully Repaid Loans:")
repaid_term_dict.head(10)

## Step 3: Model Data

In [None]:
# use description column, n-grams, and KNN to predict defaults and score accuracy

# repeatedly draw a random train/test split (about 80% / 20%), then train and
# score a KNN model predicting loan defaults
# TO DO: tune parameters k for KNN and n for n-grams

# start from zero so the average covers only this cell's runs, even when the
# cell is re-run or other model cells have already added to the accumulator
score_accum = 0
for i in range(num_iters):
    mask = np.random.rand(n) < 0.8

    train_y = data_nlp["loan_status"][mask]
    test_y = data_nlp["loan_status"][~mask]

    train_x = desc_matrix[mask]
    test_x = desc_matrix[~mask]

    model = KNN(n_neighbors = 20)
    model.fit(train_x, train_y)
    score_accum += model.score(test_x, test_y)

# print prediction accuracy, averaged over the random splits
score = score_accum / float(num_iters)
print("Accuracy of predicting defaults from descriptions with KNN: %s" % round(score, 2))

In [None]:
# use description column with logistic regression to predict defaults and score accuracy

# repeatedly draw a random train/test split (about 80% / 20%), then train and
# score a logistic regression model predicting loan defaults

# start from zero: without this reset the accumulator still holds the scores
# added by previously run model cells, which inflates the average below
score_accum = 0
for i in range(num_iters):
    mask = np.random.rand(n) < 0.8

    train_y = data_nlp["loan_status"][mask]
    test_y = data_nlp["loan_status"][~mask]

    train_x = desc_matrix[mask]
    test_x = desc_matrix[~mask]

    model = Log_Reg()
    model.fit(train_x, train_y)
    score_accum += model.score(test_x, test_y)

# print prediction accuracy, averaged over the random splits
score = score_accum / float(num_iters)
print("Accuracy of predicting defaults from descriptions with logistic regression: %s" %
      round(score, 2))

In [None]:
# use description column with LDA to predict defaults and score accuracy

# repeatedly draw a random train/test split (about 80% / 20%), then train and
# score an LDA model predicting loan defaults

# start from zero: without this reset the accumulator still holds the scores
# added by previously run model cells, which inflates the average below
score_accum = 0
for i in range(num_iters):
    mask = np.random.rand(n) < 0.8

    train_y = data_nlp["loan_status"][mask]
    test_y = data_nlp["loan_status"][~mask]

    # the n-gram counts are stored as a sparse matrix; they are converted to
    # dense arrays for LDA
    # NOTE(review): the dense copy can be large when there are many n-grams -
    # consider capping the vocabulary if memory becomes a problem
    train_x = desc_matrix[mask].toarray()
    test_x = desc_matrix[~mask].toarray()

    model = LDA()
    model.fit(train_x, train_y)
    score_accum += model.score(test_x, test_y)

# print prediction accuracy, averaged over the random splits
score = score_accum / float(num_iters)
print("Accuracy of predicting defaults from descriptions with LDA: %s" % round(score, 2))

In [None]:
# use description column with QDA to predict defaults and score accuracy

# repeatedly draw a random train/test split (about 80% / 20%), then train and
# score a QDA model predicting loan defaults

# start from zero: without this reset the accumulator still holds the scores
# added by previously run model cells, which inflates the average below
score_accum = 0
for i in range(num_iters):
    mask = np.random.rand(n) < 0.8

    train_y = data_nlp["loan_status"][mask]
    test_y = data_nlp["loan_status"][~mask]

    # the n-gram counts are stored as a sparse matrix; they are converted to
    # dense arrays for QDA
    # NOTE(review): the dense copy can be large when there are many n-grams -
    # consider capping the vocabulary if memory becomes a problem
    train_x = desc_matrix[mask].toarray()
    test_x = desc_matrix[~mask].toarray()

    model = QDA()
    model.fit(train_x, train_y)
    score_accum += model.score(test_x, test_y)

# print prediction accuracy, averaged over the random splits
score = score_accum / float(num_iters)
print("Accuracy of predicting defaults from descriptions with QDA: %s" % round(score, 2))