# CS 109A - Intro to Data Science: Project (WIP)
## Predicting Loan Outcomes
## Group: Andrew Greene and David Modjeska
### Harvard University, Fall 2016

In [145]:
import itertools as it
import numpy as np
import os.path as op
import pandas as pd
import re
import sklearn.preprocessing as Preprocessing
import datetime

import enchant

from itertools import combinations
from sklearn.cross_validation import train_test_split as sk_split
from sklearn.decomposition import TruncatedSVD as tSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.io import mmwrite

%matplotlib inline
from IPython.display import display, HTML

In [146]:
### specify processed data files to generate - full/partial, partial %, and train/test
### Note: this cell is present in both notebooks

# load and clean full dataset?
load_full = False

# if not loading and cleaning full dataset, what sample percentage?
sample_percent = 10

if load_full:
    pct_str = ""
else: # not load_full
    pct_str = str(sample_percent) + "_pct"
    
# use training or testing data to generate minor files?
minor_use_train = True
if minor_use_train:
    mode_str = "train"
else: # not minor_use_train
    mode_str = "test"
    
### set intermediate file names
dir_str = "./intermediate_files/"

processed_data_train_file = dir_str + "processed_data_" + "train" + pct_str + ".json"
processed_data_test_file = dir_str + "processed_data_" + "test" + pct_str + ".json"

nlp_data_file = dir_str + "nlp_data_" + mode_str + pct_str + ".json"
term_freqs_file = dir_str + "term_freqs_" + mode_str + pct_str + ".mtx"
diff_terms_file = dir_str + "diff_terms_" + mode_str + pct_str + ".json"

## Step 1: Load and Clean Data

### Economic Data

In [147]:
# load CPI data (from https://fred.stlouisfed.org/series/CPIAUCSL/downloaddata)
def load_cpi_data():
    cpi_xls = pd.ExcelFile("datasets/CPIAUCSL.xls")
    cpi_sheet = cpi_xls.sheet_names[0]
    cpi_df = cpi_xls.parse(cpi_sheet, header = None, skiprows = 55)
    cpi_df.columns = ['date', 'cpi']
    
    return cpi_df

In [148]:
# load GDP data (from https://fred.stlouisfed.org/series/GDP/downloaddata)
def load_gdp_data():
    gdp_xls = pd.ExcelFile("datasets/GDP.xls")
    gdp_sheet = gdp_xls.sheet_names[0]
    gdp_df = gdp_xls.parse(gdp_sheet, header = None, skiprows = 20)
    gdp_df.columns = ['date', 'gdp']
    
    return gdp_df

In [149]:
# load unemployment data (from https://fred.stlouisfed.org/series/UNRATE/downloaddata)
def load_unemploy_data():
    unemploy_xls = pd.ExcelFile("datasets/UNRATE.xls")
    unemploy_sheet = unemploy_xls.sheet_names[0]
    unemploy_df = unemploy_xls.parse(unemploy_sheet, header = None, skiprows = 25)
    unemploy_df.columns = ['date', 'unemploy']

    return unemploy_df

In [150]:
### load economic data

econ_filename = "econ_data.json"

if not op.isfile(econ_filename):
    cpi_df = load_cpi_data()
    gdp_df = load_gdp_data()
    unemploy_df = load_unemploy_data()
    
    econ_data_2 = pd.merge(cpi_df, gdp_df, 'inner')
    econ_data_3 = pd.merge(econ_data_2, unemploy_df, 'inner')
    
    econ_data_3.to_json("econ_data.json", date_unit = 's')
    
econ_data_4 = pd.read_json(econ_filename)

In [151]:
# convert UNIX timestamp to calendar quarter
# FIX hack to adjust timezone with timedelta
new_col = econ_data_4["date"].copy()
for index in range(econ_data_4.shape[0]):
        new_col[index] = \
             (datetime.datetime.fromtimestamp(econ_data_4["date"].values[index]) +
            datetime.timedelta(hours = 5)).replace(hour = 0)
econ_data_4['date'] = new_col 

In [152]:
econ_data_4.head()

Unnamed: 0,cpi,date,gdp,unemploy
0,23.68,1948-01-01 00:00:00,266.2,3.4
1,23.82,1948-04-01 00:00:00,272.9,3.9
10,24.07,1974-10-01 00:00:00,308.5,5.0
100,42.7,1995-04-01 00:00:00,1380.7,4.9
101,43.7,1952-10-01 00:00:00,1417.6,5.0


### LC Data

In [153]:
# helper function to select the columns of interest from the data set
def Select_Data(data):
    
    # list columns to select
    features_to_select = ['id', "loan_status", "annual_inc", "earliest_cr_line", "delinq_2yrs", \
                          "emp_length", "home_ownership", "inq_last_6mths", "loan_amnt", \
                         "purpose", "open_acc", "total_acc", "term", "installment", \
                         "revol_bal", "sub_grade", "issue_d", "int_rate", \
                          'mths_since_last_record', 'emp_title', 'addr_state', \
                          'initial_list_status', 'verification_status_joint', 'recoveries']
    ratios_to_select = ["dti", "revol_util"]
    text_to_select = ["desc"]

    # concatenate selected columns
    data_select = pd.concat(( \
                            data[features_to_select],
                            data[ratios_to_select], \
                            data[text_to_select]), \
                            axis = 1)

    # synthesize new columns, and drop temporary columns
    monthly_inc = (data["annual_inc"] / 12)
    data_select["ipr"] = data["installment"] / monthly_inc # income to payment ratio
    data_select["rir"] = data["revol_bal"] / monthly_inc # revolving to income ratio
    data_select = data_select.drop("revol_bal", axis = 1)
    
    # rename columns for legibility
    data_select.columns = [
        'id', "loan_status", "annual_income", "earliest_credit", "delinq_2_yrs", \
        "employ_length", "home_owner", "inquiry_6_mos", "loan_amount", \
        "loan_purpose", "open_accounts", "total_accounts", "loan_term", "installment", \
        "loan_subgrade", "issue_date", "interest_rate", "months_since_last_record", \
        "employ_title", "address_state", "initial_list_status", "verif_status", 'recoveries', \
        "dti", "revol_util", "description", \
        "ipr", "rir" \
    ]
    
    return data_select

In [154]:
# helper function to filter the data set down to rows of interest
def Filter_Data(data_select):
    
    # set flags for resolved loans
    status_flags = (data_select["loan_status"] == "Fully Paid") | \
                    (data_select["loan_status"] == "Charged Off")

    # set flags for date range of interest
    earliest_date = pd.to_datetime("2008-01-01")
    issue_dates = pd.to_datetime(data_select["issue_date"])
    date_flags = (issue_dates > earliest_date)
    
    # set flags for 36-month loan terms
    #term_flags = (data_select['loan_term'] == " 36 months")

    # filter rows per flags of interest
    data_filter = \
        data_select.ix[status_flags & date_flags, :].reset_index(drop = True)
    
    return data_filter

In [155]:
# helper function to clean data - recoding, retyping, pruning, and censoring
def Clean_Data(data_filter):
    data_clean = data_filter.copy()

    # recode loan status as boolean: charged off = True
    data_clean["loan_status"] = data_clean["loan_status"] == "Charged Off"

    # recode loan subgrades from 1 (best) to 35 (worst)
    num_grades = 5
    grade = data_clean["loan_subgrade"].str[0]
    grade = (pd.DataFrame(ord(c) for c in grade) - ord('A')) * num_grades
    sub_grade = data_clean["loan_subgrade"].str[1].astype('int')
    data_clean["loan_subgrade"] =  grade + sub_grade

    # convert earliest credit date to datetime
    data_clean["earliest_credit"] = pd.to_datetime(data_clean["earliest_credit"])

    # prune extra text in loan term 
    data_clean["loan_term"] = data_clean["loan_term"].str.strip()
    data_clean["loan_term"] = data_clean["loan_term"].str.replace(" months", "")


    # prune extra text in employment length, and right-censor
    data_clean["employ_length"] = data_clean["employ_length"].str.replace(" years*", "")
    data_clean["employ_length"] = data_clean["employ_length"].str.replace("10\+", "10")
    data_clean["employ_length"] = data_clean["employ_length"].str.replace("< 1", "0")
    
    # right-censor delinquencies and inquiries
    data_clean["delinq_2_yrs"] = np.clip(data_clean["delinq_2_yrs"], 0, 2)
    data_clean["inquiry_6_mos"] = np.clip(data_clean["inquiry_6_mos"], 0, 3)
        
    return data_clean

In [156]:
# helper function to clean data - filtering nuisance NaNs (not structural NaNs)
def Clean_Data2(data_clean):
    n, p = data_clean.shape
    
    # count nulls by column
    col_nan_pct = data_clean.isnull().sum() / n
    
    # flag columns that have some nuisance nulls
    cols_with_nans = (col_nan_pct > 0.0) & (col_nan_pct < 0.01)
    
    # flag rows that have some nuisance nulls in the flagged columns
    rows_without_nans_flags = data_clean.ix[:, cols_with_nans].notnull()
    
    # index the flagged rows that contain some nuisance nulls
    rows_without_nans_indexes = np.where(rows_without_nans_flags)
    
    # filter the data set to rows that contain no nuisance nulls
    data_clean2 = data_clean.ix[rows_without_nans_indexes[0], :]
    
    return data_clean2

In [157]:
# helper function to pre-process each data subset to get around memory limits
def Prep_Data_Part(index, num_parts, file_prefix, data_all):
    filename = file_prefix + str(index) + ".csv"
    
    # pre-process new data part if the file doesn't already exist
    if not op.isfile(filename):
        n, p = data_all.shape
        
        # pre-process the row range for this data part, avoiding empty data subsets
        start_row = index * (n / num_parts)
        data_part = data_all.ix[range(start_row, start_row + (n / num_parts)), :]
        if data_part.shape[0] > 0:
            data_select = Select_Data(data_part)
            data_filter = Filter_Data(data_select)
            if data_filter.shape[0] > 0:
                data_clean = Clean_Data(data_filter)
                data_clean2 = Clean_Data2(data_clean)
                data_clean2.to_csv(filename, index = False)

In [158]:
# Lending Club (LC) data (from https://www.kaggle.com/wendykan/lending-club-loan-data)
# helper function to pre-process full data set and save new file, 
# or to read pre-processed file if it already exists
# Note: this non-shared intermediate file is not split into train/test
def Preprocess_Full_Dataset():
    file_prefix = "./data_parts/loan_clean_part"
    full_clean_data_file = "loan_clean_data.csv"
    num_parts = 30

    # pre-process data set and save result as new file
    if not op.isfile(full_clean_data_file):

        # pre-process and save part files
        data_raw = pd.read_csv("loan.csv")
        for part in range(num_parts):
            Prep_Data_Part(part, num_parts, file_prefix, data_raw)       

        # read and concatenate part files
        data = pd.DataFrame({})
        for part in range(num_parts):
            file_part = file_prefix + str(part) + ".csv"
            if op.isfile(file_part):
                data_part = pd.read_csv(file_part)
                data = pd.concat((data, data_part), axis = 0)

        # save full file
        data = data.reset_index(drop = True)
        data.to_csv(full_clean_data_file, index = False)

    # read pre-processed full data file
    data = pd.read_csv(full_clean_data_file)
        
    return data

In [159]:
# helper function to pre-process sampled data set and save new file, 
# or to read pre-processed file if it already exists
# Note: this non-shared intermediate file is not split into train/test
def Preprocess_Sample_Dataset():
    sample_clean_data_file = "loan_clean_data_" + str(sample_percent) + "_pct.csv"

    # pre-process sample data set and save result as new file
    if not op.isfile(sample_clean_data_file):
        data_raw = pd.read_csv("loan.csv")
        data_sample, data_other = sk_split(data_raw, train_size = sample_percent / 100.0)
        data_select = Select_Data(data_sample)
        data_filter = Filter_Data(data_select)
        data_clean = Clean_Data(data_filter)
        data_clean2 = Clean_Data2(data_clean)
        data_clean2.to_csv(sample_clean_data_file, index = False)

    # read pre-processed sample data file
    data_clean2 = pd.read_csv(sample_clean_data_file)
        
    return data_clean2

In [160]:
# create or load appropriate version of data set for analysis

if load_full:
    data = Preprocess_Full_Dataset()
    
else:
    data = Preprocess_Sample_Dataset()   

In [161]:
# set boolean and string column data types
data["loan_status"] = data["loan_status"].astype(bool)
data["description"] = data["description"].astype('str')
data["issue_date"] = data["issue_date"].astype('str') # for later conversion to datetime
data["employ_title"] = data["employ_title"].astype('str')
data["address_state"] = data["address_state"].astype('str')

nan_flags = data["description"].str.match("nan")
data.ix[nan_flags, "description"] = None

# replace all numbers with a token
data["description"] = data["description"].str.replace("[0-9]+", "_number_")

In [162]:
# summarize nulls/NaNs in data columns
# FIX - print only cols with nulls
print
print "Count of nulls in data set by column:\n"
print data.isnull().sum()
print


Count of nulls in data set by column:

id                               0
loan_status                      0
annual_income                    0
earliest_credit                  0
delinq_2_yrs                     0
employ_length                    0
home_owner                       0
inquiry_6_mos                    0
loan_amount                      0
loan_purpose                     0
open_accounts                    0
total_accounts                   0
loan_term                        0
installment                      0
loan_subgrade                    0
issue_date                       0
interest_rate                    0
months_since_last_record    217958
employ_title                     0
address_state                    0
initial_list_status              0
verif_status                248445
recoveries                       0
dti                              0
revol_util                       0
description                 160615
ipr                              0
rir            

In [163]:
# create new column for first day of quarter that contains issue date
new_col = data["issue_date"].copy()
new_col.name = 'issue_quarter'
for index in range(data.shape[0]):
    new_col.iloc[index] = datetime.datetime.strptime(data["issue_date"].values[index], 
                                                "%b-%Y").replace(day = 1)
    quarter = (new_col.iloc[index].month - 1) // 3
    new_col.iloc[index] = new_col.iloc[index].replace(month = (3 * quarter) + 1)
data = pd.concat((data, new_col), axis = 1).reset_index(drop = True)

In [164]:
# join LC and economic data
print data.shape
data = data.merge(right = econ_data_4, how = 'inner', 
                  left_on = "issue_quarter", right_on = "date", 
                  left_index = True).reset_index(drop = True)
data.drop('issue_quarter', axis = 1, inplace = True)
data.drop('date', axis = 1, inplace = True)

(248445, 29)


In [165]:
# normalize float columns
float_cols = ['dti', 'revol_util', 'ipr', 'rir', 'cpi', 'gdp', 'unemploy']
data[float_cols] = data[float_cols].astype(float)
data[float_cols] = Preprocessing.normalize(data[float_cols]).astype(float)

In [166]:
data.head()

Unnamed: 0,id,loan_status,annual_income,earliest_credit,delinq_2_yrs,employ_length,home_owner,inquiry_6_mos,loan_amount,loan_purpose,...,verif_status,recoveries,dti,revol_util,description,ipr,rir,cpi,gdp,unemploy
0,1077501.0,False,24000.0,1985-01-01,0.0,10,RENT,1.0,5000.0,credit_card,...,,0.0,0.004445,0.013455,Borrower added on _number_/_number_/_number_...,1.3e-05,0.001097,0.021895,0.999659,0.001093
1,1077430.0,True,30000.0,1999-04-01,0.0,0,RENT,3.0,2500.0,car,...,,117.08,0.000161,0.001511,Borrower added on _number_/_number_/_number_...,4e-06,0.000108,0.021897,0.999758,0.001093
2,1077175.0,False,12252.0,2001-11-01,0.0,10,RENT,2.0,2400.0,small_business,...,,0.0,0.001402,0.015834,,1.3e-05,0.000465,0.021895,0.999633,0.001093
3,1076863.0,False,49200.0,1996-02-01,0.0,10,RENT,1.0,10000.0,other,...,,0.0,0.003215,0.003376,Borrower added on _number_/_number_/_number_...,1.3e-05,0.00022,0.021897,0.999749,0.001093
4,1075269.0,False,36000.0,2004-11-01,0.0,3,RENT,3.0,5000.0,wedding,...,,0.0,0.001801,0.00455,,8e-06,0.000427,0.021897,0.999748,0.001093


In [167]:
data.describe()

Unnamed: 0,id,annual_income,delinq_2_yrs,inquiry_6_mos,loan_amount,open_accounts,total_accounts,loan_term,installment,loan_subgrade,...,months_since_last_record,verif_status,recoveries,dti,revol_util,ipr,rir,cpi,gdp,unemploy
count,248445.0,248445.0,248445.0,248445.0,248445.0,248445.0,248445.0,248445.0,248445.0,248445.0,...,30487.0,0.0,248445.0,248445.0,248445.0,248445.0,248445.0,248445.0,248445.0,248445.0
mean,11541170.0,72410.11,0.210143,0.821739,13552.297088,10.930097,25.014957,41.319036,418.061126,11.171297,...,76.864664,,161.921232,0.002417,0.008003,1.146846e-05,0.000393,0.020923,0.999738,0.000973
std,13580970.0,57654.66,0.514727,0.968979,8109.43645,4.870852,11.721059,9.968206,244.505571,6.761935,...,28.649832,,753.751969,0.001118,0.003724,6.040069e-06,0.000333,0.000697,4e-05,0.000132
min,54734.0,3000.0,0.0,0.0,500.0,1.0,2.0,36.0,16.08,1.0,...,0.0,,0.0,0.0,0.0,4.79087e-08,0.0,0.019594,0.992349,0.000705
25%,1452197.0,45000.0,0.0,0.0,7300.0,7.0,16.0,36.0,240.02,6.0,...,,,0.0,0.001579,0.005284,6.848098e-06,0.000175,0.020243,0.999715,0.00088
50%,6180235.0,62000.0,0.0,1.0,12000.0,10.0,23.0,36.0,365.23,11.0,...,,,0.0,0.002374,0.008145,1.061129e-05,0.000321,0.020924,0.999744,0.000977
75%,15591540.0,87000.0,0.0,1.0,18225.0,14.0,32.0,36.0,547.16,16.0,...,,,0.0,0.003229,0.010837,1.534597e-05,0.000525,0.021466,0.999765,0.001082
max,68604660.0,8706582.0,2.0,3.0,35000.0,76.0,150.0,60.0,1424.57,34.0,...,129.0,,33520.27,0.005802,0.121802,6.533504e-05,0.016245,0.023216,0.999807,0.001244


In [168]:
# calculate description lengths in characters
description_flags = data["description"].notnull()
descriptions = data.ix[description_flags, "description"]
description_lengths = descriptions.str.len()
data['desc_len'] = description_lengths

In [169]:
### write data frame to intermediate file
np.random.seed(42)
mask = np.random.rand(data.shape[0]) < 0.7
data_train = data.iloc[mask, :]
data_test = data.iloc[~mask, :]     

if not op.isfile(processed_data_train_file):
    data_train.to_json(processed_data_train_file, date_unit = 's')
    
if not op.isfile(processed_data_test_file):
    data_test.to_json(processed_data_test_file, date_unit = 's')

if minor_use_train:
    data = data_train
else: #not minor_use_train
    data = data_test

### Text Analysis

In [170]:
# extract and pre-process loan description and loan_status for NLP
data_nlp = data.loc[description_flags, :].copy()
data_nlp["description"] = data_nlp["description"].str.replace("Borrower.* > ", "")

In [171]:
### set up stemming

from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer(language = 'english', ignore_stopwords = True)
analyzer = TfidfVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

def take(n, seq):
    seq = iter(seq)
    result = []
    try:
        for i in range(n):
            result.append(seq.next())
    except StopIteration:
        pass
    
    return result

In [172]:
# stem words in Description field
for index in range(data_nlp.shape[0]):
    data_nlp['description'].values[index] = \
        " ".join(take(1000, stemmed_words(data_nlp['description'].values[index])))

In [173]:
# create n-grams from loan description
vectorizer = CountVectorizer(stop_words = 'english', ngram_range = (1, 1))
desc_matrix = vectorizer.fit_transform(data_nlp['description'].values)
n, p = desc_matrix.shape
print desc_matrix.shape

if not op.isfile(term_freqs_file):
    mmwrite(term_freqs_file, desc_matrix)

(26549, 9366)


In [174]:
# apply SVD to document-term matrix
tsvd = tSVD(n_components = 100)
desc_matrix_reduce = tsvd.fit_transform(desc_matrix)

In [175]:
data_nlp['desc_matrix_reduce'] = desc_matrix_reduce[:, 0]

In [176]:
# print descriptive information about n-grams
feature_names = np.array(vectorizer.get_feature_names()).reshape(-1, 1)
print "Number of descriptions and terms:", n, p
print
print "Sample terms:", 
pd.DataFrame(feature_names[:10, 0])

Number of descriptions and terms: 26549 9366

Sample terms:

Unnamed: 0,0
0,__________________
1,___________________________________
2,_number_
3,_number_a
4,_number_b
5,_number_ba
6,_number_bn
7,_number_br
8,_number_capitalone_number_
9,_number_cc





In [177]:
data_nlp['desc_word_count'] = desc_matrix.sum(axis = 1)

In [178]:
data_nlp['vocab_count'] = (desc_matrix > 0).sum(axis=1)
data_nlp['vocab_count_norm'] = data_nlp['vocab_count'] \
    / data_nlp['desc_len'].astype(float) 

In [179]:
# split term matrix into defaulted vs. fully repaid
mask = data_nlp["loan_status"].values == False
bad_term_matrix = desc_matrix[mask]
good_term_matrix = desc_matrix[~mask]

all_term_dict = zip(vectorizer.get_feature_names(),
    np.asarray(desc_matrix.sum(axis = 0)).ravel())
all_term_dict_df = pd.DataFrame(all_term_dict).sort_values(by = [1], \
                                                                   ascending = False)
bad_term_dict = zip(vectorizer.get_feature_names(),
    np.asarray(bad_term_matrix.sum(axis = 0)).ravel())
bad_term_dict_df = pd.DataFrame(bad_term_dict).sort_values(by = [1], \
                                                                   ascending = False)
good_term_dict = zip(vectorizer.get_feature_names(),
    np.asarray(good_term_matrix.sum(axis = 0)).ravel())
good_term_dict_df = pd.DataFrame(good_term_dict).sort_values(by = [1], \
                                                                 ascending = False)

top_bad_dict_df = bad_term_dict_df.iloc[:125, :]
top_good_dict_df = good_term_dict_df.iloc[:125, :]

bad_only_df = pd.DataFrame(list(set(top_bad_dict_df[0]) - set(top_good_dict_df[0])))
good_only_df = pd.DataFrame(list(set(top_good_dict_df[0]) - set(top_bad_dict_df[0])))

In [180]:
print
print "Most Frequent Terms in Descriptions of All Loans:"
all_term_dict_df.head(20)


Most Frequent Terms in Descriptions of All Loans:


Unnamed: 0,0,1
1063,br,31313
2,_number_,25666
2014,credit,17415
4904,loan,16536
6035,pay,15992
1293,card,15449
2200,debt,13022
6053,payment,9332
1819,consolid,8771
5371,month,7027


In [181]:
print
print "Most Frequent Terms Only in Descriptions of Defaulted Loans:"
bad_only_df


Most Frequent Terms Only in Descriptions of Defaulted Loans:


Unnamed: 0,0
0,term
1,purpos
2,end
3,faster
4,secur
5,excel
6,refin
7,cover
8,littl
9,repay


In [182]:
print
print "Most Frequent Terms Only in Descriptions of Fully Repaid Loans:"
good_only_df


Most Frequent Terms Only in Descriptions of Fully Repaid Loans:


Unnamed: 0,0
0,insur
1,quot
2,feel
3,hard
4,question
5,long
6,having
7,thing
8,know
9,instead


In [183]:
### count misspellings

d = enchant.Dict("en_US")

num_terms = all_term_dict_df.shape[0]
misspellings = np.zeros(num_terms)
for index in range(num_terms):
    misspellings[index] = not d.check(all_term_dict_df.iloc[index, 0])

desc_matrix_misspell = desc_matrix[:, misspellings > 0]
data_nlp['misspell_count'] = desc_matrix_misspell.sum(axis=1)

In [184]:
data_nlp['misspell_count_norm'] = data_nlp['misspell_count'] \
    / data_nlp['desc_len'].astype(float) 

if not op.isfile(nlp_data_file):
    data_nlp.to_json(nlp_data_file, date_unit = 's')

In [185]:
data_nlp.describe()

Unnamed: 0,id,annual_income,delinq_2_yrs,inquiry_6_mos,loan_amount,open_accounts,total_accounts,loan_term,installment,loan_subgrade,...,cpi,gdp,unemploy,desc_len,desc_matrix_reduce,desc_word_count,vocab_count,vocab_count_norm,misspell_count,misspell_count_norm
count,26549.0,26549.0,26549.0,26549.0,26549.0,26549.0,26549.0,26549.0,26549.0,26549.0,...,26549.0,26549.0,26549.0,26549.0,26549.0,26549.0,26549.0,26549.0,26549.0,26549.0
mean,3153978.0,71291.24,0.172436,0.828732,13274.682662,10.510076,24.064711,40.465705,412.033321,10.061019,...,0.021413,0.999723,0.001055,292.949527,1.953343,17.419225,14.567969,0.055696,7.718633,0.025
std,3243430.0,48177.64,0.464443,0.975947,7792.32358,4.552667,11.264682,9.340112,239.076134,6.534816,...,0.000503,3.7e-05,8.8e-05,320.171582,3.054017,20.206145,13.265821,0.01818,10.081565,0.014498
min,55742.0,4080.0,0.0,0.0,1000.0,1.0,2.0,36.0,19.87,1.0,...,0.019956,0.999524,0.000762,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,968815.0,45000.0,0.0,0.0,7200.0,7.0,16.0,36.0,233.91,6.0,...,0.02105,0.999699,0.000977,120.0,0.760162,7.0,6.0,0.046875,2.0,0.013986
50%,1516331.0,61000.0,0.0,1.0,12000.0,10.0,23.0,36.0,364.62,9.0,...,0.021465,0.999729,0.001081,206.0,1.136875,12.0,11.0,0.059028,5.0,0.025126
75%,4526257.0,85000.0,0.0,1.0,18000.0,13.0,31.0,36.0,538.4,14.0,...,0.021792,0.999751,0.001139,343.0,2.063849,22.0,19.0,0.066852,10.0,0.035156
max,13048030.0,2000000.0,2.0,3.0,35000.0,48.0,80.0,60.0,1406.45,34.0,...,0.023216,0.999789,0.001244,4517.0,100.0792,347.0,201.0,0.5,208.0,0.166667


### Create NLP Features

In [186]:
#@@ create TF-IDF term matrices for two loan outcomes separately

# split data
data_nlp_2 = data_nlp[['description', 'loan_status']]
good_flags = data_nlp_2['loan_status'] == True
good_nlp = data_nlp_2.loc[good_flags, :]
bad_nlp = data_nlp_2.loc[~good_flags, :]

In [187]:
# compute matrices
vectorizer_good = TfidfVectorizer(stop_words = 'english', ngram_range = (1, 1), norm = None,
                                 use_idf = True)
vectorizer_bad = TfidfVectorizer(stop_words = 'english', ngram_range = (1, 1), norm = None,
                                use_idf = True)
good_matrix = vectorizer_good.fit_transform(good_nlp['description'].values)
bad_matrix = vectorizer_bad.fit_transform(bad_nlp['description'].values)

In [188]:
# sum term scores
good_term_scores = good_matrix.sum(axis = 0)
bad_term_scores = bad_matrix.sum(axis = 0)

good_term_scores = Preprocessing.normalize(good_term_scores, axis = 1)
bad_term_scores = Preprocessing.normalize(bad_term_scores, axis = 1)

In [189]:
# get feature names
good_terms = vectorizer_good.get_feature_names()
bad_terms = vectorizer_bad.get_feature_names()

In [190]:
### calculate absolute differences between normalized term scores

n_good = len(good_terms)
n_bad = len(bad_terms)

good_diffs = np.zeros(n_good)
bad_diffs = np.zeros(n_bad)

# consider all words in good loans
for good_term in range(n_good):
    try: 
        match_index = bad_terms.index(good_terms[good_term])
        good_diffs[good_term] = np.abs(good_term_scores[0, good_term] - 
                                       bad_term_scores[0, match_index])
    except ValueError:
        good_diffs[good_term] = np.abs(good_term_scores[0, good_term])

# # only consider words uniquely in bad loans, since matches already considered above in loop
for bad_term in range(n_bad):
    try:
        match_index = good_terms.index(bad_terms[bad_term])
        bad_diffs[bad_term] = -1
    except ValueError:
        bad_diffs[bad_term] = np.abs(bad_term_scores[0, bad_term])

In [191]:
### find top different terms

good_diffs_df = pd.DataFrame({'diffs': good_diffs, 'terms': good_terms})
bad_diffs_df = pd.DataFrame({'diffs': bad_diffs, 'terms': bad_terms})

diffs_df = pd.concat((good_diffs_df, bad_diffs_df), axis = 0)
diffs_df_sort = diffs_df.sort_values(by = 'diffs', ascending = False)
diffs_df_sort.head(20)

Unnamed: 0,diffs,terms
2,0.037362,_number_
2831,0.02725,rate
2547,0.021531,payment
507,0.019195,busi
760,0.017567,consolid
326,0.015455,balanc
454,0.01478,br
2538,0.014611,pay
2340,0.013468,need
3971,0.013168,year


In [192]:
### add term flags to data as new features

num_top_diffs = 30
diff_indexes = np.zeros(num_top_diffs).astype(int)
for diff in range(num_top_diffs):
    match_index = np.where(feature_names == diffs_df_sort.iloc[diff, 1])[0]
    diff_indexes[diff] = match_index
    
count_cols = desc_matrix[:, diff_indexes]
count_col_names = feature_names[diff_indexes, 0].astype(str)
count_cols_df = pd.DataFrame(count_cols.toarray(), columns = count_col_names)
if not op.isfile(diff_terms_file):
    count_cols_df.to_json(diff_terms_file, date_unit = 's')