# CS 109A - Intro to Data Science: Project (WIP)
## Predicting Loan Outcomes
## Group: Andrew Greene and David Modjeska
### Harvard University, Fall 2016

In [1]:
import itertools as it
import numpy as np
import os.path as op
import pandas as pd
import re
import sklearn.preprocessing as Preprocessing
import datetime

import enchant

from itertools import combinations
from sklearn.cross_validation import train_test_split as sk_split
from sklearn.decomposition import TruncatedSVD as tSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.io import mmwrite

%matplotlib inline
from IPython.display import display, HTML

In [2]:
### specify dataset proportion to work with

# load and clean full dataset?
load_full = False

# if not loading and cleaning full dataset, what sample percentage?
sample_percent = 10

if load_full:
    pct_str = ""
else: # not load_full
    pct_str = str(sample_percent) + "_pct"

### set intermediate file names

processed_data_train_file = "./intermediate_files/processed_data_train" + pct_str + ".json"
processed_data_test_file = "./intermediate_files/processed_data_test" + pct_str + ".json"

nlp_data_train_file = "./intermediate_files/nlp_data_train" + pct_str + ".json"
nlp_data_test_file = "./intermediate_files/nlp_data_test" + pct_str + ".json"

term_freqs_train_file = "./intermediate_files/term_freqs_train" + pct_str + ".mtx"
term_freqs_test_file = "./intermediate_files/term_freqs_test" + pct_str + ".mtx"

diff_terms_train_file = "./intermediate_files/diff_terms_train" + pct_str + ".json"
diff_terms_test_file = "./intermediate_files/diff_terms_test" + pct_str + ".json"


## Step 1: Load and Clean Data

### Economic Data

In [3]:
# load CPI data (from https://fred.stlouisfed.org/series/CPIAUCSL/downloaddata)
def load_cpi_data():
    cpi_xls = pd.ExcelFile("datasets/CPIAUCSL.xls")
    cpi_sheet = cpi_xls.sheet_names[0]
    cpi_df = cpi_xls.parse(cpi_sheet, header = None, skiprows = 55)
    cpi_df.columns = ['date', 'cpi']
    
    return cpi_df

In [4]:
# load GDP data (from https://fred.stlouisfed.org/series/GDP/downloaddata)
def load_gdp_data():
    gdp_xls = pd.ExcelFile("datasets/GDP.xls")
    gdp_sheet = gdp_xls.sheet_names[0]
    gdp_df = gdp_xls.parse(gdp_sheet, header = None, skiprows = 20)
    gdp_df.columns = ['date', 'gdp']
    
    return gdp_df

In [5]:
# load unemployment data (from https://fred.stlouisfed.org/series/UNRATE/downloaddata)
def load_unemploy_data():
    unemploy_xls = pd.ExcelFile("datasets/UNRATE.xls")
    unemploy_sheet = unemploy_xls.sheet_names[0]
    unemploy_df = unemploy_xls.parse(unemploy_sheet, header = None, skiprows = 25)
    unemploy_df.columns = ['date', 'unemploy']

    return unemploy_df

In [6]:
### load economic data

econ_filename = "econ_data.json"

if not op.isfile(econ_filename):
    cpi_df = load_cpi_data()
    gdp_df = load_gdp_data()
    unemploy_df = load_unemploy_data()
    
    econ_data_2 = pd.merge(cpi_df, gdp_df, 'inner')
    econ_data_3 = pd.merge(econ_data_2, unemploy_df, 'inner')
    
    econ_data_3.to_json("econ_data.json", date_unit = 's')
    
econ_data_4 = pd.read_json(econ_filename)

In [7]:
# convert UNIX timestamp to calendar quarter
# FIX hack to adjust timezone with timedelta
new_col = econ_data_4["date"].copy()
for index in range(econ_data_4.shape[0]):
        new_col[index] = \
             (datetime.datetime.fromtimestamp(econ_data_4["date"].values[index]) +
            datetime.timedelta(hours = 5)).replace(hour = 0)
econ_data_4['date'] = new_col 

In [8]:
econ_data_4.head()

Unnamed: 0,cpi,date,gdp,unemploy
0,23.68,1948-01-01 00:00:00,266.2,3.4
1,23.82,1948-04-01 00:00:00,272.9,3.9
10,24.07,1974-10-01 00:00:00,308.5,5.0
100,42.7,1995-04-01 00:00:00,1380.7,4.9
101,43.7,1952-10-01 00:00:00,1417.6,5.0


### LC Data

In [9]:
# helper function to select the columns of interest from the data set
def Select_Data(data):
    
    # list columns to select
    features_to_select = ["loan_status", "annual_inc", "earliest_cr_line", "delinq_2yrs", \
                          "emp_length", "home_ownership", "inq_last_6mths", "loan_amnt", \
                         "purpose", "open_acc", "total_acc", "term", "installment", \
                         "revol_bal", "sub_grade", "issue_d"]
    ratios_to_select = ["dti", "revol_util"]
    text_to_select = ["desc"]

    # concatenate selected columns
    data_select = pd.concat(( \
                            data[features_to_select],
                            data[ratios_to_select], \
                            data[text_to_select]), \
                            axis = 1)

    # synthesize new columns, and drop temporary columns
    monthly_inc = (data["annual_inc"] / 12)
    data_select["ipr"] = data["installment"] / monthly_inc # income to payment ratio
    data_select["rir"] = data["revol_bal"] / monthly_inc # revolving to income ratio
    data_select = data_select.drop("installment", axis = 1)
    data_select = data_select.drop("revol_bal", axis = 1)
    
    # rename columns for legibility
    data_select.columns = [
        "loan_status", "annual_income", "earliest_credit", "delinq_2_yrs", \
        "employ_length", "home_owner", "inquiry_6_mos", "loan_amount", \
        "loan_purpose", "open_accounts", "total_accounts", "loan_term", \
        "loan_subgrade", "issue_date", "dti", "revol_util", "description", \
        "ipr", "rir" \
    ]
    
    return data_select

In [10]:
# helper function to filter the data set down to rows of interest
def Filter_Data(data_select):
    
    # set flags for resolved loans
    status_flags = (data_select["loan_status"] == "Fully Paid") | \
                    (data_select["loan_status"] == "Charged Off")

    # set flags for date range of interest
    earliest_date = pd.to_datetime("2008-01-01")
    issue_dates = pd.to_datetime(data_select["issue_date"])
    date_flags = (issue_dates > earliest_date)
    
    # set flags for 36-month loan terms
    #term_flags = (data_select['loan_term'] == " 36 months")

    # filter rows per flags of interest
    data_filter = \
        data_select.ix[status_flags & date_flags, :].reset_index()
    
    return data_filter

In [11]:
# helper function to clean data - recoding, retyping, pruning, and censoring
def Clean_Data(data_filter):
    data_clean = data_filter.copy()

    # recode loan status as boolean: charged off = True
    data_clean["loan_status"] = data_clean["loan_status"] == "Charged Off"

    # recode loan subgrades from 1 (best) to 35 (worst)
    num_grades = 5
    grade = data_clean["loan_subgrade"].str[0]
    grade = (pd.DataFrame(ord(c) for c in grade) - ord('A')) * num_grades
    sub_grade = data_clean["loan_subgrade"].str[1].astype('int')
    data_clean["loan_subgrade"] =  grade + sub_grade

    # convert earliest credit date to datetime
    data_clean["earliest_credit"] = pd.to_datetime(data_clean["earliest_credit"])

    # prune extra text in loan term 
    data_clean["loan_term"] = data_clean["loan_term"].str.strip()
    data_clean["loan_term"] = data_clean["loan_term"].str.replace(" months", "")


    # prune extra text in employment length, and right-censor
    data_clean["employ_length"] = data_clean["employ_length"].str.replace(" years*", "")
    data_clean["employ_length"] = data_clean["employ_length"].str.replace("10\+", "10")
    data_clean["employ_length"] = data_clean["employ_length"].str.replace("< 1", "0")
    
    # right-censor delinquencies and inquiries
    data_clean["delinq_2_yrs"] = np.clip(data_clean["delinq_2_yrs"], 0, 2)
    data_clean["inquiry_6_mos"] = np.clip(data_clean["inquiry_6_mos"], 0, 3)
        
    return data_clean

In [12]:
# helper function to clean data - filtering nuisance NaNs (not structural NaNs)
def Clean_Data2(data_clean):
    n, p = data_clean.shape
    
    # count nulls by column
    col_nan_pct = data_clean.isnull().sum() / n
    
    # flag columns that have some nuisance nulls
    cols_with_nans = (col_nan_pct > 0.0) & (col_nan_pct < 0.01)
    
    # flag rows that have some nuisance nulls in the flagged columns
    rows_without_nans_flags = data_clean.ix[:, cols_with_nans].notnull()
    
    # index the flagged rows that contain some nuisance nulls
    rows_without_nans_indexes = np.where(rows_without_nans_flags)
    
    # filter the data set to rows that contain no nuisance nulls
    data_clean2 = data_clean.ix[rows_without_nans_indexes[0], :]
    
    return data_clean2

In [13]:
# helper function to pre-process each data subset to get around memory limits
def Prep_Data_Part(index, num_parts, file_prefix, data_all):
    filename = file_prefix + str(index) + ".csv"
    
    # pre-process new data part if the file doesn't already exist
    if not op.isfile(filename):
        n, p = data_all.shape
        
        # pre-process the row range for this data part, avoiding empty data subsets
        start_row = index * (n / num_parts)
        data_part = data_all.ix[range(start_row, start_row + (n / num_parts)), :]
        if data_part.shape[0] > 0:
            data_select = Select_Data(data_part)
            data_filter = Filter_Data(data_select)
            if data_filter.shape[0] > 0:
                data_clean = Clean_Data(data_filter)
                data_clean2 = Clean_Data2(data_clean)
                data_clean2.to_csv(filename)

In [14]:
# Lending Club (LC) data (from https://www.kaggle.com/wendykan/lending-club-loan-data)
# helper function to pre-process full data set and save new file, 
# or to read pre-processed file if it already exists
def Preprocess_Full_Dataset():
    file_prefix = "./data_parts/loan_clean_part"
    full_clean_data_file = "loan_clean_data.csv"
    num_parts = 30

    # pre-process data set and save result as new file
    if not op.isfile(full_clean_data_file):

        # pre-process and save part files
        data_raw = pd.read_csv("loan.csv")
        for part in range(num_parts):
            Prep_Data_Part(part, num_parts, file_prefix, data_raw)       

        # read and concatenate part files
        data = pd.DataFrame({})
        for part in range(num_parts):
            file_part = file_prefix + str(part) + ".csv"
            if op.isfile(file_part):
                data_part = pd.read_csv(file_part)
                data = pd.concat((data, data_part), axis = 0)

        # save full file
        data = data.reset_index()
        data.to_csv(full_clean_data_file, index = False)

    # read pre-processed full data file
    data = pd.read_csv(full_clean_data_file)
        
    return data

In [15]:
# helper function to pre-process sampled data set and save new file, 
# or to read pre-processed file if it already exists
def Preprocess_Sample_Dataset():
    sample_clean_data_file = "loan_clean_data_" + str(sample_percent) + "_pct.csv"

    # pre-process sample data set and save result as new file
    if not op.isfile(sample_clean_data_file):
        data_raw = pd.read_csv("loan.csv")
        data_sample, data_other = sk_split(data_raw, train_size = sample_percent / 100.0)
        data_select = Select_Data(data_sample)
        data_filter = Filter_Data(data_select)
        data_clean = Clean_Data(data_filter)
        data_clean2 = Clean_Data2(data_clean)
        data_clean2.to_csv(sample_clean_data_file, index = False)

    # read pre-processed sample data file
    data_clean2 = pd.read_csv(sample_clean_data_file)
        
    return data_clean2

In [16]:
# create or load appropriate version of data set for analysis

if load_full:
    data = Preprocess_Full_Dataset()
    
else:
    data = Preprocess_Sample_Dataset()   

In [17]:
# set boolean and string column data types
data["loan_status"] = data["loan_status"].astype(bool)
data["description"] = data["description"].astype('str')
data["issue_date"] = data["issue_date"].astype('str') # for later conversion to datetime

nan_flags = data["description"].str.match("nan")
data.ix[nan_flags, "description"] = None

# replace all numbers with a token
data["description"] = data["description"].str.replace("[0-9]+", "_number_")

In [18]:
# summarize nulls/NaNs in data columns
# FIX - print only cols with nulls
print
print "Count of nulls in data set by column:\n"
print data.isnull().sum()
print


Count of nulls in data set by column:

index                  0
loan_status            0
annual_income          0
earliest_credit        0
delinq_2_yrs           0
employ_length          0
home_owner             0
inquiry_6_mos          0
loan_amount            0
loan_purpose           0
open_accounts          0
total_accounts         0
loan_term              0
loan_subgrade          0
issue_date             0
dti                    0
revol_util             0
description        16666
ipr                    0
rir                    0
dtype: int64



In [19]:
# create new column for first day of quarter that contains issue date
new_col = data["issue_date"].copy()
new_col.name = 'issue_quarter'
for index in range(data.shape[0]):
    new_col.iloc[index] = datetime.datetime.strptime(data["issue_date"].values[index], 
                                                "%b-%Y").replace(day = 1)
    quarter = (new_col.iloc[index].month - 1) // 3
    new_col.iloc[index] = new_col.iloc[index].replace(month = (3 * quarter) + 1)
data = pd.concat((data, new_col), axis = 1).reset_index(drop = True)

In [20]:
# join LC and economic data
print data.shape
data = data.merge(right = econ_data_4, how = 'inner', 
                  left_on = "issue_quarter", right_on = "date", 
                  left_index = True).reset_index(drop = True)
data.drop('issue_quarter', axis = 1, inplace = True)
data.drop('date', axis = 1, inplace = True)

(25439, 21)


In [21]:
# normalize float columns
float_cols = ['dti', 'revol_util', 'ipr', 'rir', 'cpi', 'gdp', 'unemploy']
data[float_cols] = data[float_cols].astype(float)
data[float_cols] = Preprocessing.normalize(data[float_cols]).astype(float)

In [22]:
data.head()

Unnamed: 0,index,loan_status,annual_income,earliest_credit,delinq_2_yrs,employ_length,home_owner,inquiry_6_mos,loan_amount,loan_purpose,...,loan_subgrade,issue_date,dti,revol_util,description,ipr,rir,cpi,gdp,unemploy
0,30060,False,65000.0,1974-12-01,0.0,10,RENT,0.0,25000.0,credit_card,...,7.0,May-2010,0.002904,0.012541,Borrower added on _number_/_number_/_number_...,1.6e-05,0.000769,0.021751,0.99968,0.000919
1,30052,True,97584.0,2000-04-01,0.0,10,MORTGAGE,0.0,24000.0,debt_consolidation,...,22.0,May-2010,0.002054,0.012558,Borrower added on _number_/_number_/_number_...,1.8e-05,0.000203,0.021751,0.999682,0.000919
2,28865,False,75000.0,1993-11-01,0.0,6,MORTGAGE,0.0,10500.0,debt_consolidation,...,2.0,Jun-2010,0.002068,0.003677,Borrower added on _number_/_number_/_number_...,9e-06,0.001381,0.021753,0.999753,0.000919
3,31333,True,64000.0,1994-05-01,1.0,7,MORTGAGE,3.0,15000.0,debt_consolidation,...,7.0,Apr-2010,0.002895,0.0068,Borrower added on _number_/_number_/_number_...,1.6e-05,0.000481,0.021752,0.999736,0.000919
4,29790,False,235000.0,1993-01-01,0.0,3,MORTGAGE,1.0,25000.0,small_business,...,7.0,May-2010,0.001327,0.008448,Borrower added on _number_/_number_/_number_...,4e-06,0.000384,0.021752,0.999726,0.000919


In [23]:
data.describe()

Unnamed: 0,index,annual_income,delinq_2_yrs,inquiry_6_mos,loan_amount,open_accounts,total_accounts,loan_term,loan_subgrade,dti,revol_util,ipr,rir,cpi,gdp,unemploy
count,25439.0,25439.0,25439.0,25439.0,25439.0,25439.0,25439.0,25439.0,25439.0,25439.0,25439.0,25439.0,25439.0,25439.0,25439.0,25439.0
mean,252899.832855,71947.34,0.214041,0.818625,13588.464562,10.953968,25.099021,41.381344,11.035536,0.002427,0.007971,1.147308e-05,0.000394,0.020907,0.999738,0.00097
std,216398.050977,45298.66,0.517693,0.969848,8158.083503,4.849334,11.67888,10.009861,6.69912,0.001124,0.003724,6.064169e-06,0.000335,0.000711,3.7e-05,0.000135
min,12.0,4200.0,0.0,0.0,1000.0,1.0,2.0,36.0,2.0,0.0,0.0,4.884525e-07,0.0,0.019594,0.999537,0.000705
25%,98809.5,45000.0,0.0,0.0,7200.0,8.0,17.0,36.0,7.0,0.001583,0.005256,6.829929e-06,0.000174,0.020243,0.999716,0.00088
50%,196302.0,62000.0,0.0,1.0,12000.0,10.0,23.0,36.0,12.0,0.00239,0.008133,1.059864e-05,0.000321,0.020924,0.999744,0.000967
75%,361455.5,86400.0,0.0,1.0,18550.0,14.0,32.0,36.0,17.0,0.003245,0.010779,1.541481e-05,0.000528,0.021466,0.999765,0.001082
max,887279.0,1250000.0,2.0,3.0,35000.0,49.0,91.0,60.0,32.0,0.005396,0.01936,4.702698e-05,0.008012,0.023216,0.999808,0.001244


In [24]:
# calculate description lengths in characters
description_flags = data["description"].notnull()
descriptions = data.ix[description_flags, "description"]
description_lengths = descriptions.str.len()
data['desc_len'] = description_lengths

In [25]:
### write data frame to intermediate file
mask = np.random.rand(data.shape[0]) < 0.7
data_train = data.iloc[mask, :]
data_test = data.iloc[~mask, :]     

data_train.to_json(processed_data_train_file, date_unit = 's')
data_test.to_json(processed_data_test_file, date_unit = 's')

data = data_train

### Text Analysis

In [26]:
# extract and pre-process loan description and loan_status for NLP
data_nlp = data.loc[description_flags, :].copy()
data_nlp["description"] = data_nlp["description"].str.replace("Borrower.* > ", "")

In [27]:
### set up stemming

from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer(language = 'english', ignore_stopwords = True)
analyzer = TfidfVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

def take(n, seq):
    seq = iter(seq)
    result = []
    try:
        for i in range(n):
            result.append(seq.next())
    except StopIteration:
        pass
    
    return result

In [28]:
# stem words in Description field
for index in range(data_nlp.shape[0]):
    data_nlp['description'].values[index] = \
        " ".join(take(1000, stemmed_words(data_nlp['description'].values[index])))

In [29]:
# create n-grams from loan description
vectorizer = CountVectorizer(stop_words = 'english', ngram_range = (1, 1))
desc_matrix = vectorizer.fit_transform(data_nlp['description'].values)
n, p = desc_matrix.shape
print desc_matrix.shape

mmwrite(term_freqs_train_file, desc_matrix)

(6086, 4872)


In [30]:
# apply SVD to document-term matrix
tsvd = tSVD(n_components = 100)
desc_matrix_reduce = tsvd.fit_transform(desc_matrix)

In [31]:
data_nlp['desc_matrix_reduce'] = desc_matrix_reduce[:, 0]

In [32]:
# print descriptive information about n-grams
feature_names = np.array(vectorizer.get_feature_names()).reshape(-1, 1)
print "Number of descriptions and terms:", n, p
print
print "Sample terms:", 
pd.DataFrame(feature_names[:10, 0])

Number of descriptions and terms: 6086 4872

Sample terms:

Unnamed: 0,0
0,_number_
1,_number_a
2,_number_b
3,_number_bdrm
4,_number_br
5,_number_cc
6,_number_d
7,_number_dollar
8,_number_entertain
9,_number_f





In [33]:
data_nlp['desc_word_count'] = desc_matrix.sum(axis = 1)

In [34]:
data_nlp['vocab_count'] = (desc_matrix > 0).sum(axis=1)
data_nlp['vocab_count_norm'] = data_nlp['vocab_count'] \
    / data_nlp['desc_len'].astype(float) 

In [35]:
# split term matrix into defaulted vs. fully repaid
mask = data_nlp["loan_status"].values == False
bad_term_matrix = desc_matrix[mask]
good_term_matrix = desc_matrix[~mask]

all_term_dict = zip(vectorizer.get_feature_names(),
    np.asarray(desc_matrix.sum(axis = 0)).ravel())
all_term_dict_df = pd.DataFrame(all_term_dict).sort_values(by = [1], \
                                                                   ascending = False)
bad_term_dict = zip(vectorizer.get_feature_names(),
    np.asarray(bad_term_matrix.sum(axis = 0)).ravel())
bad_term_dict_df = pd.DataFrame(bad_term_dict).sort_values(by = [1], \
                                                                   ascending = False)
good_term_dict = zip(vectorizer.get_feature_names(),
    np.asarray(good_term_matrix.sum(axis = 0)).ravel())
good_term_dict_df = pd.DataFrame(good_term_dict).sort_values(by = [1], \
                                                                 ascending = False)

top_bad_dict_df = bad_term_dict_df.iloc[:125, :]
top_good_dict_df = good_term_dict_df.iloc[:125, :]

bad_only_df = pd.DataFrame(list(set(top_bad_dict_df[0]) - set(top_good_dict_df[0])))
good_only_df = pd.DataFrame(list(set(top_good_dict_df[0]) - set(top_bad_dict_df[0])))

In [36]:
print
print "Most Frequent Terms in Descriptions of All Loans:"
all_term_dict_df.head(20)


Most Frequent Terms in Descriptions of All Loans:


Unnamed: 0,0,1
541,br,7022
0,_number_,6109
1036,credit,4071
2562,loan,3815
3149,pay,3644
660,card,3563
1127,debt,2987
3162,payment,2157
932,consolid,2050
2822,month,1602


In [37]:
print
print "Most Frequent Terms Only in Descriptions of Defaulted Loans:"
bad_only_df


Most Frequent Terms Only in Descriptions of Defaulted Loans:


Unnamed: 0,0
0,apr
1,past
2,rent
3,respons
4,opportun
5,wed
6,excel
7,littl
8,low
9,complet


In [38]:
print
print "Most Frequent Terms Only in Descriptions of Fully Repaid Loans:"
good_only_df


Most Frequent Terms Only in Descriptions of Fully Repaid Loans:


Unnamed: 0,0
0,major
1,extra
2,feel
3,replac
4,hard
5,realli
6,alreadi
7,open
8,build
9,instead


In [39]:
### count misspellings

d = enchant.Dict("en_US")

num_terms = all_term_dict_df.shape[0]
misspellings = np.zeros(num_terms)
for index in range(num_terms):
    misspellings[index] = not d.check(all_term_dict_df.iloc[index, 0])

desc_matrix_misspell = desc_matrix[:, misspellings > 0]
data_nlp['misspell_count'] = desc_matrix_misspell.sum(axis=1)

In [40]:
data_nlp['misspell_count_norm'] = data_nlp['misspell_count'] \
    / data_nlp['desc_len'].astype(float) 
data_nlp.to_json(nlp_data_train_file, date_unit = 's')

In [41]:
data_nlp.describe()

Unnamed: 0,index,annual_income,delinq_2_yrs,inquiry_6_mos,loan_amount,open_accounts,total_accounts,loan_term,loan_subgrade,dti,...,cpi,gdp,unemploy,desc_len,desc_matrix_reduce,desc_word_count,vocab_count,vocab_count_norm,misspell_count,misspell_count_norm
count,6086.0,6086.0,6086.0,6086.0,6086.0,6086.0,6086.0,6086.0,6086.0,6086.0,...,6086.0,6086.0,6086.0,6086.0,6086.0,6086.0,6086.0,6086.0,6086.0,6086.0
mean,144361.385475,70497.457752,0.170884,0.834703,13344.94742,10.577062,24.232994,40.578377,9.882024,0.002455,...,0.021417,0.999723,0.001055,296.883174,1.907025,17.759777,14.785081,0.055769,10.270128,0.036341
std,110633.176465,43874.106137,0.456898,0.985658,7835.647409,4.565264,11.21958,9.430489,6.455069,0.001138,...,0.000508,3.6e-05,8.8e-05,327.132319,3.33709,21.374467,13.70616,0.01794,12.457731,0.014133
min,12.0,4200.0,0.0,0.0,1000.0,1.0,3.0,36.0,2.0,0.0,...,0.020494,0.999537,0.000903,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,36608.25,44000.0,0.0,0.0,7431.25,7.0,16.0,36.0,7.0,0.001593,...,0.02105,0.999701,0.000979,118.0,0.669999,7.0,6.0,0.046737,4.0,0.027778
50%,157612.0,60000.0,0.0,1.0,12000.0,10.0,23.0,36.0,7.0,0.002447,...,0.021465,0.999729,0.001081,208.0,1.037905,12.0,11.0,0.059259,7.0,0.0375
75%,202847.25,85000.0,0.0,1.0,18000.0,13.0,31.0,36.0,12.0,0.003281,...,0.021792,0.999751,0.001139,343.0,2.001544,22.0,19.0,0.066667,12.0,0.045455
max,465820.0,750000.0,2.0,3.0,35000.0,49.0,82.0,60.0,32.0,0.005366,...,0.023216,0.999789,0.001244,4487.0,107.572238,399.0,193.0,0.153846,273.0,0.153846


### Create NLP Features

In [42]:
#@@ create TF-IDF term matrices for two loan outcomes separately

# split data
data_nlp_2 = data_nlp[['description', 'loan_status']]
good_flags = data_nlp_2['loan_status'] == True
good_nlp = data_nlp_2.loc[good_flags, :]
bad_nlp = data_nlp_2.loc[~good_flags, :]

In [43]:
# compute matrices
vectorizer_good = TfidfVectorizer(stop_words = 'english', ngram_range = (1, 1), norm = None,
                                 use_idf = True)
vectorizer_bad = TfidfVectorizer(stop_words = 'english', ngram_range = (1, 1), norm = None,
                                use_idf = True)
good_matrix = vectorizer_good.fit_transform(good_nlp['description'].values)
bad_matrix = vectorizer_bad.fit_transform(bad_nlp['description'].values)

In [44]:
# sum term scores
good_term_scores = good_matrix.sum(axis = 0)
bad_term_scores = bad_matrix.sum(axis = 0)

good_term_scores = Preprocessing.normalize(good_term_scores, axis = 1)
bad_term_scores = Preprocessing.normalize(bad_term_scores, axis = 1)

In [45]:
# get feature names
good_terms = vectorizer_good.get_feature_names()
bad_terms = vectorizer_bad.get_feature_names()

In [46]:
### calculate absolute differences between normalized term scores

n_good = len(good_terms)
n_bad = len(bad_terms)

good_diffs = np.zeros(n_good)
bad_diffs = np.zeros(n_bad)

# consider all words in good loans
for good_term in range(n_good):
    try: 
        match_index = bad_terms.index(good_terms[good_term])
        good_diffs[good_term] = np.abs(good_term_scores[0, good_term] - 
                                       bad_term_scores[0, match_index])
    except ValueError:
        good_diffs[good_term] = np.abs(good_term_scores[0, good_term])

# # only consider words uniquely in bad loans, since matches already considered above in loop
for bad_term in range(n_bad):
    try:
        match_index = good_terms.index(bad_terms[bad_term])
        bad_diffs[bad_term] = -1
    except ValueError:
        bad_diffs[bad_term] = np.abs(bad_term_scores[0, bad_term])

In [47]:
### find top different terms

good_diffs_df = pd.DataFrame({'diffs': good_diffs, 'terms': good_terms})
bad_diffs_df = pd.DataFrame({'diffs': bad_diffs, 'terms': bad_terms})

diffs_df = pd.concat((good_diffs_df, bad_diffs_df), axis = 0)
diffs_df_sort = diffs_df.sort_values(by = 'diffs', ascending = False)
diffs_df_sort.head(20)

Unnamed: 0,diffs,terms
0,0.072127,_number_
230,0.032541,busi
1261,0.027626,pay
1162,0.02689,need
832,0.025251,help
157,0.021182,balanc
1404,0.019628,rate
360,0.019112,consolid
114,0.018866,apr
1650,0.018301,start


In [48]:
### add term flags to data as new features

num_top_diffs = 30
diff_indexes = np.zeros(num_top_diffs).astype(int)
for diff in range(num_top_diffs):
    match_index = np.where(feature_names == diffs_df_sort.iloc[diff, 1])[0]
    diff_indexes[diff] = match_index
    
count_cols = desc_matrix[:, diff_indexes]
count_col_names = feature_names[diff_indexes, 0].astype(str)
count_cols_df = pd.DataFrame(count_cols.toarray(), columns = count_col_names)
count_cols_df.to_json(diff_terms_train_file, date_unit = 's')