In [44]:
# Imports
import pickle
import operator
import numpy as np
import pandas as pd
from collections import defaultdict

In [45]:
# Train vs val/test
# Pipeline mode switch read by the cells below:
#   True  -> read data/lending_train.csv, fit the zipcode buckets and the
#            normalization statistics, and pickle them under data/.
#   False -> read data/lending_topredict.csv and reuse those pickled artifacts.
train = False

In [46]:
# Getting the data
# `train` selects which CSV feeds the rest of the notebook.
data_path = "data/lending_train.csv" if train else "data/lending_topredict.csv"
df = pd.read_csv(data_path)

# Show every column when a frame is displayed.
# The fully-qualified option name is required: the abbreviated 'max_columns'
# pattern matches more than one option on newer pandas releases
# (e.g. styler.render.max_columns) and raises OptionError there.
pd.set_option('display.max_columns', None)

In [47]:
# Removing bad cols
# 'race' and 'extended_reason' are dropped in both modes.
# Training data additionally loses 'ID'; data to predict keeps 'ID' (it is
# written out with the final CSV) but loses the 'loan_paid' column.
unused_cols = ['race', 'extended_reason']
if train:
    unused_cols = ['ID'] + unused_cols
else:
    unused_cols = unused_cols + ['loan_paid']

df = df.drop(columns=unused_cols)

In [48]:
# Record, per row, how many fields are null at this point (i.e. before any
# imputation below), and keep that count as a feature.
missing_vals = df.isna().sum(axis=1)
df = df.assign(missing_vals=missing_vals)

In [49]:
# Removing rows with a lot of missing values:
# a row survives only if at least (number of columns - 2) of its fields are
# non-null, i.e. it has at most two missing values.
# NOTE(review): this also runs when `train` is False, so rows that need a
# prediction can be dropped here — confirm that is intended.
min_non_null = df.shape[1] - 2
df = df.dropna(axis=0, thresh=min_non_null)

In [50]:
# Filling in bad values using median or mode
#   categorical_cols -> most frequent value of the column
#   numerical_cols   -> median of the column
#   other_cols       -> the literal string 'unknown'
# The statistics are computed on whichever file is currently loaded.
categorical_cols = ['employment_length', 'zipcode', 'state']
numerical_cols = ['debt_to_income_ratio', 'public_bankruptcies', 'fico_inquired_last_6mths', 'months_since_last_delinq', 'total_revolving_limit', 'any_tax_liens']
other_cols = ['employment', 'home_ownership_status']

# Build one {column: fill value} mapping and apply it in a single call.
# The previous `df[col].fillna(value, inplace=True)` form is chained
# assignment: it operates on an intermediate Series, emits warnings on
# pandas 2.x and does not modify `df` at all once Copy-on-Write is enabled.
fill_values = {}
for col in categorical_cols:
    fill_values[col] = df[col].mode()[0]
for col in numerical_cols:
    fill_values[col] = df[col].median()
for col in other_cols:
    fill_values[col] = 'unknown'

# Columns not named in the mapping are left untouched.
df = df.fillna(value=fill_values)

df

Unnamed: 0,ID,requested_amnt,loan_duration,employment,employment_length,reason_for_loan,annual_income,debt_to_income_ratio,employment_verified,public_bankruptcies,zipcode,state,home_ownership_status,delinquency_last_2yrs,fico_score_range_low,fico_score_range_high,fico_inquired_last_6mths,months_since_last_delinq,revolving_balance,total_revolving_limit,type_of_application,any_tax_liens,missing_vals
0,1000000,9600.0,36 months,Computer Technician,< 1 year,debt_consolidation,30000.0,19.01,Verified,0.0,465xx,IN,RENT,0.0,680.0,684.0,0.0,31.0,18492.0,22900.0,Individual,0.0,1
1,1000001,8000.0,36 months,Teacher,6 years,debt_consolidation,34500.0,31.30,Source Verified,0.0,941xx,CA,RENT,0.0,650.0,654.0,0.0,31.0,5119.0,26000.0,Individual,0.0,1
2,1000002,21000.0,36 months,Portability Specialist,10+ years,other,45000.0,13.15,Verified,1.0,330xx,FL,MORTGAGE,0.0,650.0,654.0,0.0,31.0,5579.0,8700.0,Individual,0.0,1
3,1000003,5000.0,36 months,Internal Maintence,2 years,credit_card,30000.0,6.74,Not Verified,0.0,864xx,AZ,RENT,0.0,650.0,654.0,1.0,31.0,3872.0,22100.0,Individual,0.0,1
4,1000004,3000.0,36 months,Office administration,< 1 year,other,38000.0,0.79,Source Verified,0.0,950xx,CA,OWN,0.0,650.0,654.0,0.0,31.0,250.0,8800.0,Individual,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345305,1345305,3000.0,36 months,pharmacy technician,< 1 year,debt_consolidation,24000.0,5.75,Source Verified,0.0,952xx,CA,OWN,0.0,650.0,654.0,0.0,28.0,3629.0,20400.0,Individual,0.0,0
345306,1345306,12000.0,36 months,Senior Account Manage,6 years,debt_consolidation,65000.0,19.31,Source Verified,0.0,142xx,NY,OWN,1.0,650.0,654.0,2.0,19.0,10010.0,18200.0,Individual,0.0,0
345307,1345307,30100.0,36 months,unknown,10+ years,debt_consolidation,87028.0,28.71,Verified,0.0,611xx,IL,MORTGAGE,2.0,665.0,669.0,1.0,9.0,7886.0,9400.0,Individual,0.0,2
345308,1345308,27000.0,36 months,Quality Assurance Associate,7 years,debt_consolidation,100400.0,21.22,Not Verified,0.0,913xx,CA,RENT,0.0,705.0,709.0,0.0,31.0,19109.0,32200.0,Individual,0.0,1


In [51]:
# Feature engineering
# Grab the two FICO bounds before they are dropped from the frame.
fico_low = df['fico_score_range_low']
fico_high = df['fico_score_range_high']

df = (
    df
    # Despite its name this column holds requested_amnt / annual_income
    # (loan amount relative to income). The name is kept as-is because it is
    # part of the output schema.
    # NOTE(review): an annual_income of 0 yields inf/NaN here — confirm the
    # data cannot contain it.
    .assign(income_to_loan_amount_ratio=df['requested_amnt'] / df['annual_income'])
    # Replace the two FICO bounds by their midpoint and their width.
    .drop(columns=['fico_score_range_high', 'fico_score_range_low'])
    .assign(
        fico_score=(fico_high + fico_low) / 2,
        fico_score_range=fico_high - fico_low,
    )
)

In [52]:
# Removing unnecessary strings
# Turn "36 months" -> 36 and "< 1 year" / "6 years" / "10+ years" -> 1 / 6 / 10.
# Note that "< 1 year" and "1 year" both end up as 1.
# Vectorized .str operations replace the former per-row .iloc loop; the
# literal (non-regex) replacements are applied in the same order as before.
loan_duration_months = (
    df['loan_duration']
    .str.replace(" months", "", regex=False)
    .astype('int64')
)
employment_length_years = (
    df['employment_length']
    .str.replace(" years", "", regex=False)
    .str.replace(" year", "", regex=False)
    .str.replace("< ", "", regex=False)
    .str.replace("+", "", regex=False)
    .astype('int64')
)

# Drop and re-add so both columns end up as the last two columns of the frame
# (same column order as before).
df = df.drop(columns=['loan_duration', 'employment_length'])
df['loan_duration'] = loan_duration_months
df['employment_length'] = employment_length_years

In [53]:
# Sanity check: distinct loan durations present after parsing.
set(df['loan_duration'].tolist())

{36, 60}

In [54]:
# Encoding based on data ordinality.
# Each listed column is replaced by the integer code of its category.
# A value missing from a column's mapping raises KeyError.
encodings = {'employment_verified': {'Not Verified': 0, 'Verified': 1, 'Source Verified': 2},
             'type_of_application': {'Individual': 0, 'Joint App': 1},
             'loan_duration': {36: 0, 60: 1}}

for col, mapping in encodings.items():
    encoded = [mapping[raw] for raw in df[col]]
    # Drop + re-add moves the encoded column to the end of the frame.
    df = df.drop(columns=[col])
    df[col] = encoded

In [55]:
# Special encoding for cols with too many unique values:
# keep only the listed categories and collapse everything else into 'other'.
# Matching is exact (case- and whitespace-sensitive), so e.g. 'RN' and
# 'Registered Nurse' stay separate categories.
top_five_employment = ['Teacher', 'Manager', 'Owner', 'Registered Nurse', 'RN']
top_five_reason = ['debt_consolidation', 'credit_card', 'home_improvement', 'major_purchase', 'small_business']

# Vectorized replacement for the former per-row .iloc loop:
# where() keeps values for which the mask is True and uses 'other' elsewhere
# (missing values are not in the lists, so they also become 'other').
employment_raw = df['employment']
reason_raw = df['reason_for_loan']
employment_grouped = employment_raw.where(employment_raw.isin(top_five_employment), 'other')
reason_grouped = reason_raw.where(reason_raw.isin(top_five_reason), 'other')

# Drop and re-add so both columns move to the end of the frame, as before.
df = df.drop(columns=['employment', 'reason_for_loan'])
df['employment'] = employment_grouped
df['reason_for_loan'] = reason_grouped

In [56]:
# Giving encodings to states based on their ordinality:
# each state is replaced by its position in `ordering`.
# NOTE(review): the list is hard-coded; how the order was derived is not
# visible in this notebook — confirm it was computed on training data only.
ordering = ['MS', 'IA', 'NE', 'AR', 'AL', 'OK', 'LA', 'NY', 'NV', 'TN', 'IN', 'MO', 'NM', 'MD', 'SD', 'FL', 'NJ', 'PA', 'KY', 'NC', 'OH', 'ND', 'MI', 'VA', 'HI', 'TX', 'AZ', 'MN', 'AK', 'CA', 'ID', 'DE', 'MA', 'GA', 'IL', 'WI', 'RI', 'CT', 'KS', 'UT', 'WY', 'SC', 'MT', 'CO', 'WV', 'WA', 'NH', 'OR', 'VT', 'ME', 'DC']

# Dict lookup instead of a linear ordering.index(...) scan for every row.
state_to_rank = {state: rank for rank, state in enumerate(ordering)}

# Keep the previous failure mode: a state that is not in `ordering` is an
# error (ValueError), not a silently missing value.
unknown_states = set(df['state']) - set(state_to_rank)
if unknown_states:
    raise ValueError(
        "state value(s) not in ordering: " + ", ".join(sorted(map(str, unknown_states)))
    )

df['state'] = df['state'].map(state_to_rank)

In [57]:
# Simplifying categories for zipcode based on correlation with loan paid.
# Train mode: every distinct value is assigned one of five labels according to
# where its mean `loan_paid` falls among the 20/40/60/80th percentiles of all
# per-value means; the value -> label mapping is pickled to data/<col>.pickle.
# Predict mode: that mapping is loaded and applied; values that were never
# seen during training get the neutral label 'mid'.
cols = ['zipcode']

for col in cols:
    classification_path = 'data/' + col + '.pickle'

    if (train):
        # Mean target per distinct value, computed in one grouped pass instead
        # of one boolean-mask scan of the whole frame per value.
        values_avg = df.groupby(col)['loan_paid'].mean()

        percentile_20 = values_avg.quantile(.2)
        percentile_40 = values_avg.quantile(.4)
        percentile_60 = values_avg.quantile(.6)
        percentile_80 = values_avg.quantile(.8)

        col_classification = {}
        for value, avg in values_avg.items():
            # Strict '<' comparisons: a mean exactly on a boundary falls into
            # the higher bucket.
            if (avg < percentile_20):
                col_classification[value] = 'bad'
            elif (avg < percentile_40):
                col_classification[value] = 'mid_bad'
            elif (avg < percentile_60):
                col_classification[value] = 'mid'
            elif (avg < percentile_80):
                col_classification[value] = 'mid_good'
            else:
                col_classification[value] = 'good'

        with open(classification_path, 'wb') as handle:
            pickle.dump(col_classification, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Overwrite the column in place (its position in the frame is kept).
        df[col] = df[col].map(col_classification)
    else:
        # SECURITY: pickle.load executes arbitrary code from the file. This
        # file is expected to be the one written by a train-mode run of this
        # notebook; never point this at a pickle from an untrusted source.
        with open(classification_path, 'rb') as handle:
            col_classification = pickle.load(handle)

        # Unseen values map to NaN and are then labelled 'mid'. This replaces
        # a bare `except:` that hid every kind of error, not just unknown keys.
        df[col] = df[col].map(col_classification).fillna('mid')

In [58]:
# One-hot encoding necessary columns.
# A single get_dummies call encodes all listed columns; the indicator columns
# are appended after the remaining columns, grouped in the order listed here.
# NOTE(review): the indicator columns are derived from whichever file is
# loaded, so train and predict outputs can differ if a category is absent
# from one of them — confirm the downstream model handles that.
cols_to_one_hot_encode = ['reason_for_loan', 'employment', 'home_ownership_status', 'zipcode']
df = pd.get_dummies(df, columns=cols_to_one_hot_encode)

In [59]:
# Fixing ordinality of months_since_last_delinq:
# a value of exactly 0 is recoded to 250; every other value is kept.
# NOTE(review): presumably 0 encodes "no delinquency on record" and 250 is a
# sentinel larger than any real value — confirm against the data dictionary.
# Vectorized where() replaces the former per-row .iloc loop.
months_since_delinq = df['months_since_last_delinq']
df['months_since_last_delinq'] = months_since_delinq.where(months_since_delinq != 0, 250)

In [60]:
# Normalizing columns (z-score: subtract the mean, divide by the std).
# Train mode computes (mean, std) per column from the current frame and
# pickles them; predict mode loads the pickled training statistics so both
# files are scaled identically.
columns_to_norm = ['requested_amnt', 'annual_income', 'debt_to_income_ratio', 'months_since_last_delinq', 'fico_score', 'revolving_balance', 'total_revolving_limit']

if train:
    train_attrs = {name: (df[name].mean(), df[name].std()) for name in columns_to_norm}
    with open('data/train_attributes.pickle', 'wb') as handle:
        pickle.dump(train_attrs, handle, protocol=pickle.HIGHEST_PROTOCOL)
else:
    # NOTE: pickle.load runs arbitrary code from the file; only load the
    # artifact produced by a train-mode run of this notebook.
    with open('data/train_attributes.pickle', 'rb') as handle:
        train_attrs = pickle.load(handle)

for name in columns_to_norm:
    col_mean, col_std = train_attrs[name]
    df[name] = (df[name] - col_mean) / col_std

In [61]:
# Cast the count-like columns to 32-bit integers in one astype call.
cols = ['public_bankruptcies', 'delinquency_last_2yrs', 'fico_inquired_last_6mths', 'any_tax_liens']
df = df.astype({name: np.int32 for name in cols})

In [62]:
# Converting from int64 to int32 to save space + compatibility issues.
# NOTE(review): values outside the int32 range would not survive this cast —
# confirm no column (e.g. ID) can exceed it.
int64_cols = [name for name, dtype in df.dtypes.items() if dtype == 'int64']
df = df.astype({name: np.int32 for name in int64_cols})

In [63]:
# Converting from float64 to float32 to save space + compatibility issues
# (float32 keeps roughly 7 significant decimal digits).
float64_cols = [name for name, dtype in df.dtypes.items() if dtype == 'float64']
df = df.astype({name: np.float32 for name in float64_cols})

In [64]:
# Sanity check: show the dtype of every column after the casts above.
df.dtypes

ID                                      int32
requested_amnt                        float32
annual_income                         float32
debt_to_income_ratio                  float32
public_bankruptcies                     int32
state                                   int32
delinquency_last_2yrs                   int32
fico_inquired_last_6mths                int32
months_since_last_delinq              float32
revolving_balance                     float32
total_revolving_limit                 float32
any_tax_liens                           int32
missing_vals                            int32
income_to_loan_amount_ratio           float32
fico_score                            float32
fico_score_range                      float32
employment_length                       int32
employment_verified                     int32
type_of_application                     int32
loan_duration                           int32
reason_for_loan_credit_card             uint8
reason_for_loan_debt_consolidation

In [65]:
# Write to csv
# The destination depends on the mode; the row index is not written.
output_path = 'data/train_preprocessed_data.csv' if train else 'data/predict_preprocessed_data.csv'
df.to_csv(output_path, index=False)
df

Unnamed: 0,ID,requested_amnt,annual_income,debt_to_income_ratio,public_bankruptcies,state,delinquency_last_2yrs,fico_inquired_last_6mths,months_since_last_delinq,revolving_balance,total_revolving_limit,any_tax_liens,missing_vals,income_to_loan_amount_ratio,fico_score,fico_score_range,employment_length,employment_verified,type_of_application,loan_duration,reason_for_loan_credit_card,reason_for_loan_debt_consolidation,reason_for_loan_home_improvement,reason_for_loan_major_purchase,reason_for_loan_other,reason_for_loan_small_business,employment_Manager,employment_Owner,employment_RN,employment_Registered Nurse,employment_Teacher,employment_other,home_ownership_status_ANY,home_ownership_status_MORTGAGE,home_ownership_status_NONE,home_ownership_status_OTHER,home_ownership_status_OWN,home_ownership_status_RENT,zipcode_bad,zipcode_good,zipcode_mid,zipcode_mid_bad,zipcode_mid_good
0,1000000,-0.564163,-0.663909,0.078216,0,10,0,0,-0.114735,0.095393,-0.256616,0,1,0.320000,-0.290869,4.0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0
1,1000001,-0.747450,-0.600644,1.276768,0,29,0,0,-0.114735,-0.495263,-0.172716,0,1,0.231884,-1.196633,4.0,6,2,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0
2,1000002,0.741754,-0.453025,-0.493266,1,15,0,0,-0.114735,-0.474945,-0.640935,0,1,0.466667,-1.196633,4.0,10,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0
3,1000003,-1.091112,-0.663909,-1.118386,0,26,0,1,-0.114735,-0.550340,-0.278268,0,1,0.166667,-1.196633,4.0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0
4,1000004,-1.320221,-0.551437,-1.698645,0,29,0,0,-0.114735,-0.710316,-0.638229,0,1,0.078947,-1.196633,4.0,1,2,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345305,1345305,-1.320221,-0.748263,-1.214933,0,29,0,0,-0.285455,-0.561073,-0.324278,0,0,0.125000,-1.196633,4.0,1,2,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
345306,1345306,-0.289234,-0.171846,0.107472,0,7,1,2,-0.797614,-0.279238,-0.383820,0,0,0.184615,-1.196633,4.0,6,2,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
345307,1345307,1.784196,0.137844,1.024184,0,34,2,1,-1.366679,-0.373050,-0.621990,0,2,0.345866,-0.743751,4.0,10,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0
345308,1345308,1.429078,0.325841,0.293741,0,29,0,0,-0.114735,0.122645,-0.004915,0,1,0.268924,0.463934,4.0,7,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0
