In [21]:

import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import warnings

warnings.filterwarnings('ignore')


def spaceless_lowers(df):
    """remove spaces and capital letters from column names. replace with _ and lowercase, respectively"""

    new_cols = []
    cols = df.columns

    for col in cols:

        if df[col].dtype == 'object':
            df[col] = df[col].str.lower()
        new_col = col.replace(' ', '_').lower()
        new_cols.append(new_col)

    df.columns = new_cols

    return df


train_df = pd.read_csv('Resources/2019loans.csv')
test_df = pd.read_csv('Resources/2020Q1loans.csv')

use_cols = ['home_ownership', 'verification_status',
            'application_type', 'hardship_flag',
            'debt_settlement_flag', 'initial_list_status',
            'pymnt_plan', 'loan_status']



In [22]:
train_df = train_df.dropna(axis='columns', how='all')
test_df = test_df.dropna(axis='columns', how='all')

train_df = train_df.dropna()

train_df = train_df.drop(['Unnamed: 0', 'index'], axis=1)
test_df = test_df.drop(['Unnamed: 0', 'index'], axis=1)

test_df = spaceless_lowers(test_df)
train_df = spaceless_lowers(train_df)

# more straightforward to do it without this
# train_df = pd.get_dummies(train_df, columns=use_cols)
# test_df = pd.get_dummies(test_df, columns=use_cols)

dfs = [train_df, test_df]

In [23]:
train_df

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,13375.0,0.1797,483.34,mortgage,223000.0,not verified,low_risk,n,29.99,0.0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,n,n
1,21000.0,0.1308,478.68,mortgage,123000.0,source verified,low_risk,n,11.26,2.0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,n,n
2,20000.0,0.1240,448.95,mortgage,197000.0,source verified,low_risk,n,11.28,0.0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,n,n
3,3000.0,0.1240,100.22,rent,45000.0,not verified,low_risk,n,18.08,0.0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,n,n
4,30000.0,0.1612,1056.49,mortgage,133000.0,source verified,low_risk,n,27.77,0.0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,n,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,19975.0,0.2565,801.09,rent,28000.0,not verified,high_risk,n,28.42,0.0,...,100.0,16.7,0.0,0.0,50055.0,28192.0,18700.0,19055.0,n,n
12176,15000.0,0.1774,540.34,rent,50000.0,verified,high_risk,n,23.43,4.0,...,90.5,11.1,0.0,0.0,70324.0,57025.0,13300.0,54824.0,n,n
12177,3600.0,0.1862,131.28,rent,60000.0,not verified,high_risk,n,28.80,0.0,...,100.0,0.0,0.0,0.0,83765.0,55156.0,14800.0,53065.0,n,n
12178,15000.0,0.0881,475.68,mortgage,62000.0,source verified,high_risk,n,11.44,0.0,...,100.0,0.0,0.0,0.0,189930.0,23748.0,7000.0,32930.0,n,n


In [24]:
train_df.columns

Index(['loan_amnt', 'int_rate', 'installment', 'home_ownership', 'annual_inc',
       'verification_status', 'loan_status', 'pymnt_plan', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'application_type', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal',
       'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m',
       'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',
       'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',
       'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal',
       'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt',
       'mo_sin_old_il_acct', 'mo_sin_old

In [25]:
obj_cols = [col for col in train_df.columns if train_df[col].dtype == 'object']

for col in obj_cols:
    train_df[col] = train_df[col].str.lower().str.replace(' ', '_')
    test_df[col] = test_df[col].str.lower().str.replace(' ', '_')

for df in dfs:
    home_ownership = df['home_ownership']
    verification_status = df['verification_status']
    pymnt_plan = df['pymnt_plan']
    initial_list_status = df['initial_list_status']
    application_type = df['application_type']
    hardship_flag = df['hardship_flag']
    debt_settlement_flag = df['debt_settlement_flag']
    loan_status = df['loan_status']

    home_ownership.replace(
        {
            'mortgage': 0,
            'rent': 1,
            'own': 2,
            'any': 3

        }, inplace=True
    )

    loan_status.replace(
        {
            'low_risk': 0,
            'high_risk': 1
        }, inplace=True
    )

    verification_status.replace(
        {
            'not_verified': 0,
            'source_verified': 1,
            'verified': 2
        }, inplace=True
    )

    pymnt_plan.replace(
        {
            'n': 0
        }, inplace=True
    )

    initial_list_status.replace(
        {
            'w': 0,
            'f': 1
        }, inplace=True
    )

    application_type.replace(
        {
            'individual': 0,
            'joint_app': 1
        }, inplace=True
    )

    hardship_flag.replace(
        {
            'n': 0,
            'y': 1
        }, inplace=True
    )

    debt_settlement_flag.replace(
        {
            'n': 0,
            'y': 1
        }, inplace=True)



In [26]:
train_df[obj_cols]

Unnamed: 0,home_ownership,verification_status,loan_status,pymnt_plan,initial_list_status,application_type,hardship_flag,debt_settlement_flag
0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
12175,1,0,1,0,0,0,0,0
12176,1,2,1,0,0,0,0,0
12177,1,0,1,0,0,0,0,0
12178,0,1,1,0,0,1,0,0


In [27]:
X_train = train_df.drop('loan_status', axis=1)
y_train = train_df['loan_status'].values

for col in obj_cols:
    print(train_df[col].value_counts(), '\n----\n')

0    5800
1    4944
2    1371
3      65
Name: home_ownership, dtype: int64 
----

0    5301
1    4881
2    1998
Name: verification_status, dtype: int64 
----

0    6090
1    6090
Name: loan_status, dtype: int64 
----

0    12180
Name: pymnt_plan, dtype: int64 
----

0    11158
1     1022
Name: initial_list_status, dtype: int64 
----

0    10400
1     1780
Name: application_type, dtype: int64 
----

0    11832
1      348
Name: hardship_flag, dtype: int64 
----

0    12175
1        5
Name: debt_settlement_flag, dtype: int64 
----



In [28]:
X_test = test_df.drop('loan_status', axis=1).dropna()
y_test = test_df['loan_status'].values

In [29]:
classifier = LogisticRegression(max_iter=300)
classifier.fit(X_train, y_train)

print(
    f'The training coefficient is {classifier.score(X_train, y_train)}\n'
    f'The testing coefficient is {classifier.score(X_test, y_test)}\n'
)

The training coefficient is 0.6786535303776683
The testing coefficient is 0.546575925138239



In [30]:
rand_class = RandomForestClassifier(
    random_state=42,
    n_estimators=300
).fit(X_train, y_train)

print(
    f'The random forest training coefficient is {rand_class.score(X_train, y_train)}\n'
    f'The random forest testing coefficient is {rand_class.score(X_test, y_test)}\n'
)

The random forest training coefficient is 1.0
The random forest testing coefficient is 0.6431305827307529



In [31]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

classifier.fit(X_train_scaled, y_train)

print(
    f'The scaled regression training coefficient is {classifier.score(X_train_scaled, y_train)}\n'
    f'The scaled regression testing coefficient is {classifier.score(X_test_scaled, y_test)}\n'
)

The scaled regression training coefficient is 0.7082101806239737
The scaled regression testing coefficient is 0.6616333475116971



In [32]:
print(f'Scaled Random Forest coefficient: {rand_class.score(X_test_scaled, y_test)}')

Scaled Random Forest coefficient: 0.5
