In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Read CSV files, remove unnamed column
train_df = pd.read_csv(Path('Resources/2019loans.csv'), index_col=0)
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'), index_col=0)

In [3]:
train_df.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,29.99,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,11.26,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,11.28,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,18.08,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,27.77,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [4]:
test_df.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,19.75,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,11.52,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,6.74,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,12.13,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,16.08,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


# Convert categorical data to numeric and separate target feature for TRAINING data
# Index and unnamed columns can be dropped or made into index

In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [6]:
# Create the X feature vector (train data), dropping the label feature and index column
X_train = train_df.drop(columns=["loan_status", "index"])
X_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,n,29.99,0.0,0.0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,n,11.26,2.0,0.0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,n,11.28,0.0,0.0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,n,18.08,0.0,0.0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,n,27.77,0.0,2.0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [7]:
X_train.columns

Index(['loan_amnt', 'int_rate', 'installment', 'home_ownership', 'annual_inc',
       'verification_status', 'pymnt_plan', 'dti', 'delinq_2yrs',
       'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'application_type', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal',
       'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m',
       'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',
       'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',
       'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal',
       'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt',
       'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'm

In [8]:
# Display the binary text features to be used with label encoder (avoid making extra features/columns)
X_train[["application_type", "debt_settlement_flag","hardship_flag","initial_list_status","pymnt_plan"]].head()

Unnamed: 0,application_type,debt_settlement_flag,hardship_flag,initial_list_status,pymnt_plan
57107,Individual,N,N,w,n
141451,Individual,N,N,w,n
321143,Individual,N,N,w,n
11778,Individual,N,N,w,n
169382,Individual,N,N,w,n


In [9]:
# Display the count of unique values in these text data features
X_train[["application_type", "debt_settlement_flag","hardship_flag","initial_list_status","pymnt_plan"]].nunique()

application_type        2
debt_settlement_flag    2
hardship_flag           2
initial_list_status     2
pymnt_plan              1
dtype: int64

In [10]:
# Drop pymnt_plan column which has only a single value
X_train = X_train.drop('pymnt_plan', axis=1)
X_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,dti,delinq_2yrs,inq_last_6mths,open_acc,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,29.99,0.0,0.0,15.0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,11.26,2.0,0.0,16.0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,11.28,0.0,0.0,12.0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,18.08,0.0,0.0,12.0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,27.77,0.0,2.0,13.0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [11]:
# Use Label Encoder to preprocess binary text features (convert to numeric without additional columns)
le = LabelEncoder()
X_train["application_type"] = le.fit_transform(X_train["application_type"])
print("Label encoder for application_type: ", le.classes_)
X_train["debt_settlement_flag"] = le.fit_transform(X_train["debt_settlement_flag"])
print("Label encoder for deb_settlement_flag: ", le.classes_)
X_train["hardship_flag"] = le.fit_transform(X_train["hardship_flag"])
print("Label encoder for hardship_flag: ", le.classes_)
X_train["initial_list_status"] = le.fit_transform(X_train["initial_list_status"])
print("Label encoder for initial_list_status: ", le.classes_)

Label encoder for application_type:  ['Individual' 'Joint App']
Label encoder for deb_settlement_flag:  ['N' 'Y']
Label encoder for hardship_flag:  ['N' 'Y']
Label encoder for initial_list_status:  ['f' 'w']


In [12]:
# Display the features with binary data after label encoding
X_train[["application_type", "debt_settlement_flag","hardship_flag","initial_list_status"]].head()

Unnamed: 0,application_type,debt_settlement_flag,hardship_flag,initial_list_status
57107,0,0,0,1
141451,0,0,0,1
321143,0,0,0,1
11778,0,0,0,1
169382,0,0,0,1


In [13]:
# Use pd.get_dummies for all other categorical features (verification_status, home_ownership)
X_train = pd.get_dummies(X_train, drop_first=True)
print(X_train.columns)
X_train.head()

Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'application_type', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal',
       'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m',
       'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',
       'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',
       'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal',
       'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt',
       'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op',
       'mo_sin_rcnt_tl', 'mort_acc', 'm

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified
57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,39728.0,...,122018.0,32000.0,170200.0,0,0,1,0,0,0,0
141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,9585.0,...,27896.0,15900.0,35398.0,0,0,1,0,0,1,0
321143,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,16708.0,...,114043.0,22600.0,90340.0,0,0,1,0,0,1,0
11778,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,8809.0,...,20761.0,19900.0,15406.0,0,0,0,0,1,0,0
169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,65420.0,...,109056.0,79500.0,58778.0,0,0,1,0,0,1,0


In [14]:
# Create the y-label (train data)
# Convert output labels to 0 and 1 with label encoder
y_train = le.fit_transform(train_df['loan_status'])
print("Label Encoder classes (train): ", le.classes_)
print("y_train (train): ", y_train)

Label Encoder classes (train):  ['high_risk' 'low_risk']
y_train (train):  [1 1 1 ... 0 0 0]


# Convert categorical data to numeric and separate target feature for TESTING data

In [15]:
# Create the X feature vector (test data)
X_test = test_df.drop(columns=['loan_status', 'index'])

# Use Label Encoder to preprocess binary text features
le = LabelEncoder()
X_test["application_type"] = le.fit_transform(X_test["application_type"])
print("Label encoder for application_type: ", le.classes_)
X_test["debt_settlement_flag"] = le.fit_transform(X_test["debt_settlement_flag"])
print("Label encoder for deb_settlement_flag: ", le.classes_)
X_test["hardship_flag"] = le.fit_transform(X_test["hardship_flag"])
print("Label encoder for hardship_flag: ", le.classes_)
X_test["initial_list_status"] = le.fit_transform(X_test["initial_list_status"])
print("Label encoder for initial_list_status: ", le.classes_)
X_test["pymnt_plan"] = le.fit_transform(X_test["pymnt_plan"])
print("Label encoder for pymnt_plan: ", le.classes_)

# Drop pymnt_plan since it only has one value, same as training data
X_test = X_test.drop('pymnt_plan', axis=1)

# Use pd.dummies for all other categorical features
X_test = pd.get_dummies(X_test, drop_first=True)
print(X_test.columns)
X_test.head()

Label encoder for application_type:  ['Individual' 'Joint App']
Label encoder for deb_settlement_flag:  ['N']
Label encoder for hardship_flag:  ['N' 'Y']
Label encoder for initial_list_status:  ['f' 'w']
Label encoder for pymnt_plan:  ['n']
Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'application_type', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal',
       'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m',
       'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',
       'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',
       'total_cu_tl', 'inq_last_

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified
67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,70914.0,74600.0,99475.0,0,0,1,0,0,0,0
25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,23460.0,5900.0,23628.0,0,0,0,0,1,0,0
38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,19183.0,7300.0,15000.0,0,0,0,0,1,0,0
19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,43817.0,13800.0,35981.0,0,0,0,0,1,0,0
37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,32448.0,21000.0,24977.0,0,0,0,0,1,0,0


In [16]:
# Create the y-label (test data)
# Convert output labels to 0 and 1
y_test = le.fit_transform(test_df['loan_status'])
print("Label Encoder classes (test): ", le.classes_)
print("y_test (test): ", y_test)

Label Encoder classes (test):  ['high_risk' 'low_risk']
y_test (test):  [1 1 1 ... 0 0 0]


In [17]:
# add missing dummy variables to testing set
## NOTE: Shapes for (X_train, y_train) are currently compatible with (X_test, y_test) data frames
##  The column names of the binary text features are all accounted for.
print("X_train shape", X_train.shape)
print("y_train shape", y_train.shape)
print("X_test shape", X_test.shape)
print("y_test shape", y_test.shape)

X_train shape (12180, 85)
y_train shape (12180,)
X_test shape (4702, 85)
y_test shape (4702,)


# Create, train, score LogisticRegression model on unscaled data

In [18]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [19]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.649671592775041
Testing Data Score: 0.5159506592939175


# Create, train, score RandomForestClassifier model on unscaled data

In [17]:
# Train a Random Forest Classifier model and print the model score

# Scale the data with StandardScaler

In [20]:
# Scaling the X data by using StandardScaler()
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled


array([[-0.39311205,  0.73658452, -0.08760946, ..., -0.82658963,
        -0.81775408, -0.44297702],
       [ 0.35168119, -0.19171582, -0.10342722, ..., -0.82658963,
         1.22286152, -0.44297702],
       [ 0.25400339, -0.32080462, -0.20434179, ..., -0.82658963,
         1.22286152, -0.44297702],
       ...,
       [-1.34791257,  0.85997823, -1.28263075, ...,  1.20979016,
        -0.81775408, -0.44297702],
       [-0.23438563, -1.00231755, -0.11361032, ..., -0.82658963,
         1.22286152, -0.44297702],
       [-0.23438563,  0.69292214,  0.10586953, ...,  1.20979016,
        -0.81775408, -0.44297702]])

In [21]:
# Transforming the test dataset based on the fit from the training dataset
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[ 2.20755943, -1.12001617,  1.0371484 , ..., -0.82658963,
        -0.81775408, -0.44297702],
       [-1.11348584,  0.21833096, -1.01983876, ...,  1.20979016,
        -0.81775408, -0.44297702],
       [-1.34791257,  0.54295132, -1.2928478 , ...,  1.20979016,
        -0.81775408, -0.44297702],
       ...,
       [-0.72277464,  1.7009538 , -0.41340093, ...,  1.20979016,
        -0.81775408,  2.25745345],
       [-0.91813024,  0.85997823, -1.02947877, ...,  1.20979016,
         1.22286152, -0.44297702],
       [ 1.23078141,  1.22636262,  2.08478621, ...,  1.20979016,
         1.22286152, -0.44297702]])

In [22]:
# Train the Logistic Regression model on the scaled data and print the model score
scaled_classifier = LogisticRegression()
scaled_classifier.fit(X_train_scaled, y_train)
print(f"Scaled Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Scaled Testing Data Score: {classifier.score(X_test_scaled, y_test)}")


Scaled Training Data Score: 0.6274220032840723
Scaled Testing Data Score: 0.49893662271373884


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Train a Random Forest Classifier model on the scaled data and print the model score