In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,25200.0,0.1102,548.17,RENT,65000.0,Not Verified,n,42.67,0.0,1.0,...,10.0,0.0,0.0,282008.0,93765.0,57300.0,116320.0,N,N,low_risk
1,14000.0,0.2055,375.22,MORTGAGE,80000.0,Source Verified,n,15.47,0.0,0.0,...,75.0,0.0,0.0,434976.0,137629.0,17800.0,95032.0,N,N,low_risk
2,30000.0,0.1171,992.28,MORTGAGE,200000.0,Not Verified,n,14.14,0.0,0.0,...,100.0,0.0,0.0,99849.0,68769.0,13500.0,86349.0,N,N,low_risk
3,12000.0,0.1033,256.92,MORTGAGE,50000.0,Not Verified,n,21.41,0.0,0.0,...,33.3,0.0,0.0,209700.0,44654.0,13000.0,39700.0,N,N,low_risk
4,10625.0,0.1612,259.06,OWN,29000.0,Not Verified,n,25.87,0.0,0.0,...,0.0,0.0,0.0,35300.0,11893.0,18800.0,8000.0,N,N,low_risk


In [4]:
test_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,12000.0,0.2055,449.34,RENT,70000.0,Not Verified,n,28.56,0.0,1.0,...,60.0,1.0,0.0,68828.0,50387.0,21200.0,42628.0,N,N,low_risk
1,36000.0,0.0819,733.23,MORTGAGE,200000.0,Source Verified,n,11.38,0.0,0.0,...,28.6,0.0,0.0,380705.0,63917.0,64100.0,48417.0,N,N,low_risk
2,37225.0,0.1308,848.51,MORTGAGE,122700.0,Not Verified,n,16.83,0.0,0.0,...,100.0,0.0,0.0,101675.0,68475.0,34600.0,55477.0,N,N,low_risk
3,12000.0,0.1102,392.98,MORTGAGE,64500.0,Source Verified,n,36.63,0.0,2.0,...,14.3,0.0,0.0,234193.0,43389.0,42000.0,42279.0,N,N,low_risk
4,25000.0,0.1774,631.31,RENT,50000.0,Not Verified,n,33.82,0.0,0.0,...,10.0,0.0,0.0,146810.0,75919.0,83400.0,63410.0,N,N,low_risk


# Convert categorical data to numeric and separate target feature for TRAINING data

In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [6]:
# Create the X feature vector (train data)
X_train = train_df.drop('target', axis=1)
X_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,25200.0,0.1102,548.17,RENT,65000.0,Not Verified,n,42.67,0.0,1.0,...,100.0,10.0,0.0,0.0,282008.0,93765.0,57300.0,116320.0,N,N
1,14000.0,0.2055,375.22,MORTGAGE,80000.0,Source Verified,n,15.47,0.0,0.0,...,100.0,75.0,0.0,0.0,434976.0,137629.0,17800.0,95032.0,N,N
2,30000.0,0.1171,992.28,MORTGAGE,200000.0,Not Verified,n,14.14,0.0,0.0,...,100.0,100.0,0.0,0.0,99849.0,68769.0,13500.0,86349.0,N,N
3,12000.0,0.1033,256.92,MORTGAGE,50000.0,Not Verified,n,21.41,0.0,0.0,...,96.0,33.3,0.0,0.0,209700.0,44654.0,13000.0,39700.0,N,N
4,10625.0,0.1612,259.06,OWN,29000.0,Not Verified,n,25.87,0.0,0.0,...,100.0,0.0,0.0,0.0,35300.0,11893.0,18800.0,8000.0,N,N


In [7]:
# Display the features with binary data for use with label encoder
X_train[["application_type", "debt_settlement_flag","hardship_flag","initial_list_status","pymnt_plan"]].head()

Unnamed: 0,application_type,debt_settlement_flag,hardship_flag,initial_list_status,pymnt_plan
0,Joint App,N,N,w,n
1,Individual,N,N,w,n
2,Individual,N,N,w,n
3,Individual,N,N,w,n
4,Individual,N,N,w,n


In [8]:
# Use Label Encoder to preprocess binary text features
le = LabelEncoder()
X_train["application_type"] = le.fit_transform(X_train["application_type"])
print("Label encoder for application_type: ", le.classes_)
X_train["debt_settlement_flag"] = le.fit_transform(X_train["debt_settlement_flag"])
print("Label encoder for deb_settlement_flag: ", le.classes_)
X_train["hardship_flag"] = le.fit_transform(X_train["hardship_flag"])
print("Label encoder for hardship_flag: ", le.classes_)
X_train["initial_list_status"] = le.fit_transform(X_train["initial_list_status"])
print("Label encoder for initial_list_status: ", le.classes_)
X_train["pymnt_plan"] = le.fit_transform(X_train["pymnt_plan"])
print("Label encoder for pymnt_plan: ", le.classes_)

Label encoder for application_type:  ['Individual' 'Joint App']
Label encoder for deb_settlement_flag:  ['N']
Label encoder for hardship_flag:  ['N' 'Y']
Label encoder for initial_list_status:  ['f' 'w']
Label encoder for pymnt_plan:  ['n']


In [9]:
# Display the features with binary data after label encoding
X_train[["application_type", "debt_settlement_flag","hardship_flag","initial_list_status","pymnt_plan"]].head()

Unnamed: 0,application_type,debt_settlement_flag,hardship_flag,initial_list_status,pymnt_plan
0,1,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0


In [10]:
# Use pd.get_dummies for all other categorical features
X_train = pd.get_dummies(X_train)
print(X_train.columns)
X_train.head()

Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'pymnt_plan',
       'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec',
       'revol_bal', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',


Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified
0,25200.0,0.1102,548.17,65000.0,0,42.67,0.0,1.0,20.0,0.0,...,116320.0,0,0,0,0,0,1,1,0,0
1,14000.0,0.2055,375.22,80000.0,0,15.47,0.0,0.0,19.0,0.0,...,95032.0,0,0,0,1,0,0,0,1,0
2,30000.0,0.1171,992.28,200000.0,0,14.14,0.0,0.0,7.0,0.0,...,86349.0,0,0,0,1,0,0,1,0,0
3,12000.0,0.1033,256.92,50000.0,0,21.41,0.0,0.0,14.0,0.0,...,39700.0,0,0,0,1,0,0,1,0,0
4,10625.0,0.1612,259.06,29000.0,0,25.87,0.0,0.0,10.0,0.0,...,8000.0,0,0,0,0,1,0,1,0,0


In [11]:
# Create the y-label (train data)
# Convert output labels to 0 and 1 with label encoder
y_train = le.fit_transform(train_df['target'])
print("Label Encoder classes (train): ", le.classes_)
print("y_train (train): ", y_train)

Label Encoder classes (train):  ['high_risk' 'low_risk']
y_train (train):  [1 1 1 ... 0 0 0]


# Convert categorical data to numeric and separate target feature for TESTING data

In [12]:
# Create the X feature vector (test data)
X_test = test_df.drop('target', axis=1)

# Use Label Encoder to preprocess binary text features
le = LabelEncoder()
X_test["application_type"] = le.fit_transform(X_test["application_type"])
print("Label encoder for application_type: ", le.classes_)
X_test["debt_settlement_flag"] = le.fit_transform(X_test["debt_settlement_flag"])
print("Label encoder for deb_settlement_flag: ", le.classes_)
X_test["hardship_flag"] = le.fit_transform(X_test["hardship_flag"])
print("Label encoder for hardship_flag: ", le.classes_)
X_test["initial_list_status"] = le.fit_transform(X_test["initial_list_status"])
print("Label encoder for initial_list_status: ", le.classes_)
X_test["pymnt_plan"] = le.fit_transform(X_test["pymnt_plan"])
print("Label encoder for pymnt_plan: ", le.classes_)

# Use pd.dummies for all other categorical features
X_test = pd.get_dummies(X_test)
print(X_test.columns)
X_test.head()

Label encoder for application_type:  ['Individual' 'Joint App']
Label encoder for deb_settlement_flag:  ['N']
Label encoder for hardship_flag:  ['N' 'Y']
Label encoder for initial_list_status:  ['f' 'w']
Label encoder for pymnt_plan:  ['n']
Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'pymnt_plan',
       'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec',
       'revol_bal', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_t

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified
0,12000.0,0.2055,449.34,70000.0,0,28.56,0.0,1.0,9.0,1.0,...,42628.0,0,0,0,0,0,1,1,0,0
1,36000.0,0.0819,733.23,200000.0,0,11.38,0.0,0.0,13.0,0.0,...,48417.0,0,0,0,1,0,0,0,1,0
2,37225.0,0.1308,848.51,122700.0,0,16.83,0.0,0.0,11.0,0.0,...,55477.0,0,0,0,1,0,0,1,0,0
3,12000.0,0.1102,392.98,64500.0,0,36.63,0.0,2.0,15.0,0.0,...,42279.0,0,0,0,1,0,0,0,1,0
4,25000.0,0.1774,631.31,50000.0,0,33.82,0.0,0.0,12.0,0.0,...,63410.0,0,0,0,0,0,1,1,0,0


In [13]:
# Create the y-label (test data)
# Convert output labels to 0 and 1
y_test = le.fit_transform(test_df['target'])
print("Label Encoder classes (test): ", le.classes_)
print("y_test (test): ", y_test)

Label Encoder classes (test):  ['high_risk' 'low_risk']
y_test (test):  [1 1 1 ... 0 0 0]


In [14]:
# add missing dummy variables to testing set
## NOTE: Shapes for (X_train, y_train) are currently compatible with (X_test, y_test) data frames
##  The column names of the binary text features are all accounted for.
print("X_train shape", X_train.shape)
print("y_train shape", y_train.shape)
print("X_test shape", X_test.shape)
print("y_test shape", y_test.shape)

X_train shape (12790, 88)
y_train shape (12790,)
X_test shape (8418, 88)
y_test shape (8418,)


In [15]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [16]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6174354964816263
Testing Data Score: 0.489783796626277


In [None]:
# Train a Random Forest Classifier model and print the model score

# Scale the data

In [17]:
# Scaling the X data by using StandardScaler()
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

# Alternatively, scaling the data by using MinMaxScaler()
# scaler = MinMaxScaler().fit(X_train)
# X_train_scaled = scaler.transform(X_train)
# X_train_scaled

array([[ 0.74487515, -0.58546189,  0.14954838, ...,  1.13166712,
        -0.8232965 , -0.43244301],
       [-0.34631343,  1.27012145, -0.44868931, ..., -0.88365208,
         1.21462923, -0.43244301],
       [ 1.21252741, -0.45111221,  1.68573419, ...,  1.13166712,
        -0.8232965 , -0.43244301],
       ...,
       [-0.73602364,  2.26314087, -0.35934279, ...,  1.13166712,
        -0.8232965 , -0.43244301],
       [-0.30734241,  0.05318589, -0.03692745, ...,  1.13166712,
        -0.8232965 , -0.43244301],
       [-0.24888588,  0.72298722,  0.12246424, ...,  1.13166712,
        -0.8232965 , -0.43244301]])

In [18]:
# Transforming the test dataset based on the fit from the training dataset
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[-0.54116854,  1.27012145, -0.19230667, ...,  1.13166712,
        -0.8232965 , -0.43244301],
       [ 1.79709272, -1.13649032,  0.7896748 , ..., -0.88365208,
         1.21462923, -0.43244301],
       [ 1.91644147, -0.18435993,  1.18843074, ...,  1.13166712,
        -0.8232965 , -0.43244301],
       ...,
       [ 1.21252741,  1.27012145,  2.13907349, ..., -0.88365208,
         1.21462923, -0.43244301],
       [ 0.72538964,  2.26314087,  1.72150047, ..., -0.88365208,
         1.21462923, -0.43244301],
       [ 0.04339678, -0.71981158, -0.41354565, ..., -0.88365208,
         1.21462923, -0.43244301]])

In [19]:
# Train the Logistic Regression model on the scaled data and print the model score
scaled_classifier = LogisticRegression()
scaled_classifier.fit(X_train_scaled, y_train)
print(f"Scaled Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Scaled Testing Data Score: {classifier.score(X_test_scaled, y_test)}")


Scaled Training Data Score: 0.6
Scaled Testing Data Score: 0.3995010691375624


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Train a Random Forest Classifier model on the scaled data and print the model score