In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [2]:
train_df = pd.read_csv(Path('Resources_CreditRiskEvaluator/2019loans.csv'))
test_df = pd.read_csv(Path('Resources_CreditRiskEvaluator/2020Q1loans.csv'))
train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,7000.0,0.1894,256.38,MORTGAGE,75000.0,Not Verified,n,28.62,0.0,2.0,...,87.5,0.0,0.0,352260.0,62666.0,35000.0,10000.0,N,N,low_risk
1,40000.0,0.1614,975.71,MORTGAGE,102000.0,Source Verified,n,11.72,2.0,0.0,...,0.0,0.0,0.0,294664.0,109911.0,9000.0,71044.0,N,N,low_risk
2,11000.0,0.2055,294.81,RENT,45000.0,Verified,n,37.25,1.0,3.0,...,7.7,0.0,0.0,92228.0,36007.0,33000.0,46328.0,N,N,low_risk
3,4000.0,0.1612,140.87,MORTGAGE,38000.0,Not Verified,n,42.89,1.0,0.0,...,100.0,0.0,0.0,284273.0,52236.0,13500.0,52017.0,N,N,low_risk
4,14000.0,0.1797,505.93,MORTGAGE,43000.0,Source Verified,n,22.16,1.0,0.0,...,25.0,0.0,0.0,120280.0,88147.0,33300.0,78680.0,N,N,low_risk


In [3]:
# Separate the target column (y) from the predictive features (X) in both our train and test dfs
X_train = train_df.drop("target", axis=1).copy()
X_test = test_df.drop("target", axis=1).copy()
y_train = train_df["target"].copy()
y_test = test_df["target"].copy()

In [4]:
# Convert categorical data to numeric and separate target feature for training data
X_train = pd.get_dummies(X_train)
X_train.head()
X_test = pd.get_dummies(X_test)
X_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,0,1,1,0,0,1,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,0,1,0,1,1,0,1,0,1
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,0,1,0,1,1,0,0,1,1
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,0,1,1,0,1,1,0,1,0,1
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1,0,1,0,1,1,0,1,0,1


In [5]:
# First, identify any that are in one df and not the other.
cols_to_add = set(X_train.columns) ^ set(X_test.columns)

# Loop through each missing column and add it to X_test with all 0s
for col in cols_to_add:
    X_test[col] = 0

# Set the order of the columns in X_test to match the order of columns in X_train
X_test = X_test[X_train.columns]

# Encode the target column with 1s and 0s (both train and test)
target_encoder = LabelEncoder().fit(y_train)
y_train = target_encoder.transform(y_train)
y_test = target_encoder.transform(y_test)

In [6]:
# Check the shapes to make sure X_train and X_test have the same number of columns (same with y_train/y_test)
# Check that X_train and y_train have the same number of rows (same with X_test/y_test)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(12180, 92)
(4702, 92)
(12180,)
(4702,)


In [7]:
# Train the Logistic Regression model on the unscaled data and print the model train and test score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

lr = LogisticRegression()
lr.fit(X_train, y_train)
print(lr.score(X_test, y_test))
print(lr.score(X_train, y_train))


0.5108464483198639
0.6522988505747126


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Train a Random Forest Classifier model and print the model train and test score
rf = RandomForestClassifier(n_estimators=350, max_depth=3)
rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))
print(rf.score(X_train, y_train))


0.5935772011909826
0.7294745484400657


In [9]:
# Create a scaler based on the X_train data
scaler = StandardScaler().fit(X_train)

# Use the scaler on X_train and X_test
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Train the Logistic Regression model on the scaled data and print the model train/test score
lr.fit(X_train_scaled, y_train)
print(lr.score(X_test_scaled, y_test))
print(lr.score(X_train_scaled, y_train))

0.7598894087622289
0.710919540229885


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# Train a Random Forest Classifier model on the scaled data and print the model train/test score
rf.fit(X_train_scaled, y_train)
print(rf.score(X_test_scaled, y_test))
print(rf.score(X_train_scaled, y_train))

rf.feature_importances_

0.5842194810718843
0.7275862068965517


array([6.73547672e-03, 1.02535668e-01, 1.49683997e-02, 2.91185657e-04,
       4.59899324e-04, 5.43788098e-04, 2.48343922e-03, 5.42481099e-04,
       0.00000000e+00, 2.69879118e-03, 2.56760529e-04, 2.55245860e-02,
       3.23192140e-02, 1.13467889e-01, 1.13226531e-01, 7.56332776e-02,
       1.19424538e-01, 6.49313190e-02, 0.00000000e+00, 0.00000000e+00,
       2.11394623e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       2.11578517e-04, 1.00148522e-03, 2.40724052e-03, 1.45661788e-04,
       3.19697536e-04, 3.35415361e-04, 3.63141928e-04, 1.98817656e-04,
       2.55702430e-03, 1.84169384e-03, 5.65611325e-03, 4.37445786e-03,
       2.74827658e-03, 7.04484184e-03, 1.38532789e-03, 5.57653578e-05,
       3.44540762e-03, 8.12482864e-03, 9.66841077e-04, 9.68091690e-03,
       4.56759453e-04, 8.26186492e-06, 0.00000000e+00, 6.06384888e-04,
       5.85080342e-03, 1.93634798e-03, 2.20984706e-03, 1.08412045e-03,
       1.79466417e-03, 4.36707615e-03, 4.43477873e-04, 8.07689929e-04,
      

In [None]:
# Next steps: create more models, try fewer columns, try PCA, see if any columns have additional data that can be split, add columns....
# Try to reduce overfit, increase accuracy (or another metric)