In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Create Data Frames for the 2 Files 2019 for Training & 2020 for Testing
train_df = pd.read_csv(Path('2019loans.csv'))
test_df = pd.read_csv(Path('2020Q1loans.csv'))

Check Columns of the DataFrames

In [5]:
train_columns = train_df.columns
test_columns = test_df.columns
train_types = train_df.dtypes
test_types = test_df.dtypes

In [6]:
compare_df = list(zip(train_columns, train_types, test_columns, test_types))
compare_df

[('loan_amnt', dtype('float64'), 'loan_amnt', dtype('float64')),
 ('int_rate', dtype('float64'), 'int_rate', dtype('float64')),
 ('installment', dtype('float64'), 'installment', dtype('float64')),
 ('home_ownership', dtype('O'), 'home_ownership', dtype('O')),
 ('annual_inc', dtype('float64'), 'annual_inc', dtype('float64')),
 ('verification_status', dtype('O'), 'verification_status', dtype('O')),
 ('pymnt_plan', dtype('O'), 'pymnt_plan', dtype('O')),
 ('dti', dtype('float64'), 'dti', dtype('float64')),
 ('delinq_2yrs', dtype('float64'), 'delinq_2yrs', dtype('float64')),
 ('inq_last_6mths', dtype('float64'), 'inq_last_6mths', dtype('float64')),
 ('open_acc', dtype('float64'), 'open_acc', dtype('float64')),
 ('pub_rec', dtype('float64'), 'pub_rec', dtype('float64')),
 ('revol_bal', dtype('float64'), 'revol_bal', dtype('float64')),
 ('total_acc', dtype('float64'), 'total_acc', dtype('float64')),
 ('initial_list_status', dtype('O'), 'initial_list_status', dtype('O')),
 ('out_prncp', dtype(

Convert the Categorical Data to Numeric & Seperate the Training Target Feature

In [7]:
train_converted_df = pd.get_dummies(train_df)
train_converted_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y,target_high_risk,target_low_risk
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,1,0,1,0,1,0,1,0,0,1
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0,1,1,0,1,0,1,0,0,1
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,0,1,1,0,1,0,1,0,0,1
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,0,1,0,1,1,0,1,0,0,1
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,0,1,1,0,1,0,1,0,0,1


In [8]:
train_converted_columns = train_converted_df.columns

Convert the Categorical Data to Numeric & Seperate the Testing Target Feature

In [9]:
test_converted_df = pd.get_dummies(test_df)
test_converted_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,target_high_risk,target_low_risk
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,1,0,0,1,1,0,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,1,0,1,1,0,1,0,1,0,1
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,1,0,1,1,0,0,1,1,0,1
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,1,0,1,1,0,1,0,1,0,1
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1,0,1,1,0,1,0,1,0,1


In [10]:
test_converted_columns = test_converted_df.columns

In [11]:
compare_converted_df = list(zip(train_converted_columns, test_converted_columns))
compare_converted_df

[('loan_amnt', 'loan_amnt'),
 ('int_rate', 'int_rate'),
 ('installment', 'installment'),
 ('annual_inc', 'annual_inc'),
 ('dti', 'dti'),
 ('delinq_2yrs', 'delinq_2yrs'),
 ('inq_last_6mths', 'inq_last_6mths'),
 ('open_acc', 'open_acc'),
 ('pub_rec', 'pub_rec'),
 ('revol_bal', 'revol_bal'),
 ('total_acc', 'total_acc'),
 ('out_prncp', 'out_prncp'),
 ('out_prncp_inv', 'out_prncp_inv'),
 ('total_pymnt', 'total_pymnt'),
 ('total_pymnt_inv', 'total_pymnt_inv'),
 ('total_rec_prncp', 'total_rec_prncp'),
 ('total_rec_int', 'total_rec_int'),
 ('total_rec_late_fee', 'total_rec_late_fee'),
 ('recoveries', 'recoveries'),
 ('collection_recovery_fee', 'collection_recovery_fee'),
 ('last_pymnt_amnt', 'last_pymnt_amnt'),
 ('collections_12_mths_ex_med', 'collections_12_mths_ex_med'),
 ('policy_code', 'policy_code'),
 ('acc_now_delinq', 'acc_now_delinq'),
 ('tot_coll_amt', 'tot_coll_amt'),
 ('tot_cur_bal', 'tot_cur_bal'),
 ('open_acc_6m', 'open_acc_6m'),
 ('open_act_il', 'open_act_il'),
 ('open_il_12m', '

Fill in the Missing Categories in the Testing Set

In [12]:
test_converted_df['debt_settlement_flag_Y'] = np.where(test_converted_df['debt_settlement_flag_N'] == 1, 0, 1)
test_converted_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,target_high_risk,target_low_risk,debt_settlement_flag_Y
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,0,1,1,0,0,1,1,0,1,0
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,1,1,0,1,0,1,0,1,0
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,1,1,0,0,1,1,0,1,0
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,0,1,1,0,1,0,1,0,1,0
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,0,1,1,0,1,0,1,0,1,0


## Considering the Models

Prediction:  I feel that a Random Forest Classifier will perform better than a Logostic Regression Model based on the fact that a Random Forest Classifier can capture more complex feature patterns to provide the best accuracy while the Logistic Regression does not give us a discrete output but the probability associated with each output.

Create a Logistic Regression Model (Unscaled Data) & Print the Model Score

In [13]:
train_converted_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y,target_high_risk,target_low_risk
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,1,0,1,0,1,0,1,0,0,1
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0,1,1,0,1,0,1,0,0,1
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,0,1,1,0,1,0,1,0,0,1
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,0,1,0,1,1,0,1,0,0,1
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,0,1,1,0,1,0,1,0,0,1


In [14]:
# Assign the Train data to X and y
# Reshape the data to create a two-dimensional array of values for Sklearn
X = train_converted_df.drop(['target_high_risk', 'target_low_risk'], axis=1)
y = train_converted_df['target_high_risk']   # outcome variable from training data

print("Shape: ", X.shape, y.shape)

Shape:  (12180, 92) (12180,)


In [15]:
# Assign the Test data to X and y
X_2 = test_converted_df.drop(['target_high_risk', 'target_low_risk'], axis=1)
y_2 = test_converted_df['target_high_risk'] 

print("Shape: ", X_2.shape, y_2.shape)

Shape:  (4702, 92) (4702,)


Splitting the data into Test - Train

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [17]:
classifier= LogisticRegression(max_iter=20000)
classifier

LogisticRegression(max_iter=20000)

In [18]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=20000)

In [19]:
print(f"TRAINING SCORE: {classifier.score(X_train, y_train)}")
print(f"TESTING SCORE(from training set): {classifier.score(X_test, y_test)}")
print(f'1Q 2020 FIT: {classifier.score(X_2, y_2)}')

TRAINING SCORE: 0.7111111111111111
TESTING SCORE(from training set): 0.6952380952380952
1Q 2020 FIT: 0.5591237771161208


In [20]:
# Set the variables for Table Creation
train_logistic = classifier.score(X_train, y_train)
test_logistic = classifier.score(X_test, y_test)
Q1_2020_logistic = classifier.score(X_2, y_2)

Create a Random Forest Classifier Model (Unscaled Data) & Print the Model Score

In [21]:
rfc = RandomForestClassifier(random_state=7, n_estimators=500).fit(X_train, y_train)
print(f"TRAINING SCORE: {rfc.score(X_train, y_train)}")
print(f"TESTING SCORE(from training set): {rfc.score(X_test, y_test)}")
print(f'1Q 2020 FIT: {rfc.score(X_2, y_2)}')

TRAINING SCORE: 1.0
TESTING SCORE(from training set): 0.7878489326765189
1Q 2020 FIT: 0.6461080391322841


In [22]:
# Set the variables for Table Creation
train_rfc = rfc.score(X_train, y_train)
test_rfc = rfc.score(X_test, y_test)
Q1_2020_rfc = rfc.score(X_2, y_2)

## Outcome

The Random Forest Classifier did out perform the Logistic Regression Model with a better accuracy rate.  This outcome was in line with my prediction

Revisit the Preprocessing & Scale the Data

In [23]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_2_scaled=scaler.transform(X_2)

Create a Logistic Regression Model (Scaled Data) & Print the Model Score

## Scaling Prediction

Scaling the data will normalize all features so that each feature contributes approximately proportionately to the final distance.  Because our data will now be more evenly distributed, I predict that the Scaled data will perform better than our unscaled data.

In [24]:
classifier.fit(X_train_scaled, y_train)
print(f"TRAINING SCALED SCORE: {classifier.score(X_train_scaled, y_train)}")
print(f"TESTING SCALED SCORE(from training set): {classifier.score(X_test_scaled, y_test)}")
print(f'1Q 2020 SCALED FIT: {classifier.score(X_2_scaled, y_2)}')

TRAINING SCALED SCORE: 0.7139573070607553
TESTING SCALED SCORE(from training set): 0.7004926108374384
1Q 2020 SCALED FIT: 0.7539344959591663


In [25]:
# Set the variables for Table Creation
train_logistic_scaled = classifier.score(X_train_scaled, y_train)
test_logistic_scaled = classifier.score(X_test_scaled, y_test)
Q1_2020_logistic_scaled = classifier.score(X_2_scaled, y_2)

Create a Random Forest Classifier Model (Scaled Data) & Print the Model Score

In [26]:
rfc = RandomForestClassifier(random_state=7, n_estimators=500).fit(X_train_scaled, y_train)
print(f"TRAINING SCALED SCORE: {rfc.score(X_train_scaled, y_train)}")
print(f"TESTING SCALED SCORE(from training set): {rfc.score(X_test_scaled, y_test)}")
print(f'1Q 2020 SCALED FIT: {rfc.score(X_2_scaled, y_2)}')

TRAINING SCALED SCORE: 1.0
TESTING SCALED SCORE(from training set): 0.7878489326765189
1Q 2020 SCALED FIT: 0.6452573373032752


In [27]:
train_rfc_scaled = rfc.score(X_train_scaled, y_train)
test_rfc_scaled = rfc.score(X_test_scaled, y_test)
Q1_2020_rfc_scaled = rfc.score(X_2_scaled, y_2)

In [28]:
# Create a Data Frame that holds the Results
results = {'Data Set': ['2019 Training', '2019 Test', 'Q1 2020'], 'LR Unscaled': [train_logistic, test_logistic, Q1_2020_logistic],
    'RFC Unscaled': [train_rfc, test_rfc, Q1_2020_rfc], 'LR Scaled': [train_logistic_scaled, test_logistic_scaled, Q1_2020_logistic_scaled],
    'RFC Scaled': [train_rfc_scaled, test_rfc_scaled, Q1_2020_rfc_scaled]}
results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,Data Set,LR Unscaled,RFC Unscaled,LR Scaled,RFC Scaled
0,2019 Training,0.711111,1.0,0.713957,1.0
1,2019 Test,0.695238,0.787849,0.700493,0.787849
2,Q1 2020,0.559124,0.646108,0.753934,0.645257


## Conclusion: 

It appears that overall the Random Forest Classifier gave us better accuracy over the Logistic Regression Model for both the Scaled and Unscaled data.  However, scaling the data did not improve the scoring and remained consistant expect for the Q1 2020 data in which the Logostic Regression Model was at 0.55 for unscaled and jumped to 0.75 for scaled.