In [32]:
import numpy as np
import pandas as pd
from pathlib import Path


In [33]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))


In [71]:
#Prediction: 
#I believe that Logistic Regression will perform better than Random Forest. Even though the decision tree
#model is made for sectors like loan risk, having the training data in 2019 and the testing 
#data fir the first quarter in 2020 will strongly test boths model's predictive capacities.
#Random Forest does not do as well with numerical features outside the realm of the 
#training data. 2019 was a relatively stable year financially, and there is a lot more data to 
#mitigate and variance, so there will be fewer outliers. However, we have fewer samples in for the
#first quarter of 2020 (12181 samples for 2019 and 4703), and the last few weeks of the quarter
#ends will the lockdown, so I predict there will be quite a few outliers, and that economic 
#turbulence due to the pandemic will have an impact on the data and will hinder 
#Random Forest's ability to predict loan risk in 2020.  

In [34]:
# Convert categorical data to numeric and separate target feature for training data

X_train_df = train_df.drop(columns=["loan_status"])
y_train_df = train_df["loan_status"]

X_train_df = pd.get_dummies(X_train_df, dummy_na=True)

y_train_df

0         low_risk
1         low_risk
2         low_risk
3         low_risk
4         low_risk
           ...    
12175    high_risk
12176    high_risk
12177    high_risk
12178    high_risk
12179    high_risk
Name: loan_status, Length: 12180, dtype: object

In [35]:
X_train_df.columns

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       ...
       'initial_list_status_nan', 'application_type_Individual',
       'application_type_Joint App', 'application_type_nan', 'hardship_flag_N',
       'hardship_flag_Y', 'hardship_flag_nan', 'debt_settlement_flag_N',
       'debt_settlement_flag_Y', 'debt_settlement_flag_nan'],
      dtype='object', length=101)

In [36]:
# Convert categorical data to numeric and separate target feature for testing data


X_test_df = test_df.drop(columns=["loan_status"])
y_test_df = test_df["loan_status"]

X_test_df = pd.get_dummies(test_df, dummy_na=True)
X_test_df.head()


Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,initial_list_status_w,initial_list_status_nan,application_type_Individual,application_type_Joint App,application_type_nan,hardship_flag_N,hardship_flag_Y,hardship_flag_nan,debt_settlement_flag_N,debt_settlement_flag_nan
0,67991,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,...,1,0,1,0,0,1,0,0,1,0
1,25429,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,...,1,0,1,0,0,1,0,0,1,0
2,38496,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,...,1,0,1,0,0,1,0,0,1,0
3,19667,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,...,1,0,1,0,0,1,0,0,1,0
4,37505,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,...,1,0,1,0,0,1,0,0,1,0


In [37]:
#Make sure the test and train columns match.

len (X_train_df.columns), len(X_test_df.columns)

(101, 103)

In [38]:
# Add missing dummy variables to testing set

for col in X_test_df.columns:
    if col not in X_train_df.columns:
         print(col)



loan_status_high_risk
loan_status_low_risk
loan_status_nan


In [39]:
#Find any columns in the training set that need to be dropped.

for col in X_train_df.columns:
    if col not in X_test_df.columns:
         print(col)

debt_settlement_flag_Y


In [40]:
# Add missing dummy variables to testing set and drop the extra column

X_train_df["loan_status_high_risk"] = 0
X_train_df["loan_status_low_risk"] = 0
X_train_df["loan_status_nan"] = 0

X_train_df = X_train_df.drop(columns=['debt_settlement_flag_Y'])


In [41]:
len (X_train_df.columns), len(X_test_df.columns)

(103, 103)

In [54]:
X_train_df.info()
X_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12180 entries, 0 to 12179
Columns: 103 entries, Unnamed: 0 to loan_status_nan
dtypes: float64(76), int64(5), uint8(22)
memory usage: 7.8 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Columns: 103 entries, Unnamed: 0 to debt_settlement_flag_nan
dtypes: float64(76), int64(2), uint8(25)
memory usage: 2.9 MB


In [59]:
# Train the Logistic Regression model on the unscaled data and print the model score

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model
model.fit(X_train_df, y_train_df)
model.score(X_test_df, y_test_df)

print(f'Model Score (Logistic Regression - unscaled): {model.score(X_test_df, y_test_df)}')

Model Score (Logistic Regression - unscaled): 0.5253083794130158


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [74]:
# Train a Random Forest Classifier model and print the model score

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=1, n_estimators=100).fit(X_train_df, y_train_df)



print(f'Training Score (unscaled): {clf.score(X_train_df, y_train_df)}')
print(f'Testing Score (unscaled): {clf.score(X_test_df, y_test_df)}')

Training Score (unscaled): 1.0
Testing Score (unscaled): 0.6142067205444491


In [None]:
#The training score for Random Forest is perfect. However, the testing score is in the realm
#of poor discrimination. I think this coincides with my guess that Random Forest's predictive ability
#would be compromised by testing data that goes beyond its parameters. However, even with that
#considerable drop in accuracy, it still did better than the Logistic Regression, which performed abysmally.

In [None]:
#Prediction: I predict that both the Logistic Regression and Random Forest models will fare
#better with scaling on training (2019), but will fare worse on testing (2020), because I predict there
#will be a lot of outliers in at the end of Q1 in 2020.

In [56]:
# Scale the data


from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train_df)



X_train_scaled = X_scaler.transform(X_train_df)
X_test_scaled = X_scaler.transform(X_test_df)


In [60]:
# Train the Logistic Regression model on the scaled data and print the model score

model_scaled = LogisticRegression()
model_scaled.fit(X_train_scaled, y_train_df)

model_scaled.score(X_train_scaled, y_train_df)
# model_scaled.score(X_test_scaled, y_test_df)

print(f'Model Score (Logistic Regression - scaled): {model_scaled.score(X_test_scaled, y_test_df)}')

Model Score (Logistic Regression - scaled): 0.7335176520629519


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#The scaled Logistic Regression model fared much batter than the unscaled and inched into the 
#realm of acceptible discrimination, which, considering the two models (2019 vs. 2020), is actually pretty good.
#In fact, this is the best testing score out of all the scenarios.

In [75]:
# Train a Random Forest Classifier model on the scaled data and print the model score

clf_scaled = RandomForestClassifier(random_state=1, n_estimators=100).fit(X_train_scaled, y_train_df)

print(f'Training Score (scaled): {clf.score(X_train_scaled, y_train_df)}')
print(f'Testing Score (scaled): {clf.score(X_test_scaled, y_test_df)}')

Training Score (scaled): 0.5
Testing Score (scaled): 0.5


In [None]:
#The scaled random forest model gave the worst possible outcome of 0.5, 
#which is absolutely indicriminate. 
#Because the training score was just as bad as the testing score, I believe that this result
#was due to the scaling and not a much as reflection of the data.

In [None]:
#It appears I was wrong, as the Random Forest model would fare worse when scaled, but
#my prediction that the Logistic Regression would do better when scaled was correct.