In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Training data comes from the 2019 loans file; testing data from the 2020 Q1 loans file.
train_path = Path('Resources/2019loans.csv')
test_path = Path('Resources/2020Q1loans.csv')

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Preview the first five rows of the raw training data
train_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [4]:
# One-hot encode the categorical columns of the training data, then build the
# feature matrix by removing the encoded target columns and the two leftover
# row-identifier columns.
train_df = pd.get_dummies(train_df)

train_non_features = ['loan_status_high_risk', 'loan_status_low_risk',
                      'Unnamed: 0', 'index']
X_train = train_df.drop(columns=train_non_features)
X_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,39728.0,...,0,1,0,1,1,0,1,0,1,0
1,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,9585.0,...,0,1,0,1,1,0,1,0,1,0
2,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,16708.0,...,0,1,0,1,1,0,1,0,1,0
3,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,8809.0,...,0,1,0,1,1,0,1,0,1,0
4,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,65420.0,...,0,1,0,1,1,0,1,0,1,0


In [5]:
# Training target as a NumPy array: the one-hot 'high_risk' indicator column
y_train = train_df['loan_status_high_risk'].to_numpy()
y_train

array([0, 0, 0, ..., 1, 1, 1], dtype=uint8)

In [6]:
# One-hot encode the categorical columns of the testing data, then build the
# feature matrix by removing the encoded target columns and the two leftover
# row-identifier columns (same treatment as the training data).
test_df = pd.get_dummies(test_df)

test_non_features = ['loan_status_high_risk', 'loan_status_low_risk',
                     'Unnamed: 0', 'index']
X_test = test_df.drop(columns=test_non_features)
X_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,0,0,1,0,1,1,0,1,0,1
1,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,0,0,1,0,1,1,0,1,0,1
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,0,0,1,0,1,1,0,1,0,1
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,0,0,1,0,1,1,0,1,0,1
4,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,0,0,1,0,1,1,0,1,0,1


In [7]:
# Testing target as a NumPy array: the one-hot 'high_risk' indicator column
y_test = test_df['loan_status_high_risk'].to_numpy()
y_test

array([0, 0, 0, ..., 1, 1, 1], dtype=uint8)

In [8]:
# List the feature columns that exist in the training set but not in the testing set
# (dummy variables whose category never occurs in the testing data)
test_feature_names = set(X_test.columns)
[name for name in X_train.columns if name not in test_feature_names]

['debt_settlement_flag_Y']

In [9]:
# Align the testing features with the training features.
# reindex() adds every column that exists only in the training set (here
# 'debt_settlement_flag_Y') filled with 0, drops any column that exists only in
# the testing set, and puts the columns in exactly the order of X_train.
# This avoids hard-coding the missing column name and does not rely on the new
# column happening to land in the same position as in the training data.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

## Unscaled Data
### Prediction (Logistic Regression vs. Random Forest):
I predict that the random forest classifier will outperform the logistic regression because the data has not been scaled, and logistic regression is sensitive to feature scale. Without scaling, the model may have a hard time fitting the data, since the features with the largest variances are not necessarily the most important ones. I expect this to affect a regression-based model more than a classifier built on decision trees.

In [10]:
# Baseline: fit a logistic regression on the raw (unscaled) training features
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=1, max_iter=1000)
# fit() is left as the last expression so the fitted estimator is echoed as the cell output
classifier.fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=1000, random_state=1)

In [11]:
# Mean accuracy of the logistic regression on the unscaled training and testing sets
lr_train_score = classifier.score(X_train, y_train)
lr_test_score = classifier.score(X_test, y_test)

print(f"Training Data Score: {lr_train_score}")
print(f"Testing Data Score: {lr_test_score}")

Training Data Score: 0.6864532019704433
Testing Data Score: 0.5723096554657593


In [12]:
# Fit a 100-tree random forest on the same raw (unscaled) training features
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=1, n_estimators=100)
# fit() is left as the last expression so the fitted estimator is echoed as the cell output
clf.fit(X=X_train, y=y_train)

RandomForestClassifier(random_state=1)

In [13]:
# Mean accuracy of the random forest on the unscaled training and testing sets
rf_train_score = clf.score(X_train, y_train)
rf_test_score = clf.score(X_test, y_test)

print(f"Training Data Score: {rf_train_score}")
print(f"Testing Data Score: {rf_test_score}")

Training Data Score: 1.0
Testing Data Score: 0.6544023819651212


### Results:
The random forest classifier did perform somewhat better than the logistic regression model on the testing data, with testing scores of about 65% and 57%, respectively. Neither model did very well though, which is likely due to the fact that the data wasn't scaled. In fact, the logistic regression failed to converge even when the number of iterations was increased to 10,000 (the run shown above uses `max_iter=1000`). As for the random forest classifier, it seemed to pick up some predictive capabilities, but the gap between its training score (100%) and its testing score (65%) indicates that it was clearly overfitting.

## Scaled Data
### Prediction:
I expect both models will improve when trained on the scaled data but the logistic regression will improve significantly more based on the same logic as before. My prediction for the final performances of the models is that they will be similar.

In [14]:
# Standardize the features. The scaler learns its statistics from the training
# set only and the same fitted scaler is then applied to the testing set.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# Refit a fresh logistic regression, this time on the standardized training features.
# Note: this rebinds `classifier`, replacing the model fitted on the unscaled data.
classifier = LogisticRegression(random_state=1, max_iter=1000)
classifier.fit(X=X_train_scaled, y=y_train)

LogisticRegression(max_iter=1000, random_state=1)

In [16]:
# Mean accuracy of the logistic regression on the scaled training and testing sets
lr_scaled_train_score = classifier.score(X_train_scaled, y_train)
lr_scaled_test_score = classifier.score(X_test_scaled, y_test)

print(f"Training Data Score: {lr_scaled_train_score}")
print(f"Testing Data Score: {lr_scaled_test_score}")

Training Data Score: 0.7079638752052545
Testing Data Score: 0.7677584006805614


In [17]:
# Refit a fresh 100-tree random forest, this time on the standardized training features.
# Note: this rebinds `clf`, replacing the model fitted on the unscaled data.
clf = RandomForestClassifier(random_state=1, n_estimators=100)
clf.fit(X=X_train_scaled, y=y_train)

RandomForestClassifier(random_state=1)

In [18]:
# Mean accuracy of the random forest on the scaled training and testing sets
rf_scaled_train_score = clf.score(X_train_scaled, y_train)
rf_scaled_test_score = clf.score(X_test_scaled, y_test)

print(f"Training Data Score: {rf_scaled_train_score}")
print(f"Testing Data Score: {rf_scaled_test_score}")

Training Data Score: 1.0
Testing Data Score: 0.6552530837941302


### Results:
As expected, the logistic regression model improved significantly, going from 57% to about 77% accuracy on the testing data. The similar training and testing scores (about 71% and 77%) indicate that the model is picking up on relevant patterns in the features and has developed some decent predictive ability.

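What surprised me somewhat is the performance of the random forest classifier, which did not seem to improve at all (its testing score stayed at about 65%) and is still very much overfitting despite scaling the data. In hindsight this is reasonable: decision trees split on thresholds of individual features, so standardizing the features leaves the splits essentially unchanged. Of course, this was a rather quick exploration, and perhaps further preprocessing of the data combined with hyperparameter tuning could lead to different results for both models.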