In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

In [3]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,7000.0,0.1894,256.38,MORTGAGE,75000.0,Not Verified,n,28.62,0.0,2.0,...,87.5,0.0,0.0,352260.0,62666.0,35000.0,10000.0,N,N,low_risk
1,40000.0,0.1614,975.71,MORTGAGE,102000.0,Source Verified,n,11.72,2.0,0.0,...,0.0,0.0,0.0,294664.0,109911.0,9000.0,71044.0,N,N,low_risk
2,11000.0,0.2055,294.81,RENT,45000.0,Verified,n,37.25,1.0,3.0,...,7.7,0.0,0.0,92228.0,36007.0,33000.0,46328.0,N,N,low_risk
3,4000.0,0.1612,140.87,MORTGAGE,38000.0,Not Verified,n,42.89,1.0,0.0,...,100.0,0.0,0.0,284273.0,52236.0,13500.0,52017.0,N,N,low_risk
4,14000.0,0.1797,505.93,MORTGAGE,43000.0,Source Verified,n,22.16,1.0,0.0,...,25.0,0.0,0.0,120280.0,88147.0,33300.0,78680.0,N,N,low_risk


In [45]:
# Convert categorical data to numeric and separate target feature for training data
train_dummies = pd.get_dummies(train_df)


X_train = train_dummies.drop('target_high_risk', axis=1)
X_train = X_train.drop('target_low_risk', axis=1)
X_train.head()

y_train = train_dummies['target_high_risk']

In [46]:
# Convert categorical data to numeric and separate target feature for testing data
test_dummies = pd.get_dummies(test_df)


X_test = test_dummies.drop('target_high_risk', axis=1)
X_test = X_test.drop('target_low_risk', axis=1)
X_test.head()

y_test = test_dummies['target_high_risk']

In [48]:
# add missing dummy variables to testing set (add debt_settlement_flag_Y)
X_test['debt_settlement_flag_Y'] = 0


### I believe that random forest will do better because its better at handling categorical data.

In [51]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=10000)

lr.fit(X_train, y_train)

print(f"Training Data Score: {lr.score(X_train, y_train)}")
print(f"Testing Data Score: {lr.score(X_test, y_test)}")



Training Data Score: 0.7060755336617406
Testing Data Score: 0.56571671629094


In [54]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

rf.fit(X_train, y_train)

print(f"Training Data Score: {rf.score(X_train, y_train)}")
print(f"Testing Data Score: {rf.score(X_test, y_test)}")


Training Data Score: 0.9182087684729063
Testing Data Score: -0.0456957039557635


### The logistic regression ended up being much better than random forest, which overfit. I predicted the opposite. This may be due to random forest not preforming well becuase of the amount of dummy variables.

### I predict that random forest will perform better because it performs better on scaled data.

In [55]:
# Scale the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [57]:
# Train the Logistic Regression model on the scaled data and print the model score
lr = LogisticRegression(max_iter=10000)

lr.fit(X_train_scaled, y_train)

print(f"Training Data Score: {lr.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {lr.score(X_test_scaled, y_test)}")

Training Data Score: 0.710919540229885
Testing Data Score: 0.7601020842194811


In [56]:
# Train a Random Forest Classifier model on the scaled data and print the model score
rf = RandomForestRegressor(random_state=42)

rf.fit(X_train_scaled, y_train)

print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 0.9181929392446634
Testing Data Score: -0.0460071458953637


### Logistic regression performed much better on the scaled data (contrary to my prediction). Random forest overfit a lot as seen by the very good training score but poor testing score. The logistic regressesion must be good at not overfitting and handling a lot of dummy variables