In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [2]:
# 2019 is our training data and 2020 is our test data
df_2019_raw = pd.read_csv(Path('Resources/2019loans.csv'))
df_2020_raw = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
df_2019 = df_2019_raw.drop(['index', 'Unnamed: 0'], axis = 1)
df_2020 = df_2020_raw.drop(['index', 'Unnamed: 0'], axis = 1)

In [4]:
for index, col in enumerate(df_2019.columns):
    if df_2019.dtypes[index] == 'object':
        print(col, df_2019[col].nunique(), f'\n{df_2019[col].value_counts()}\n')

home_ownership 4 
MORTGAGE    5800
RENT        4944
OWN         1371
ANY           65
Name: home_ownership, dtype: int64

verification_status 3 
Not Verified       5301
Source Verified    4881
Verified           1998
Name: verification_status, dtype: int64

loan_status 2 
high_risk    6090
low_risk     6090
Name: loan_status, dtype: int64

pymnt_plan 1 
n    12180
Name: pymnt_plan, dtype: int64

initial_list_status 2 
w    11158
f     1022
Name: initial_list_status, dtype: int64

application_type 2 
Individual    10400
Joint App      1780
Name: application_type, dtype: int64

hardship_flag 2 
N    11832
Y      348
Name: hardship_flag, dtype: int64

debt_settlement_flag 2 
N    12175
Y        5
Name: debt_settlement_flag, dtype: int64



In [5]:
type(df_2019.columns)

pandas.core.indexes.base.Index

In [6]:
X_2019_raw = df_2019.drop('loan_status', axis = 1)
y_2019_raw = df_2019['loan_status']
X_2020_raw = df_2020.drop('loan_status', axis = 1)
y_2020_raw = df_2020['loan_status']

X_2019_raw.shape, y_2019_raw.shape, X_2020_raw.shape, y_2020_raw.shape

((12180, 83), (12180,), (4702, 83), (4702,))

In [7]:
X_2019 = pd.get_dummies(X_2019_raw, drop_first=True)

y_2019 = y_2019_raw.replace({'low_risk':0, 'high_risk': 1})

In [8]:
X_2020 = pd.get_dummies(X_2020_raw, drop_first=True)

y_2020 = y_2020_raw.replace({'low_risk':0, 'high_risk': 1})

In [9]:
X_2019.shape, X_2020.shape

((12180, 85), (4702, 84))

In [10]:
missing_cols = []
for col in X_2019.columns:
    if col not in X_2020.columns:
        X_2020[col] = 0
        missing_cols.append(col)

X_2019.shape, X_2020.shape

((12180, 85), (4702, 85))

In [11]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs', random_state=1)
lr.fit(X_2019, y_2019)
print(f"Training Data Score: {lr.score(X_2019, y_2019)}")
print(f"Testing Data Score: {lr.score(X_2020, y_2020)}")

Training Data Score: 0.6497536945812807
Testing Data Score: 0.5157379838366652


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=1)
rf.fit(X_2019,y_2019)
print(f'Training Score: {rf.score(X_2019, y_2019)}')
print(f"Testing Data Score: {rf.score(X_2020, y_2020)}")

Training Score: 1.0
Testing Data Score: 0.6405784772437261


In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_2019)
X_2019_scaled = scaler.transform(X_2019)
X_2020_scaled = scaler.transform(X_2020)

In [14]:
lr = LogisticRegression(solver='lbfgs', random_state=1)
lr.fit(X_2019_scaled, y_2019)
print(f"Training Data Score: {lr.score(X_2019_scaled, y_2019)}")
print(f"Testing Data Score: {lr.score(X_2020_scaled, y_2020)}")

Training Data Score: 0.7083743842364532
Testing Data Score: 0.7681837515950659


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
rf = RandomForestClassifier(random_state=1)
rf.fit(X_2019_scaled, y_2019)
print(f'Training Score: {rf.score(X_2019_scaled, y_2019)}')
print(f'Testing Score: {rf.score(X_2020_scaled, y_2020)}')

Training Score: 1.0
Testing Score: 0.6390897490429605
