In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the two loan files: 2019 is used for training, 2020 Q1 for testing.
train_path = Path('2019loans.csv')
test_path = Path('2020Q1loans.csv')
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

Check Columns of the DataFrames

In [None]:
# Capture column labels and dtypes of both raw frames for a side-by-side check.
train_columns, train_types = train_df.columns, train_df.dtypes
test_columns, test_types = test_df.columns, test_df.dtypes

In [None]:
# Pair (train column, train dtype, test column, test dtype) position by position.
# zip stops at the shortest input, so a length mismatch is truncated silently.
compare_df = [*zip(train_columns, train_types, test_columns, test_types)]
compare_df

Convert the Categorical Data to Numeric & Separate the Training Target Feature

In [None]:
# One-hot encode the training frame; numeric columns pass through unchanged.
train_converted_df = pd.get_dummies(data=train_df)
train_converted_df.head(n=5)

In [None]:
# Keep the encoded training column labels so they can be compared with the test set's.
train_converted_columns = train_converted_df.columns

Convert the Categorical Data to Numeric & Separate the Testing Target Feature

In [None]:
# One-hot encode the testing frame the same way as the training frame.
# (The function is `pd.get_dummies`; the previous spelling raised AttributeError.)
test_converted_df = pd.get_dummies(test_df)
test_converted_df.head()

In [None]:
# Keep the encoded testing column labels so they can be compared with the training set's.
test_converted_columns = test_converted_df.columns

In [None]:
# Pair encoded train/test column labels by position to spot where they diverge.
# zip stops at the shorter sequence, so trailing extra columns are not shown.
compare_converted_df = [*zip(train_converted_columns, test_converted_columns)]
compare_converted_df

Fill in the Missing Categories in the Testing Set

In [None]:
# Align the encoded 2020 frame to the encoded 2019 layout.
# Any dummy column the test file did not produce (here 'debt_settlement_flag_Y')
# is added as all zeros, and columns are put in the training order so both
# frames present identical features, in identical positions, to the models.
# Columns that exist only in the test frame are dropped by the reindex.
test_converted_df = test_converted_df.reindex(
    columns=train_converted_df.columns,
    fill_value=0,
)
test_converted_df.head()

Considering the Models

Create a Logistic Regression Model (Unscaled Data) & Print the Model Score

In [None]:
# Preview the first rows of the encoded training frame before building X and y.
train_converted_df.head()

In [None]:
# Training features: every encoded column except the two one-hot target columns.
X = train_converted_df.drop(columns=['target_high_risk', 'target_low_risk'])
# Training labels: the 'target_high_risk' indicator column.
y = train_converted_df['target_high_risk']

print("Shape: ", X.shape, y.shape)

In [None]:
# 2020 Q1 features: every encoded column except the two one-hot target columns.
X_2 = test_converted_df.drop(columns=['target_high_risk', 'target_low_risk'])
# 2020 Q1 labels: the 'target_high_risk' indicator column.
y_2 = test_converted_df['target_high_risk']

print("Shape: ", X_2.shape, y_2.shape)

Split the 2019 Data into Training and Testing Subsets

In [None]:
# Split the 2019 features/labels into fitting and hold-out subsets; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [None]:
# Logistic regression with a high iteration cap.
classifier = LogisticRegression(max_iter=20_000)
classifier

In [None]:
# Fit the logistic regression on the unscaled 2019 training split.
classifier.fit(X_train, y_train)

In [None]:
# Report mean accuracy on the training split, the 2019 hold-out split and 2020 Q1.
print(
    f"Training Data Score: {classifier.score(X_train, y_train)}",
    f"Testing Data Score (from training set): {classifier.score(X_test, y_test)}",
    f'First Quarter 2020 fit: {classifier.score(X_2, y_2)}',
    sep="\n",
)

In [None]:
# Store the unscaled logistic-regression scores for the summary table.
train_logistic, test_logistic, Q1_2020_logistic = (
    classifier.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test), (X_2, y_2))
)

Create a Random Forest Classifier Model (Unscaled Data) & Print the Model Score

In [None]:
# Random forest on the unscaled training split; the seed keeps the run repeatable.
rfc = RandomForestClassifier(n_estimators=500, random_state=7)
rfc.fit(X_train, y_train)
print(
    f"Training Data Score: {rfc.score(X_train, y_train)}",
    f"Testing Data Score (from training set): {rfc.score(X_test, y_test)}",
    f'First Quarter 2020 fit: {rfc.score(X_2, y_2)}',
    sep="\n",
)

In [None]:
# Store the unscaled random-forest scores for the summary table.
train_rfc, test_rfc, Q1_2020_rfc = (
    rfc.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test), (X_2, y_2))
)

Revisit the Preprocessing & Scale the Data

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to the hold-out split and to the 2020 Q1 features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled, X_2_scaled = map(
    scaler.transform, (X_train, X_test, X_2)
)

Create a Logistic Regression Model (Scaled Data) & Print the Model Score

In [None]:
# Refit the same LogisticRegression instance on the standardized features.
# From this point `classifier` holds the scaled-data model, not the unscaled one.
classifier.fit(X_train_scaled, y_train)
print(
    f"Training Data Score: {classifier.score(X_train_scaled, y_train)}",
    f"Testing Data Score (from training set): {classifier.score(X_test_scaled, y_test)}",
    f'First Quarter 2020 fit: {classifier.score(X_2_scaled, y_2)}',
    sep="\n",
)

In [None]:
# Store the scaled logistic-regression scores for the summary table.
train_logistic_scaled, test_logistic_scaled, Q1_2020_logistic_scaled = (
    classifier.score(features, target)
    for features, target in (
        (X_train_scaled, y_train),
        (X_test_scaled, y_test),
        (X_2_scaled, y_2),
    )
)

In [None]:
# Create a Random Forest Classifier Model (Scaled Data) & Print the Model Score

In [None]:
# NOTE(review): this cell is disabled. While it stays commented out, `rfc` is
# still the forest fitted on the unscaled training split. The print lines also
# reference `clf`, which is not defined anywhere in this notebook (presumably
# `rfc` was intended), so un-commenting the cell as written raises NameError.
# rfc = RandomForestClassifier(random_state=7, n_estimators=500).fit(X_train_scaled, y_train)
# print(f"Training Data Score: {clf.score(X_train_scaled, y_train)}")
# print(f"Testing Data Score (from training set): {clf.score(X_test_scaled, y_test)}")
# print(f'First Quarter 2020 fit: {clf.score(X_2_scaled, y_2)}')

In [None]:
# Fit a dedicated forest on the standardized training split.
# Scoring `rfc` here would evaluate a model trained on unscaled features against
# scaled inputs, which makes the "scaled" random-forest numbers meaningless.
# A separate name also leaves the unscaled `rfc` untouched.
rfc_scaled = RandomForestClassifier(random_state=7, n_estimators=500).fit(X_train_scaled, y_train)

# Store the scaled random-forest scores for the summary table.
train_rfc_scaled = rfc_scaled.score(X_train_scaled, y_train)
test_rfc_scaled = rfc_scaled.score(X_test_scaled, y_test)
Q1_2020_rfc_scaled = rfc_scaled.score(X_2_scaled, y_2)

print(f"Training Data Score: {train_rfc_scaled}")
print(f"Testing Data Score (from training set): {test_rfc_scaled}")
print(f'First Quarter 2020 fit: {Q1_2020_rfc_scaled}')

In [None]:
# Collect every model's scores into one table: one row per data set, one column per model.
results = {
    'data_set': ['2019_training', '2019_test', '2020_Q1'],
    'LogReg_Unscaled': [train_logistic, test_logistic, Q1_2020_logistic],
    'RandomForest_Unscaled': [train_rfc, test_rfc, Q1_2020_rfc],
    'LogReg_Scaled': [train_logistic_scaled, test_logistic_scaled, Q1_2020_logistic_scaled],
    'RandomForest_Scaled': [train_rfc_scaled, test_rfc_scaled, Q1_2020_rfc_scaled],
}
results_df = pd.DataFrame(results)
results_df