### Prediction
I think that the random forest classifier will generate a better model than logistic regression.

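A random forest does not rely on the statistical assumptions that logistic regression makes (such as a linear relationship between the features and the log-odds of the outcome), and averaging many trees makes it robust to overfitting.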

In [1]:

import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation and convergence
# warnings raised by the cells below - consider narrowing the filter.
warnings.filterwarnings('ignore')


def spaceless_lowers(df):
    """Normalise column names and lowercase text values of a DataFrame, in place.

    Column names have spaces replaced with ``_`` and are lowercased.
    String values inside object/string columns are lowercased as well;
    non-string values (numbers, NaN, ...) in those columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be chained.
    """

    new_cols = []

    for col in df.columns:
        column = df[col]

        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            # Lowercase element-wise instead of using `.str.lower()`: the `.str`
            # accessor turns non-string cells of a mixed column into NaN and
            # raises on an object column that holds no strings at all.
            df[col] = column.map(
                lambda value: value.lower() if isinstance(value, str) else value
            )

        # Only string labels can be normalised; keep any other label as it is.
        new_cols.append(col.replace(' ', '_').lower() if isinstance(col, str) else col)

    df.columns = new_cols

    return df


# 2019 loans are used for training, 2020 Q1 loans for testing.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)

# Columns kept for modelling; 'loan_status' is the label, the rest are features.
use_cols = [
    'home_ownership',
    'verification_status',
    'application_type',
    'hardship_flag',
    'debt_settlement_flag',
    'initial_list_status',
    'pymnt_plan',
    'loan_status',
]



In [2]:
# Drop columns that contain no data at all.
train_df = train_df.dropna(axis='columns', how='all')
test_df = test_df.dropna(axis='columns', how='all')

# Drop training rows that have a missing value in any remaining column.
train_df = train_df.dropna()

# Presumably leftover index columns from the CSV export - not features.
train_df = train_df.drop(['Unnamed: 0', 'index'], axis=1)
test_df = test_df.drop(['Unnamed: 0', 'index'], axis=1)

# Keep only the modelling columns.
# - The test frame also drops rows with a missing value in those columns, so the
#   features and labels derived from it later always have the same length.
# - `.copy()` makes each frame independent of the one it was sliced from, so the
#   column assignments in later cells modify these frames directly.
test_df = spaceless_lowers(test_df)[use_cols].dropna().copy()
train_df = spaceless_lowers(train_df)[use_cols].copy()

# One-hot encoding with pd.get_dummies(df, columns=use_cols) was considered here,
# but the columns are integer-encoded by hand in a later cell instead.

dfs = [train_df, test_df]

In [3]:
# Inspect the test frame after cleaning and column selection.
test_df

Unnamed: 0,home_ownership,verification_status,application_type,hardship_flag,debt_settlement_flag,initial_list_status,pymnt_plan,loan_status
0,mortgage,not verified,individual,n,n,w,n,low_risk
1,rent,not verified,individual,n,n,w,n,low_risk
2,rent,not verified,individual,n,n,w,n,low_risk
3,rent,not verified,individual,n,n,w,n,low_risk
4,rent,not verified,individual,n,n,w,n,low_risk
...,...,...,...,...,...,...,...,...
4697,rent,source verified,individual,n,n,f,n,high_risk
4698,rent,not verified,individual,n,n,w,n,high_risk
4699,rent,verified,individual,n,n,f,n,high_risk
4700,rent,source verified,individual,n,n,w,n,high_risk


In [4]:
# Inspect the training frame after cleaning and column selection.
train_df

Unnamed: 0,home_ownership,verification_status,application_type,hardship_flag,debt_settlement_flag,initial_list_status,pymnt_plan,loan_status
0,mortgage,not verified,individual,n,n,w,n,low_risk
1,mortgage,source verified,individual,n,n,w,n,low_risk
2,mortgage,source verified,individual,n,n,w,n,low_risk
3,rent,not verified,individual,n,n,w,n,low_risk
4,mortgage,source verified,individual,n,n,w,n,low_risk
...,...,...,...,...,...,...,...,...
12175,rent,not verified,individual,n,n,w,n,high_risk
12176,rent,verified,individual,n,n,w,n,high_risk
12177,rent,not verified,individual,n,n,w,n,high_risk
12178,mortgage,source verified,joint app,n,n,w,n,high_risk


In [5]:
# Confirm that only the columns listed in `use_cols` remain.
train_df.columns

Index(['home_ownership', 'verification_status', 'application_type',
       'hardship_flag', 'debt_settlement_flag', 'initial_list_status',
       'pymnt_plan', 'loan_status'],
      dtype='object')

In [6]:
# Columns of the training frame that are still stored as text.
obj_cols = [col for col in train_df.columns if train_df[col].dtype == 'object']

# Normalise the category labels (lowercase, '_' instead of spaces) so they match
# the keys of the encoding table below.
for col in obj_cols:
    train_df[col] = train_df[col].str.lower().str.replace(' ', '_')
    test_df[col] = test_df[col].str.lower().str.replace(' ', '_')

# Integer code for every known category, keyed by column name.
category_codes = {
    'home_ownership': {'mortgage': 0, 'rent': 1, 'own': 2, 'any': 3},
    'loan_status': {'low_risk': 0, 'high_risk': 1},
    'verification_status': {'not_verified': 0, 'source_verified': 1, 'verified': 2},
    # 'y' is encoded as well so the column stays numeric if a payment plan ever
    # shows up; it follows the same n/y coding as the other flag columns.
    'pymnt_plan': {'n': 0, 'y': 1},
    'initial_list_status': {'w': 0, 'f': 1},
    'application_type': {'individual': 0, 'joint_app': 1},
    'hardship_flag': {'n': 0, 'y': 1},
    'debt_settlement_flag': {'n': 0, 'y': 1},
}

for df in (train_df, test_df):
    for col, codes in category_codes.items():
        # Assign the encoded column back to the frame. Calling
        # `replace(..., inplace=True)` on a Series pulled out of the frame only
        # changes that Series under pandas Copy-on-Write, leaving the frame as text.
        # Values without a code (including NaN) are passed through unchanged.
        df[col] = df[col].map(lambda value, codes=codes: codes.get(value, value))



In [7]:
# Show the columns that were text before encoding; they should now hold integer codes.
train_df[obj_cols]

Unnamed: 0,home_ownership,verification_status,application_type,hardship_flag,debt_settlement_flag,initial_list_status,pymnt_plan,loan_status
0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
12175,1,0,0,0,0,0,0,1
12176,1,2,0,0,0,0,0,1
12177,1,0,0,0,0,0,0,1
12178,0,1,1,0,0,0,0,1


In [8]:
# Separate the label vector from the feature matrix.
y_train = train_df['loan_status'].values
X_train = train_df.drop(columns='loan_status')

# Print how often each code occurs in every encoded column as a sanity check.
for col in obj_cols:
    counts = train_df[col].value_counts()
    print(counts, '\n----\n')

0    5800
1    4944
2    1371
3      65
Name: home_ownership, dtype: int64 
----

0    5301
1    4881
2    1998
Name: verification_status, dtype: int64 
----

0    10400
1     1780
Name: application_type, dtype: int64 
----

0    11832
1      348
Name: hardship_flag, dtype: int64 
----

0    12175
1        5
Name: debt_settlement_flag, dtype: int64 
----

0    11158
1     1022
Name: initial_list_status, dtype: int64 
----

0    12180
Name: pymnt_plan, dtype: int64 
----

0    6090
1    6090
Name: loan_status, dtype: int64 
----



In [None]:
# Drop incomplete rows BEFORE splitting, so the features and the labels come from
# exactly the same rows. Dropping them from X_test only would leave y_test longer.
test_complete = test_df.dropna()
X_test = test_complete.drop('loan_status', axis=1)
y_test = test_complete['loan_status'].values

In [None]:
# Logistic regression on the integer-encoded, unscaled features.
# max_iter=300 raises the iteration cap passed to the solver.
classifier = LogisticRegression(max_iter=300).fit(X_train, y_train)

# `score` returns the mean accuracy on the given data and labels.
print(
    f'The training coefficient is {classifier.score(X_train, y_train)}\n'
    f'The testing coefficient is {classifier.score(X_test, y_test)}\n'
)

In [None]:
# Random forest on the same unscaled features; the fixed seed makes runs repeatable.
rand_class = RandomForestClassifier(n_estimators=300, random_state=42)
rand_class.fit(X_train, y_train)

# `score` returns the mean accuracy on the given data and labels.
print(
    f'The random forest training coefficient is {rand_class.score(X_train, y_train)}\n'
    f'The random forest testing coefficient is {rand_class.score(X_test, y_test)}\n'
)

In [None]:
scaler = StandardScaler()

# Learn the mean and variance from the training data only, then apply that same
# transformation to the test data. Refitting the scaler on the test set would
# scale the two sets differently and leak test statistics into the evaluation.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Refits the existing `classifier`, replacing the model trained on unscaled features.
classifier.fit(X_train_scaled, y_train)

print(
    f'The scaled regression training coefficient is {classifier.score(X_train_scaled, y_train)}\n'
    f'The scaled regression testing coefficient is {classifier.score(X_test_scaled, y_test)}\n'
)

In [None]:
# `rand_class` was trained on unscaled features, so scoring it on scaled inputs
# says nothing about the model. Fit a separate forest (same hyper-parameters) on
# the scaled training data and evaluate that one instead.
rand_class_scaled = RandomForestClassifier(
    random_state=42,
    n_estimators=300
).fit(X_train_scaled, y_train)

print(f'Scaled Random Forest coefficient: {rand_class_scaled.score(X_test_scaled, y_test)}')

In [None]:
# io.StringIO is the standard-library class that six.StringIO aliases on Python 3,
# so the third-party `six` package is not needed.
from io import StringIO
from sklearn import tree

# In-memory text buffer; the export cell below writes straight to a .dot file
# instead, so this is kept only in case other cells still refer to it.
dot_data = StringIO()

# Single decision tree on the unscaled features, used for visualisation.
# The fixed seed (same value as the random forest cell) makes the tree repeatable.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)

In [None]:
# Write the fitted tree to a Graphviz .dot file, labelling nodes with the feature names.
# To render it as an SVG, run the following in a terminal once the file exists:
#   dot -Tsvg dot_data_new.dot -o tree_labels_new.svg
tree.export_graphviz(
    clf,
    out_file='dot_data_new.dot',
    feature_names=X_train.columns,
    leaves_parallel=True,
    special_characters=True,
    rounded=True,
    filled=True,
)

In [None]:
# from dtreeviz.trees import dtreeviz
#
# viz = dtreeviz(
#     clf,
#     X_train,
#     y,
#     feature_names=X_train.columns
# )
#
# viz

# TODO: dtreeviz keeps running into a backend error that I don't have the time to fix right now.
# When it does work, it generates really cool visualizations.