In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import MinMaxScaler

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): with no category given this filter is global, so library
# warnings (deprecations, solver notices, ...) are hidden as well; consider
# narrowing or removing it while debugging.
warnings.filterwarnings('ignore')

def spaceless_lowers(df):
    """Return a copy of ``df`` with normalised column names and lower-cased text.

    Column names have spaces replaced by ``_`` and are lower-cased. String
    values inside text columns (object or pandas string dtype) are lower-cased
    as well; spaces inside values are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and column order as ``df``.
    """
    # Work on a copy so the caller's frame is never changed behind its back.
    out = df.copy()

    for col in out.columns:
        dtype = out[col].dtype
        if isinstance(dtype, pd.StringDtype):
            out[col] = out[col].str.lower()
        elif dtype == object:
            # Object columns may mix strings with other values (numbers, NaN).
            # Lower-case only the strings and keep everything else as it is;
            # ``.str.lower()`` would replace the non-string entries with NaN.
            out[col] = out[col].map(
                lambda value: value.lower() if isinstance(value, str) else value
            )

    # Non-string labels (e.g. integer column names) are kept unchanged.
    out.columns = [
        col.replace(' ', '_').lower() if isinstance(col, str) else col
        for col in out.columns
    ]

    return out


# Read both CSV files (paths are relative to the working directory); the
# first becomes the training frame, the second the test frame.
train_df, test_df = (
    pd.read_csv(Path(csv_name))
    for csv_name in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)

# Disabled check of the target's value distribution:
# train_df['loan_status'].value_counts()

In [None]:
# Training frame: remove columns that are entirely empty, then every row that
# still has a missing value, then the two leftover index columns.
train_df = (
    train_df
    .dropna(axis='columns', how='all')
    .dropna()
    .drop(['Unnamed: 0', 'index'], axis=1)
)

# Test frame: same column clean-up, but rows with missing values are kept here.
test_df = (
    test_df
    .dropna(axis='columns', how='all')
    .drop(['Unnamed: 0', 'index'], axis=1)
)

# Normalise column names and lower-case text values in both frames.
test_df = spaceless_lowers(test_df)
train_df = spaceless_lowers(train_df)

# Both frames together, for steps that must be applied to each of them.
dfs = [train_df, test_df]

In [None]:
# Columns stored with object dtype in the training frame; the same list is
# applied to the test frame below.
obj_cols = [col for col in train_df.columns if train_df[col].dtype == 'object']

# Normalise the labels (lower case, underscores instead of spaces) so they
# match the keys of the code tables below.
for col in obj_cols:
    train_df[col] = train_df[col].str.lower().str.replace(' ', '_')
    test_df[col] = test_df[col].str.lower().str.replace(' ', '_')

# Integer code assigned to each label, per categorical column.
# NOTE(review): 'pymnt_plan' only maps 'n'; any other label would stay as
# text after encoding - confirm that no other value occurs in the data.
category_codes = {
    'home_ownership': {'mortgage': 0, 'rent': 1, 'own': 2, 'any': 3},
    'loan_status': {'low_risk': 0, 'high_risk': 1},
    'verification_status': {'not_verified': 0, 'source_verified': 1, 'verified': 2},
    'pymnt_plan': {'n': 0},
    'initial_list_status': {'w': 0, 'f': 1},
    'application_type': {'individual': 0, 'joint_app': 1},
    'hardship_flag': {'n': 0, 'y': 1},
    'debt_settlement_flag': {'n': 0, 'y': 1},
}

for df in dfs:
    for col, codes in category_codes.items():
        # Assign the encoded column back to the frame explicitly. Calling
        # ``replace(..., inplace=True)`` on a Series pulled out of the frame
        # returns None and only changes the frame when that Series is a view
        # of it, which pandas Copy-on-Write does not guarantee.
        # ``infer_objects`` turns a fully encoded object column into a
        # numeric dtype; labels that were not mapped are left untouched.
        df[col] = df[col].replace(codes).infer_objects()







In [None]:
# Display the columns listed in obj_cols for every row of the training frame.
train_df.loc[:, obj_cols]

In [None]:
# Split the training frame into the target vector and the feature matrix.
y_train = train_df['loan_status'].values
X_train = train_df.drop(columns='loan_status')

# Print the value frequencies of every column listed in obj_cols.
for col in obj_cols:
    frequencies = train_df[col].value_counts()
    print(frequencies, '\n----\n')

In [None]:
# Drop incomplete rows from the whole test frame *before* splitting it, so the
# feature matrix and the label vector always describe the same rows. Dropping
# rows from the features alone leaves y_test longer than X_test whenever a
# test row contains a missing value, and scoring then fails.
test_complete = test_df.dropna()
X_test = test_complete.drop('loan_status', axis=1)
y_test = test_complete['loan_status'].values

In [None]:
# Logistic regression fitted on the features as they are (no scaling).
classifier = LogisticRegression(max_iter=300)
classifier.fit(X_train, y_train)

# Score on the training data first, then on the test data.
logreg_train_score = classifier.score(X_train, y_train)
logreg_test_score = classifier.score(X_test, y_test)

print(
    f'The training coefficient is {logreg_train_score}\n'
    f'The testing coefficient is {logreg_test_score}\n'
)

In [None]:
# Random forest with 300 trees and a fixed seed, fitted on the unscaled features.
rand_class = RandomForestClassifier(n_estimators=300, random_state=42)
rand_class.fit(X_train, y_train)

# Score on the training data first, then on the test data.
forest_train_score = rand_class.score(X_train, y_train)
forest_test_score = rand_class.score(X_test, y_test)

print(
    f'The random forest training coefficient is {forest_train_score}\n'
    f'The random forest testing coefficient is {forest_test_score}\n'
)

In [None]:
# lab_enc = LabelEncoder()
# train_copy = train_df.copy()
#
# train_copy['loan_status'] = lab_enc.fit_transform(train_copy['loan_status'])
# y = pd.DataFrame(train_copy['loan_status'].values)

In [None]:
# Standardise the features. The scaler is fitted on the training data only and
# that same fitted transformation is applied to the test data; refitting it on
# the test set would scale the two sets differently and leak test statistics
# into the evaluation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the existing `classifier` object again, now on the scaled features;
# from here on it holds the scaled-data fit.
classifier.fit(X_train_scaled, y_train)

# score() reports the mean accuracy on the given data.
print(
    f'The scaled regression training coefficient is {classifier.score(X_train_scaled, y_train)}\n'
    f'The scaled regression testing coefficient is {classifier.score(X_test_scaled, y_test)}\n'
)

In [None]:
# A model has to be scored on data prepared the same way as its training data.
# `rand_class` was fitted on the unscaled X_train, so scoring it on
# X_test_scaled is not meaningful. Fit a separate forest with the same
# settings on the scaled training features and score that one instead.
rand_class_scaled = RandomForestClassifier(
    random_state=42,
    n_estimators=300
).fit(X_train_scaled, y_train)

# score() reports the mean accuracy on the given data.
print(f'Scaled Random Forest coefficient: {rand_class_scaled.score(X_test_scaled, y_test)}')