In [84]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

In [85]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))


In [86]:

print(train_df.columns.values)

['Unnamed: 0' 'index' 'loan_amnt' 'int_rate' 'installment'
 'home_ownership' 'annual_inc' 'verification_status' 'loan_status'
 'pymnt_plan' 'dti' 'delinq_2yrs' 'inq_last_6mths' 'open_acc' 'pub_rec'
 'revol_bal' 'total_acc' 'initial_list_status' 'out_prncp' 'out_prncp_inv'
 'total_pymnt' 'total_pymnt_inv' 'total_rec_prncp' 'total_rec_int'
 'total_rec_late_fee' 'recoveries' 'collection_recovery_fee'
 'last_pymnt_amnt' 'collections_12_mths_ex_med' 'policy_code'
 'application_type' 'acc_now_delinq' 'tot_coll_amt' 'tot_cur_bal'
 'open_acc_6m' 'open_act_il' 'open_il_12m' 'open_il_24m'
 'mths_since_rcnt_il' 'total_bal_il' 'il_util' 'open_rv_12m' 'open_rv_24m'
 'max_bal_bc' 'all_util' 'total_rev_hi_lim' 'inq_fi' 'total_cu_tl'
 'inq_last_12m' 'acc_open_past_24mths' 'avg_cur_bal' 'bc_open_to_buy'
 'bc_util' 'chargeoff_within_12_mths' 'delinq_amnt' 'mo_sin_old_il_acct'
 'mo_sin_old_rev_tl_op' 'mo_sin_rcnt_rev_tl_op' 'mo_sin_rcnt_tl'
 'mort_acc' 'mths_since_recent_bc' 'mths_since_recent_inq'
 'num

In [87]:
# # Convert categorical data to numeric on training data, feature and target
# X_2019 = train_df.drop("loan_status", axis=1)
# X_2019 = pd.get_dummies(X_2019)

# df2 = train_df.copy()

# df2['loan_status'] = pd.get_dummies((df2['loan_status']))
# from pandas import DataFrame
# y_2019 = DataFrame(df2["loan_status"])

X_2019 = train_df.drop("loan_status", axis=1)
X_2019 = pd.get_dummies(X_2019)

# df2 = train_df.copy()

df2['loan_status'] = pd.get_dummies((train_df['loan_status']))
from pandas import DataFrame
y_2019 = DataFrame(df2["loan_status"])


In [88]:
# print(train_df['loan_status'])
print(X_2019.columns.values)


['Unnamed: 0' 'index' 'loan_amnt' 'int_rate' 'installment' 'annual_inc'
 'dti' 'delinq_2yrs' 'inq_last_6mths' 'open_acc' 'pub_rec' 'revol_bal'
 'total_acc' 'out_prncp' 'out_prncp_inv' 'total_pymnt' 'total_pymnt_inv'
 'total_rec_prncp' 'total_rec_int' 'total_rec_late_fee' 'recoveries'
 'collection_recovery_fee' 'last_pymnt_amnt' 'collections_12_mths_ex_med'
 'policy_code' 'acc_now_delinq' 'tot_coll_amt' 'tot_cur_bal' 'open_acc_6m'
 'open_act_il' 'open_il_12m' 'open_il_24m' 'mths_since_rcnt_il'
 'total_bal_il' 'il_util' 'open_rv_12m' 'open_rv_24m' 'max_bal_bc'
 'all_util' 'total_rev_hi_lim' 'inq_fi' 'total_cu_tl' 'inq_last_12m'
 'acc_open_past_24mths' 'avg_cur_bal' 'bc_open_to_buy' 'bc_util'
 'chargeoff_within_12_mths' 'delinq_amnt' 'mo_sin_old_il_acct'
 'mo_sin_old_rev_tl_op' 'mo_sin_rcnt_rev_tl_op' 'mo_sin_rcnt_tl'
 'mort_acc' 'mths_since_recent_bc' 'mths_since_recent_inq'
 'num_accts_ever_120_pd' 'num_actv_bc_tl' 'num_actv_rev_tl' 'num_bc_sats'
 'num_bc_tl' 'num_il_tl' 'num_op_rev_tl'

In [89]:
y_2019.value_counts()


loan_status
0              6090
1              6090
dtype: int64

In [90]:
# # Convert categorical data to numeric on testing data
# X_2020test = pd.get_dummies(test_df.drop('loan_status', axis=1))

# # Separate target feature for testing data
# y_2020test = test_df.loc[:, 'loan_status']


# Convert categorical data to numeric on training data, feature and target
X_2020test = test_df.drop("loan_status", axis=1)
X_2020test = pd.get_dummies(X_2020test)

# df2 = train_df.copy()

df2['loan_status'] = pd.get_dummies((test_df['loan_status']))
from pandas import DataFrame
y_2020test = DataFrame(df2["loan_status"])


In [91]:
# Add missing dummy variables to testing set
# Get missing columns in the training test
missing_cols = set( X_2019.columns ) - set( X_2020test.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    X_2020test[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
X_2020test = X_2020test[X_2019.columns]


In [92]:
# Feature selection

In [93]:
y_2020test.value_counts()

loan_status
0.0            2351
1.0            2351
dtype: int64

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X_2019, y_2019, random_state=1)
Counter(y_train)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Testing Model 1 : Logistic Regression 

### Unscaled Data

In [95]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier

LogisticRegression()

In [96]:
# Fit (train) our model by using the training data
classifier.fit(X_train, y_train)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [97]:
# Validate the model by using the test data
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6535303776683087
Testing Data Score: 0.6440065681444992


In [98]:
# Split into testing and training data

In [99]:
# Unscale/Scale

In [100]:
# Hyperparameter tuning

In [101]:
# get scores and classication report

In [102]:
## Rinse and Repeat

## Testing Model 2 : RandomForest

### Unscaled Data

In [103]:
# Train a Random Forest Classifier model on unscaled data and print the model score
# Fit random forest and get training and testing score
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

  clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)


Training Score: 1.0
Testing Score: 0.7937602627257799


In [105]:
predictions = clf.predict(X_test)
predictions

array([1, 1, 0, ..., 1, 1, 1], dtype=uint8)

In [106]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["high_risk", "low_risk"]))

              precision    recall  f1-score   support

   high_risk       0.81      0.77      0.79      1546
    low_risk       0.77      0.82      0.80      1499

    accuracy                           0.79      3045
   macro avg       0.79      0.79      0.79      3045
weighted avg       0.79      0.79      0.79      3045



### Scaled Data

In [108]:
# Train a Random Forest Classifier model on the scaled data and print the model score
# Fit random forest and get training and testing score
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')


  clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)


Training Score: 1.0
Testing Score: 0.7937602627257799


In [109]:
predictions = clf.predict(X_test_scaled)
predictions

array([1, 1, 0, ..., 1, 1, 1], dtype=uint8)

In [110]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["high_risk", "low_risk"]))

              precision    recall  f1-score   support

   high_risk       0.81      0.77      0.79      1546
    low_risk       0.77      0.82      0.80      1499

    accuracy                           0.79      3045
   macro avg       0.79      0.79      0.79      3045
weighted avg       0.79      0.79      0.79      3045



In [111]:
X_test_scaled

array([[-0.86884072, -0.86884072,  0.26219508, ..., -0.1694586 ,
         0.01812499, -0.01812499],
       [-0.97142543, -0.97142543,  0.75296671, ..., -0.1694586 ,
         0.01812499, -0.01812499],
       [-1.77412602, -1.77412602,  1.07196827, ..., -0.1694586 ,
         0.01812499, -0.01812499],
       ...,
       [ 0.5308302 ,  0.5308302 , -0.52303953, ..., -0.1694586 ,
         0.01812499, -0.01812499],
       [ 0.78356135,  0.78356135,  0.26219508, ..., -0.1694586 ,
         0.01812499, -0.01812499],
       [ 1.02208108,  1.02208108, -0.71934818, ..., -0.1694586 ,
         0.01812499, -0.01812499]])

In [None]:
# y_pred = model.predict(X_test)
# balanced_accuracy_score(y_test, y_pred)

In [None]:
# Compare the models and save the best one