In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter

In [2]:
data_v5 = pd.read_csv("data_v5.csv", low_memory=False)
data_v5

Unnamed: 0,issue_d,annual_inc,dti,fico_range_low,loan_amnt,open_acc,pub_rec,pub_rec_bankruptcies,tax_liens,loan_status_Fully Paid,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT
0,2015-12-01,55000.0,5.91,675.0,3600.0,7.0,0.0,0.0,0.0,1,0,1,0,0,0,0
1,2015-12-01,65000.0,16.06,715.0,24700.0,22.0,0.0,0.0,0.0,1,0,1,0,0,0,0
2,2015-12-01,63000.0,10.78,695.0,20000.0,6.0,0.0,0.0,0.0,1,0,1,0,0,0,0
3,2015-12-01,104433.0,25.37,695.0,10400.0,12.0,0.0,0.0,0.0,1,0,1,0,0,0,0
4,2015-12-01,34000.0,10.20,690.0,11950.0,5.0,0.0,0.0,0.0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1344234,2016-10-01,130000.0,20.59,735.0,18000.0,17.0,0.0,0.0,0.0,1,0,0,0,0,1,0
1344235,2016-10-01,180792.0,22.03,705.0,29400.0,16.0,0.0,0.0,0.0,1,0,1,0,0,0,0
1344236,2016-10-01,157000.0,10.34,735.0,32000.0,14.0,0.0,0.0,0.0,0,0,1,0,0,0,0
1344237,2016-10-01,150000.0,12.25,665.0,16000.0,12.0,4.0,3.0,0.0,1,0,0,0,0,0,1


In [3]:
data_v5.describe()

Unnamed: 0,annual_inc,dti,fico_range_low,loan_amnt,open_acc,pub_rec,pub_rec_bankruptcies,tax_liens,loan_status_Fully Paid,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT
count,1344239.0,1344239.0,1344239.0,1344239.0,1344239.0,1344239.0,1344239.0,1344239.0,1344239.0,1344239.0,1344239.0,1344239.0,1344239.0,1344239.0,1344239.0
mean,76272.33,18.28653,696.1737,14421.49,11.59528,0.2153977,0.1344515,0.05216632,0.8003554,0.0002127598,0.4947677,3.347619e-05,0.0001071238,0.1076713,0.3972076
std,69909.32,11.16102,31.84432,8716.773,5.47366,0.6020465,0.3779437,0.3980553,0.3997333,0.01458474,0.4999728,0.005785767,0.01034951,0.309965,0.4893198
min,16.0,-1.0,660.0,500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,45902.0,11.8,670.0,8000.0,8.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,65000.0,17.62,690.0,12000.0,11.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,90000.0,24.06,710.0,20000.0,14.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
max,10999200.0,999.0,845.0,40000.0,90.0,86.0,12.0,85.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
data_v5.dtypes

issue_d                     object
annual_inc                 float64
dti                        float64
fico_range_low             float64
loan_amnt                  float64
open_acc                   float64
pub_rec                    float64
pub_rec_bankruptcies       float64
tax_liens                  float64
loan_status_Fully Paid       int64
home_ownership_ANY           int64
home_ownership_MORTGAGE      int64
home_ownership_NONE          int64
home_ownership_OTHER         int64
home_ownership_OWN           int64
home_ownership_RENT          int64
dtype: object

In [5]:
#Setting y/prediction variable column
y = data_v5['loan_status_Fully Paid']
y.ravel()
y

0          1
1          1
2          1
3          1
4          1
          ..
1344234    1
1344235    1
1344236    0
1344237    1
1344238    0
Name: loan_status_Fully Paid, Length: 1344239, dtype: int64

In [6]:
#Setting x/dependent variables 
X = data_v5[[
'annual_inc','fico_range_low', 'dti', 'loan_amnt',
'open_acc',
'pub_rec',
'pub_rec_bankruptcies',
'tax_liens', 'home_ownership_ANY',
'home_ownership_MORTGAGE',
'home_ownership_NONE',
'home_ownership_OTHER',
'home_ownership_OWN',
'home_ownership_RENT']]

In [7]:
# split Data into training and testing sets **USING STRATIFY method**
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [8]:

Counter(y_train)

Counter({0: 201277, 1: 806902})

In [9]:
# SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)


In [10]:
Counter(y_resampled)

Counter({0: 806902, 1: 806902})

In [11]:
# Instantiate and Fit the model on resampled X and y
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [12]:
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 36031,  31062],
       [106282, 162685]])

In [13]:
balanced_accuracy_score(y_test, y_pred)

0.5709409207445588

In [14]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.25      0.54      0.60      0.34      0.57      0.32     67093
          1       0.84      0.60      0.54      0.70      0.57      0.33    268967

avg / total       0.72      0.59      0.55      0.63      0.57      0.33    336060

