# Module 03

## Session 07: Imbalanced Classification

Analyze the data in bankloan.csv:
* Build a logistic regression model:
    - target: default
    - features: employ, debtinc, creddebt, othdebt
* Split the data 80:20 (train:test) with random state 2020.
* Evaluate the models using the F1 score and stratified 5-fold CV:
    - penalized logistic regression
    - logistic regression with SMOTE
* Which method is better?

# Library

In [4]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2, so this import
# fails on current releases; it is not used in the visible cells. On newer
# versions use sklearn.metrics.RocCurveDisplay.from_estimator instead.
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import classification_report

import warnings
# Silences every warning for the whole session, which may also hide solver
# warnings raised by the models below -- TODO confirm this is intended.
warnings.filterwarnings('ignore')

# Data

In [7]:
# Read the bank-loan dataset; the path is relative to the notebook's
# working directory.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved row index -- consider index_col=0 if it is not needed.
bankloan = pd.read_csv(filepath_or_buffer='./datasets/bankloan.csv')
# Show the first five rows as a quick check of the columns.
bankloan.head(5)

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.65872,0.82128,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1


# Data Splitting

In [8]:
# Feature matrix: the four predictors named in the task description.
X = bankloan.loc[:, ['employ', 'debtinc', 'creddebt', 'othdebt']]
# Target vector: the loan-default indicator.
y = bankloan.loc[:, 'default']

In [9]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=2020,
    stratify=y,
)

# Model

## 1. Penalized Logistic Regression

In [8]:
# Cross-validate a class-weighted ("penalized") logistic regression:
# class_weight='balanced' reweights the classes instead of resampling them.
skf = StratifiedKFold(n_splits=5)
model = LogisticRegression(class_weight='balanced')
model_cv = cross_val_score(
    estimator=model,
    X=X_trainval,
    y=y_trainval,
    scoring='f1',
    cv=skf,
)

In [14]:
# Per-fold F1 scores, then their mean and standard deviation, one per line.
print(model_cv, model_cv.mean(), model_cv.std(), sep='\n')

[0.63888889 0.58823529 0.61728395 0.58064516 0.60526316]
0.6060632905617759
0.020822978090423456


## 2. Logistic Regression with SMOTE

In [10]:
# Oversample the minority class with SMOTE inside the pipeline. imblearn's
# Pipeline applies samplers only while fitting, so each CV validation fold
# is scored on data that has not been resampled.
# SMOTE is seeded so the synthetic samples -- and therefore the CV scores --
# are repeatable; 2020 matches the seed used for the train/test split.
smote = SMOTE(random_state=2020)
model = LogisticRegression()

pipe_model = Pipeline(
    [
        ('balance', smote),
        ('clf', model)
    ]
)

skf = StratifiedKFold(n_splits=5)

In [12]:
# Estimate F1 for the SMOTE pipeline over the stratified folds in skf.
model_smote_cv = cross_val_score(
    estimator=pipe_model,
    X=X_trainval,
    y=y_trainval,
    scoring='f1',
    cv=skf,
)

In [15]:
# Per-fold F1 scores, then their mean and standard deviation, one per line.
print(model_smote_cv, model_smote_cv.mean(), model_smote_cv.std(), sep='\n')

[0.63013699 0.58461538 0.6097561  0.58064516 0.61333333]
0.6036973926202772
0.01856954373049472


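# Final Model Performance: Penalized Logistic Regression

In the recorded runs above, the penalized model's mean CV F1 (0.606) is marginally higher than the SMOTE pipeline's (0.604). The gap is well within one standard deviation (about 0.02), so the two methods perform comparably; the penalized model is used as the final model.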

In [16]:
# Refit the class-weighted model on the full train/validation partition.
# fit() stays as the last expression so the cell echoes the fitted estimator.
model = LogisticRegression(class_weight='balanced')
model.fit(X=X_trainval, y=y_trainval)

LogisticRegression(class_weight='balanced')

In [19]:
# Predict on the held-out test set and print the per-class metrics.
y_pred = model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.68      0.78       103
           1       0.48      0.84      0.61        37

    accuracy                           0.72       140
   macro avg       0.70      0.76      0.70       140
weighted avg       0.81      0.72      0.74       140

