# Module 03

## Session 07 Imbalance Classification

Analyze data bankloan.csv
* build a logistic regression model:
    - target: default
    - features: employ, debtinc, creddebt, othdebt
* random state 2020, ratio 80:20
* model evaluation using f1 score and stratified 5-fold CV
* Logistic regression with SMOTE optimize the k neighbor optimize c, solver
* combine the result (before and after)

# Library

In [3]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import plot_roc_curve
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

# Data

In [4]:
bankloan = pd.read_csv('./datasets/bankloan.csv')
bankloan.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.65872,0.82128,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1


# Data Splitting

In [5]:
X = bankloan[['employ', 'debtinc', 'creddebt', 'othdebt']]
y = bankloan['default']

In [6]:
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=2020
)

# Model

In [10]:
smote = SMOTE()
model = LogisticRegression()
pipe_model = Pipeline(
    [
        ('balance', smote),
        ('clf', model)
    ]
)

skf = StratifiedKFold(n_splits=5)

params = {
    'balance__k_neighbors':[2,5,10,15,20],
    'clf__C':[100,10,1,0.1,0.01, 0.01],
    'clf__solver':['lbfgs', 'liblinear', 'newton-cg']
}

grid_search = GridSearchCV(
    pipe_model,
    param_grid=params,
    cv=skf,
    scoring='f1',
    n_jobs=-1
)

In [11]:
grid_search.fit(X_trainval, y_trainval)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('balance', SMOTE()),
                                       ('clf', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'balance__k_neighbors': [2, 5, 10, 15, 20],
                         'clf__C': [100, 10, 1, 0.1, 0.01, 0.01],
                         'clf__solver': ['lbfgs', 'liblinear', 'newton-cg']},
             scoring='f1')

In [13]:
print('best score: ', grid_search.best_score_ )
print('best params: ', grid_search.best_params_)

best score:  0.6189702603021059
best params:  {'balance__k_neighbors': 20, 'clf__C': 100, 'clf__solver': 'newton-cg'}


In [15]:
tuning_result = pd.DataFrame(grid_search.cv_results_)
tuning_result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_balance__k_neighbors,param_clf__C,param_clf__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.011887,0.001659,0.00183,0.000567,2,100,lbfgs,"{'balance__k_neighbors': 2, 'clf__C': 100, 'cl...",0.647887,0.567164,0.625,0.612903,0.605263,0.611644,0.026506,16
1,0.00675,0.001746,0.001473,0.000312,2,100,liblinear,"{'balance__k_neighbors': 2, 'clf__C': 100, 'cl...",0.648649,0.588235,0.625,0.603175,0.621622,0.617336,0.020518,3
2,0.011208,0.002753,0.001444,0.000412,2,100,newton-cg,"{'balance__k_neighbors': 2, 'clf__C': 100, 'cl...",0.638889,0.57971,0.607595,0.580645,0.605263,0.60242,0.021701,63
3,0.010565,0.003611,0.001578,0.000435,2,10,lbfgs,"{'balance__k_neighbors': 2, 'clf__C': 10, 'clf...",0.630137,0.575758,0.609756,0.625,0.605263,0.609183,0.01909,24
4,0.007373,0.00298,0.001447,0.000532,2,10,liblinear,"{'balance__k_neighbors': 2, 'clf__C': 10, 'clf...",0.637681,0.553846,0.609756,0.580645,0.613333,0.599052,0.028959,72


In [16]:
tuning_result[tuning_result['rank_test_score'] == 1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_balance__k_neighbors,param_clf__C,param_clf__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
74,0.007551,0.001408,0.001836,0.001316,20,100,newton-cg,"{'balance__k_neighbors': 20, 'clf__C': 100, 'c...",0.647887,0.59375,0.625,0.622951,0.605263,0.61897,0.018506,1


In [20]:
tuning_result[(tuning_result['param_balance__k_neighbors']==10)&(tuning_result['param_clf__C']==1)&(tuning_result['param_clf__solver']=='lbfgs')]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_balance__k_neighbors,param_clf__C,param_clf__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
42,0.008003,0.003683,0.001306,0.000356,10,1,lbfgs,"{'balance__k_neighbors': 10, 'clf__C': 1, 'clf...",0.631579,0.606061,0.6,0.580645,0.605263,0.60471,0.016281,51


# compare

In [21]:
smote = SMOTE()
model = LogisticRegression()
pipe_model = Pipeline(
    [
        ('balance', smote),
        ('clf', model)
    ]
)

In [22]:
pipe_model.fit(X_trainval, y_trainval)
y_pred = pipe_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.72      0.81       103
           1       0.52      0.86      0.65        37

    accuracy                           0.76       140
   macro avg       0.73      0.79      0.73       140
weighted avg       0.83      0.76      0.77       140



In [23]:
pipe_model_2 = grid_search.best_estimator_ 

In [24]:
pipe_model_2.fit(X_trainval, y_trainval)
y_pred_2 = pipe_model.predict(X_test)
print(classification_report(y_test, y_pred_2))

              precision    recall  f1-score   support

           0       0.94      0.72      0.81       103
           1       0.52      0.86      0.65        37

    accuracy                           0.76       140
   macro avg       0.73      0.79      0.73       140
weighted avg       0.83      0.76      0.77       140

