# Module 03

## Session 07 Imbalance Classification

Analyze data bankloan.csv
* build a logistic regression model:
    - target: default
    - features: employ, debtinc, creddebt, othdebt
* random state 2020, ratio 80:20
* model evaluation using f1 score and stratified 5-fold CV
* Logistic regression with SMOTE optimize the k neighbor optimize c, solver
* combine the result (before and after)

# Library

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import plot_roc_curve
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

# Data

In [3]:
bankloan = pd.read_csv('./datasets/bankloan.csv')
bankloan.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.65872,0.82128,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1


# Data Splitting

In [4]:
X = bankloan[['employ', 'debtinc', 'creddebt', 'othdebt']]
y = bankloan['default']

In [5]:
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=2020
)

# Model

In [8]:
smote = SMOTE()
model = LogisticRegression()
pipe_model = Pipeline(
    [
        ('balance', smote),
        ('clf', model)
    ]
)

skf = StratifiedKFold(n_splits=5)

params = {
    'balance__k_neighbors':[2,5,10,15,20],
    'clf__C':[100,10,1,0.1,0.01, 0.01],
    'clf__solver':['lnfgs', 'liblinear', 'newton-cg']
}

grid_search = GridSearchCV(
    pipe_model,
    param_grid=params,
    cv=skf,
    scoring='f1',
    n_jobs=-1
)

In [9]:
grid_search.fit(X_trainval, y_trainval)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('balance', SMOTE()),
                                       ('clf', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'balance__k_neighbors': [2, 5, 10, 15, 20],
                         'clf__C': [100, 10, 1, 0.1, 0.01, 0.01],
                         'clf__solver': ['lnfgs', 'liblinear', 'newton-cg']},
             scoring='f1')