## Python Exercise : Combine Cross Validaition with Balancing Method

1. Analyzee data bankloan.csv
    - Build a logistics regression model
        - Target : default
        - Features : employ, debtinc, creddebt, othdebt
    - Random state 2020, ratio 80%:20%
    - Modeling evaluate by f1 score and use stratified 5-fold CV :
        - Penalized logistic regression
        - Logistic regression with SMOTE
    - Which method is better

In [1]:
# pip install imblearn

In [2]:
import pandas as pd
import numpy as np

# preprocess
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import category_encoders as ce
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# model
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score,train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, f1_score, plot_roc_curve
from imblearn.over_sampling import SMOTE

import seaborn as sns
import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings('ignore')

In [3]:
bank_loan_df = pd.read_csv('data/bankloan.csv')
bank_loan_df

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.658720,0.821280,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1
...,...,...,...,...,...,...,...,...,...
695,36,2,6,15,27,4.6,0.262062,0.979938,1
696,29,2,6,4,21,11.5,0.369495,2.045505,0
697,33,1,15,3,32,7.6,0.491264,1.940736,0
698,45,1,19,22,77,8.4,2.302608,4.165392,0


In [4]:
# probailitas default
len(bank_loan_df[bank_loan_df['default']==1])/700

0.26142857142857145

In [5]:
# prob non default
len(bank_loan_df[bank_loan_df['default']==0])/700

0.7385714285714285

### Data Splitting

In [6]:
x = bank_loan_df[['employ', 'debtinc', 'creddebt', 'othdebt']]
y = bank_loan_df['default']

x_trainval, x_test, y_trainval, y_test = train_test_split(
    x,
    y,
    stratify = y,
    test_size = 0.2,
    random_state = 2020
)

### Penalized Logistic Regression

In [7]:
logreg = LogisticRegression(class_weight='balanced')
skf = StratifiedKFold(n_splits=5)
logreg_cv = cross_val_score(logreg,x_trainval,y_trainval, cv=skf, scoring='f1')

In [8]:
print(logreg_cv)
print(logreg_cv.mean())
print(logreg_cv.std())

[0.63888889 0.58823529 0.61728395 0.58064516 0.60526316]
0.6060632905617759
0.020822978090423456


### Logistic Regression with SMOTE

In [9]:
smote = SMOTE()
logreg = LogisticRegression()

pipe_model = Pipeline([
    ('balance',smote),
    ('clf', logreg)
])

skf = StratifiedKFold(n_splits=5)

In [10]:
logreg_smote_cv = cross_val_score(pipe_model, x_trainval, y_trainval, cv=skf, scoring='f1')

In [11]:
print(logreg_smote_cv)
print(logreg_smote_cv.mean())
print(logreg_smote_cv.std())

[0.64864865 0.53731343 0.61728395 0.61290323 0.62162162]
0.6075541759059653
0.037265779360638726


### Final Model Performance (Test Set)

In [12]:
logreg = LogisticRegression(class_weight='balanced')
logreg.fit(x_trainval, y_trainval)

LogisticRegression(class_weight='balanced')

In [13]:
y_pred = logreg.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.68      0.78       103
           1       0.48      0.84      0.61        37

    accuracy                           0.72       140
   macro avg       0.70      0.76      0.70       140
weighted avg       0.81      0.72      0.74       140



## Python Exercise : Combine Hyperparameter Tuning with Balancing Method

1. Analyzee data bankloan.csv
    - Build a logistics regression model
        - Target : default
        - Features : employ, debtinc, creddebt, othdebt
    - Random state 2020, ratio 80%:20%
    - Modeling evaluate by f1 score and use stratified 5-fold CV :
        - Logistic regression with SMOTE optimize the k neighbour
        - Optimize c solver
    - Which method is better

In [14]:
bank_loan_df = pd.read_csv('data/bankloan.csv')
bank_loan_df

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.658720,0.821280,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1
...,...,...,...,...,...,...,...,...,...
695,36,2,6,15,27,4.6,0.262062,0.979938,1
696,29,2,6,4,21,11.5,0.369495,2.045505,0
697,33,1,15,3,32,7.6,0.491264,1.940736,0
698,45,1,19,22,77,8.4,2.302608,4.165392,0


## Splitting Data

In [15]:
x = bank_loan_df[['employ','debtinc','creddebt','othdebt']]
y = bank_loan_df['default']

In [16]:
x_trainval, x_test, y_trainval, y_train = train_test_split(
    x,
    y,
    stratify = y,
    test_size= 0.2,
    random_state=2020
)

### Modeling

In [17]:
from sklearn.neighbors import KNeighborsRegressor 

In [18]:
smote = SMOTE()
logreg = LogisticRegression()

pipe_model = Pipeline([
    ('balance',smote),
    ('clf',logreg)
])
skf = StratifiedKFold(n_splits=5)

hyperparameter = {
    'balance__k_neighbors': [2,5,10,15,20],
    'clf__C' : [100,10,1,0.1,0.01,0.001],
    'clf__solver': ['lbfgs','liblinear','newton-cg']
}

grid_search = GridSearchCV(
    pipe_model,
    param_grid=hyperparameter,
    scoring='f1',
    cv=skf,
    n_jobs=-1
)

In [19]:
grid_search.fit(x_trainval,y_trainval)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('balance', SMOTE()),
                                       ('clf', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'balance__k_neighbors': [2, 5, 10, 15, 20],
                         'clf__C': [100, 10, 1, 0.1, 0.01, 0.001],
                         'clf__solver': ['lbfgs', 'liblinear', 'newton-cg']},
             scoring='f1')

In [20]:
grid_search.best_params_

{'balance__k_neighbors': 10, 'clf__C': 0.1, 'clf__solver': 'newton-cg'}

In [21]:
grid_search.best_score_

0.62212376133249

In [22]:
hasil_cv = pd.DataFrame(grid_search.cv_results_)
hasil_cv[hasil_cv['rank_test_score']==1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_balance__k_neighbors,param_clf__C,param_clf__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
47,0.076298,0.009772,0.010964,0.00109,10,0.1,newton-cg,"{'balance__k_neighbors': 10, 'clf__C': 0.1, 'c...",0.638889,0.588235,0.625,0.645161,0.613333,0.622124,0.020228,1


In [23]:
hasil_cv[(hasil_cv['param_balance__k_neighbors']==5)&(hasil_cv['param_clf__C']==1)&(hasil_cv['param_clf__solver']=='lbfgs')]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_balance__k_neighbors,param_clf__C,param_clf__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
24,0.055811,0.002034,0.010853,0.002725,5,1,lbfgs,"{'balance__k_neighbors': 5, 'clf__C': 1, 'clf_...",0.648649,0.567164,0.609756,0.580645,0.613333,0.603909,0.028341,49


### Compare before after tunning

In [24]:
smote = SMOTE()
logreg = LogisticRegression()

pipe_model = Pipeline([
    ('balance',smote),
    ('clf',logreg)
])

In [25]:
pipe_model.fit(x_trainval, y_trainval)
y_pred = pipe_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.69      0.79       103
           1       0.50      0.86      0.63        37

    accuracy                           0.74       140
   macro avg       0.72      0.78      0.71       140
weighted avg       0.82      0.74      0.75       140



In [26]:
pipe_model=grid_search.best_estimator_

In [27]:
pipe_model.fit(x_trainval, y_trainval)
y_pred = pipe_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.71      0.81       103
           1       0.52      0.89      0.66        37

    accuracy                           0.76       140
   macro avg       0.74      0.80      0.74       140
weighted avg       0.84      0.76      0.77       140



### 