과제 
1. RandomForest 파라미터 조정
2. StratifiedKFold 적용
3. XGBoost
4. Cost-sensitive learning 적용 (https://dining-developer.tistory.com/27)

# 1. Random Forest 파라미터 조정

In [None]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators':[50, 100, 150],
    'criterion':['gini', 'entropy'],
    'max_depth':[6, 8, 10, 12],
    'min_samples_leaf':[8, 12, 18],
    'min_samples_split':[8, 16, 20]
}

rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

In [39]:
pred = grid_cv.best_estimator_.predict(X_test)
accuracy_score(y_test, pred)

0.87265625

In [98]:
grid_cv.best_params_

{'criterion': 'entropy',
 'max_depth': 12,
 'min_samples_leaf': 8,
 'min_samples_split': 20,
 'n_estimators': 50}

In [53]:
rf_cfm = confusion_matrix(y_test, pred)
rf_cfm

array([[10879,    60],
       [ 1570,   291]])

In [112]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.87      0.99      0.93     10939
           1       0.83      0.16      0.26      1861

    accuracy                           0.87     12800
   macro avg       0.85      0.58      0.60     12800
weighted avg       0.87      0.87      0.83     12800



# 2. StratifiedKFold 적용

In [40]:
from sklearn.model_selection import StratifiedKFold

rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv_skf = GridSearchCV(rf_clf, param_grid=params,
                              cv=StratifiedKFold(n_splits=5),
                              n_jobs=-1)
grid_cv_skf.fit(X_train, y_train)
pred_skf = grid_cv_skf.best_estimator_.predict(X_test)
accuracy_score(y_test, pred_skf)

0.873515625

In [54]:
rf_skf_cfm = confusion_matrix(y_test, pred_skf)
rf_skf_cfm

array([[10881,    58],
       [ 1561,   300]])

In [114]:
print(classification_report(y_test, pred_skf))

              precision    recall  f1-score   support

           0       0.87      0.99      0.93     10939
           1       0.84      0.16      0.27      1861

    accuracy                           0.87     12800
   macro avg       0.86      0.58      0.60     12800
weighted avg       0.87      0.87      0.83     12800



# 3. XGBoost

In [47]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
evals = [(X_test, y_test)]

xgb_clf.fit(X_train, y_train, early_stopping_rounds=400, eval_set=evals,
           eval_metric="logloss", verbose=True)

xgb_pred = xgb_clf.predict(X_test)
xgb_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]
xgb_accuracy = accuracy_score(y_test, xgb_pred)
xgb_accuracy

[0]	validation_0-logloss:0.64053
[1]	validation_0-logloss:0.59756
[2]	validation_0-logloss:0.56163
[3]	validation_0-logloss:0.53150
[4]	validation_0-logloss:0.50620
[5]	validation_0-logloss:0.48457
[6]	validation_0-logloss:0.46616
[7]	validation_0-logloss:0.45050
[8]	validation_0-logloss:0.43694
[9]	validation_0-logloss:0.42528
[10]	validation_0-logloss:0.41516
[11]	validation_0-logloss:0.40642
[12]	validation_0-logloss:0.39867
[13]	validation_0-logloss:0.39195
[14]	validation_0-logloss:0.38618
[15]	validation_0-logloss:0.38111
[16]	validation_0-logloss:0.37652
[17]	validation_0-logloss:0.37260
[18]	validation_0-logloss:0.36902
[19]	validation_0-logloss:0.36589
[20]	validation_0-logloss:0.36306
[21]	validation_0-logloss:0.36075
[22]	validation_0-logloss:0.35853
[23]	validation_0-logloss:0.35679
[24]	validation_0-logloss:0.35499
[25]	validation_0-logloss:0.35348
[26]	validation_0-logloss:0.35218
[27]	validation_0-logloss:0.35069
[28]	validation_0-logloss:0.34940
[29]	validation_0-loglos

[238]	validation_0-logloss:0.32124
[239]	validation_0-logloss:0.32115
[240]	validation_0-logloss:0.32110
[241]	validation_0-logloss:0.32107
[242]	validation_0-logloss:0.32106
[243]	validation_0-logloss:0.32105
[244]	validation_0-logloss:0.32103
[245]	validation_0-logloss:0.32102
[246]	validation_0-logloss:0.32099
[247]	validation_0-logloss:0.32100
[248]	validation_0-logloss:0.32099
[249]	validation_0-logloss:0.32090
[250]	validation_0-logloss:0.32080
[251]	validation_0-logloss:0.32080
[252]	validation_0-logloss:0.32079
[253]	validation_0-logloss:0.32079
[254]	validation_0-logloss:0.32068
[255]	validation_0-logloss:0.32064
[256]	validation_0-logloss:0.32063
[257]	validation_0-logloss:0.32062
[258]	validation_0-logloss:0.32064
[259]	validation_0-logloss:0.32065
[260]	validation_0-logloss:0.32066
[261]	validation_0-logloss:0.32065
[262]	validation_0-logloss:0.32062
[263]	validation_0-logloss:0.32056
[264]	validation_0-logloss:0.32056
[265]	validation_0-logloss:0.32054
[266]	validation_0-l

0.8803125

In [49]:
xgb_accuracy

0.8803125

In [66]:
xgb_cfm = confusion_matrix(y_test, xgb_pred)
xgb_cfm

array([[10848,    91],
       [ 1441,   420]])

In [115]:
print(classification_report(y_test, xgb_pred))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93     10939
           1       0.82      0.23      0.35      1861

    accuracy                           0.88     12800
   macro avg       0.85      0.61      0.64     12800
weighted avg       0.87      0.88      0.85     12800



# 4. Cost-sensitive learning

### Weight 1
from `y_test`

In [71]:
weight1 = {0: y_test.value_counts()[1]/y_test.value_counts()[0],
          1: y_test.value_counts()[0]/y_test.value_counts()[1]}
weight1

{0: 0.17012523996709022, 1: 5.878022568511553}

### Weight 2
from confusion matrix
- 1을 더 잘 맞추도록
- 0은 덜 신경쓰도록

In [59]:
total_cfm = rf_cfm + rf_skf_cfm + xgb_cfm
total_cfm

array([[32608,   209],
       [ 4572,  1011]])

In [61]:
t_tn, t_fp, t_fn, t_tp = total_cfm.ravel()

In [72]:
t_tn, t_fp, t_fn, t_tp

(32608, 209, 4572, 1011)

In [73]:
weight2 = {0: t_fp/t_fn,
          1: t_fn/t_fp}
weight2

{0: 0.045713035870516186, 1: 21.875598086124402}

### Weight3
weight1 + weight2

In [82]:
weight3 = {i: v1+v2 for i, (v1,v2) in enumerate(zip(weight1.values(), weight2.values()))}
weight3

{0: 0.2158382758376064, 1: 27.753620654635956}

### Weight4
weight1 * weight2

In [83]:
weight4 = {i: v1*v2 for i, (v1,v2) in enumerate(zip(weight1.values(), weight2.values()))}
weight4

{0: 0.007776941197095769, 1: 128.58525924992736}

In [88]:
weights = [weight1, weight2, weight3, weight4]

### RandomForest GridSearch + weights

In [93]:
#from sklearn.model_selection import StratifiedKFold

params = {
    'n_estimators':[50, 100, 150],
    'criterion':['gini', 'entropy'],
    'max_depth':[6, 8, 10, 12],
    'min_samples_leaf':[8, 12, 18],
    'min_samples_split':[8, 16, 20],
    'class_weight':weights
}

rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv_skf_w = GridSearchCV(rf_clf, param_grid=params,
                              cv=StratifiedKFold(n_splits=5),
                              n_jobs=-1)
grid_cv_skf_w.fit(X_train, y_train)
pred_skf_w = grid_cv_skf_w.best_estimator_.predict(X_test)
accuracy_score(y_test, pred_skf_w)

0.374921875

In [95]:
grid_cv_skf_w.best_params_

{'class_weight': {0: 0.17012523996709022, 1: 5.878022568511553},
 'criterion': 'gini',
 'max_depth': 12,
 'min_samples_leaf': 8,
 'min_samples_split': 8,
 'n_estimators': 100}

In [94]:
rf_skf_w_cfm = confusion_matrix(y_test, pred_skf_w)
rf_skf_w_cfm

array([[2975, 7964],
       [  37, 1824]])

In [116]:
print(classification_report(y_test, pred_skf_w))

              precision    recall  f1-score   support

           0       0.99      0.27      0.43     10939
           1       0.19      0.98      0.31      1861

    accuracy                           0.37     12800
   macro avg       0.59      0.63      0.37     12800
weighted avg       0.87      0.37      0.41     12800



### 2차

In [102]:
#from sklearn.model_selection import StratifiedKFold

params = {
    'n_estimators':[50, 100, 150],
    'criterion':['gini', 'entropy'],
    'max_depth':[6, 8, 10, 12],
    'min_samples_leaf':[8, 12, 18],
    'min_samples_split':[8, 16, 20],
    'class_weight':[{0:1, 1:5},
                   {0:1, 1:10},
                   {0:1, 1:15}]
}

rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv_skf_w2 = GridSearchCV(rf_clf, param_grid=params,
                              cv=StratifiedKFold(n_splits=5),
                              n_jobs=-1)
grid_cv_skf_w2.fit(X_train, y_train)
pred_skf_w2 = grid_cv_skf_w2.best_estimator_.predict(X_test)
accuracy_score(y_test, pred_skf_w2)

0.794375

In [104]:
grid_cv_skf_w2.best_params_

{'class_weight': {0: 1, 1: 5},
 'criterion': 'gini',
 'max_depth': 12,
 'min_samples_leaf': 8,
 'min_samples_split': 8,
 'n_estimators': 150}

In [105]:
rf_skf_w_cfm2 = confusion_matrix(y_test, pred_skf_w2)
rf_skf_w_cfm2

array([[9186, 1753],
       [ 879,  982]])

In [118]:
print(classification_report(y_test, pred_skf_w2))

              precision    recall  f1-score   support

           0       0.91      0.84      0.87     10939
           1       0.36      0.53      0.43      1861

    accuracy                           0.79     12800
   macro avg       0.64      0.68      0.65     12800
weighted avg       0.83      0.79      0.81     12800



### 3차

In [106]:
#from sklearn.model_selection import StratifiedKFold

params = {
    'n_estimators':[50, 100, 150],
    'criterion':['gini', 'entropy'],
    'max_depth':[6, 8, 10, 12],
    'min_samples_leaf':[8, 12, 18],
    'min_samples_split':[8, 16, 20],
    'class_weight':[{0:1, 1:1.5},
                   {0:1, 1:3},
                   {0:1, 1:4.5}]
}

rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv_skf_w3 = GridSearchCV(rf_clf, param_grid=params,
                              cv=StratifiedKFold(n_splits=5),
                              n_jobs=-1)
grid_cv_skf_w3.fit(X_train, y_train)
pred_skf_w3 = grid_cv_skf_w3.best_estimator_.predict(X_test)
accuracy_score(y_test, pred_skf_w3)

0.875859375

In [111]:
grid_cv_skf_w3.best_params_

{'class_weight': {0: 1, 1: 1.5},
 'criterion': 'gini',
 'max_depth': 12,
 'min_samples_leaf': 8,
 'min_samples_split': 20,
 'n_estimators': 100}

In [107]:
rf_skf_w_cfm3 = confusion_matrix(y_test, pred_skf_w3)
rf_skf_w_cfm3

array([[10784,   155],
       [ 1434,   427]])

In [108]:
rf_cfm

array([[10879,    60],
       [ 1570,   291]])

In [119]:
print(classification_report(y_test, pred_skf_w3))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93     10939
           1       0.73      0.23      0.35      1861

    accuracy                           0.88     12800
   macro avg       0.81      0.61      0.64     12800
weighted avg       0.86      0.88      0.85     12800



# Cost-sensitive learning 적용 결과
> 모델 일반화 성능과 세부 편향성(ex. f1-score) 간의 트레이드 오프 발생