# 할 일
- 하이퍼 파라미터 튜닝
    - 각 모델 document 보면서 찾아보기
- 임계값(threshold) 변경
    - 임계값을 바꿔 가면서 최적값 찾기

# 기본 준비

In [1]:
import pandas as pd
import numpy as np
import copy
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

%matplotlib inline

from sklearn.model_selection import train_test_split
# from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import Binarizer
# from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

In [2]:
# Show full cell contents and every row when a DataFrame is displayed
# (the "params" column built later holds long strings).
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)

In [3]:
# Print the confusion matrix and the evaluation metrics.
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a set of predictions.

    Args:
        y_test: True labels.
        pred: Predicted labels; handed unchanged to the sklearn metric functions.
        pred_proba: Optional scores handed to ``roc_auc_score``. When it is
            ``None`` the AUC is reported as 0.

    Returns:
        None. Everything is written to stdout.
    """
    matrix = confusion_matrix(y_test, pred)
    label_metrics = (accuracy_score, precision_score, recall_score, f1_score)
    scores = [metric(y_test, pred) for metric in label_metrics]
    # 0 is a placeholder meaning "AUC not computed", not a real score.
    auc = 0 if pred_proba is None else roc_auc_score(y_test, pred_proba)

    print('오차 행렬')
    print(matrix)

    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}, AUC:{4:.4f}'.format(*scores, auc))


# Print the confusion matrix and metrics for each candidate decision threshold.
def get_eval_by_threshold(y_test, pred, pred_proba_c1, thresholds):
    """Evaluate the predictions obtained by binarizing scores at several thresholds.

    Args:
        y_test: True labels.
        pred: Unused; kept in the signature so existing callers keep working.
        pred_proba_c1: Scores for the positive class. Any array-like is
            accepted (ndarray, pandas Series, list).
        thresholds: Iterable of threshold values to try.

    Returns:
        None. For each threshold the value is printed, followed by the report
        produced by ``get_clf_eval``.
    """
    # np.asarray allows a pandas Series or a plain list, which have no
    # ndarray-style reshape; Binarizer needs a 2-D column.
    proba_column = np.asarray(pred_proba_c1).reshape(-1, 1)
    for custom_threshold in thresholds:
        binar = Binarizer(threshold=custom_threshold).fit(proba_column)
        custom_predict = binar.transform(proba_column)
        print("분류 임계값:", custom_threshold)
        get_clf_eval(y_test, custom_predict, proba_column)
    

# Plot precision and recall as a function of the decision threshold.
def precision_recall_curve_plot(y_test, pred_proba_c1):
    """Draw precision and recall curves over the thresholds of the PR curve.

    Args:
        y_test: True labels.
        pred_proba_c1: Scores for the positive class.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba_c1)
    n_thresholds = len(thresholds)

    plt.figure(figsize=(8, 6))

    # Slice both curves to the number of thresholds so x and y lengths match.
    plt.plot(thresholds, precisions[:n_thresholds], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[:n_thresholds], label='recall')

    # Re-tick the x axis in steps of 0.1 across the currently plotted range.
    x_min, x_max = plt.xlim()
    tick_positions = np.arange(x_min, x_max, 0.1)
    plt.xticks(np.round(tick_positions, 2))

    plt.xlabel('Threshold value')
    plt.ylabel('Precision and Recall value')
    plt.legend()
    plt.grid()
    plt.show()

In [4]:
def make_eval_df(model):
    """Fit ``model`` and return its test-set metrics as a one-row DataFrame.

    Relies on the notebook-level globals ``X_train``, ``y_train``, ``X_test``
    and ``y_test`` being defined before the call.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``get_params``.
            ``predict_proba`` is used for the AUC when it is available.

    Returns:
        pandas.DataFrame with a single row and the columns model, params,
        confusion, accuracy, precision, recall, f1 and roc_auc. ``roc_auc`` is
        0 when the AUC could not be computed.
    """
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    # Flatten the printed matrix onto one line so it fits in a single cell.
    confusion = str(confusion_matrix(y_test, pred)).replace("\n", "").replace("[ ", "[")
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    try:
        pred_proba = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, pred_proba)
    except (AttributeError, ValueError, IndexError):
        # Best effort: the model has no predict_proba (AttributeError), it
        # produced no positive-class column (IndexError), or the AUC is
        # undefined for these labels (ValueError). A bare ``except`` would also
        # swallow KeyboardInterrupt and silently record 0 on Ctrl-C.
        roc_auc = 0

    return pd.DataFrame([{"model": str(model).replace("\n", ""),
                          "params": str(model.get_params()),
                          "confusion": confusion,
                          "accuracy": accuracy,
                          "precision": precision,
                          "recall": recall,
                          "f1": f1,
                          "roc_auc": roc_auc}])

# Column order of the result tables; matches the keys returned by make_eval_df.
col_list = ["model", "params", "confusion", "accuracy", "precision", "recall", "f1", "roc_auc"]

## 데이터 준비

In [6]:
# Paths for Google Colab.
# NOTE(review): the jupyter cell assigns the same two names, so whichever
# cell runs last wins - run only the one matching the environment.
data_path = "/content/drive/MyDrive/multicampus/data/" 
save_path = "/content/drive/MyDrive/multicampus/data/"

In [5]:
# Paths for a local Jupyter session: input data is read from ../data/,
# result CSVs are written to the current directory.
data_path = "../data/"
save_path = "./"

In [6]:
def prepare_data_smote(test_size=.2, random_state=None):
    """Load the BRFSS data, split it, and oversample the training split with SMOTE.

    Reads ``brfss.csv`` from the notebook-level ``data_path``. The target is
    the ``HEARTDISEASE`` column; ``HEARTDISEASE`` and ``ALCOHOL`` are dropped
    from the features.

    Args:
        test_size: Passed to ``train_test_split``. Defaults to 0.2, the value
            that used to be hard-coded.
        random_state: Seed passed to both ``train_test_split`` and ``SMOTE``.
            The default ``None`` keeps the previous unseeded behaviour; pass an
            int to make the split and the resampling reproducible.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where only the training
        split has been resampled.
    """
    brfss = pd.read_csv(data_path + "brfss.csv")
    X = brfss.drop(["HEARTDISEASE", "ALCOHOL"], axis=1)
    y = brfss["HEARTDISEASE"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Resample the training split only; the test split is left untouched.
    smote = SMOTE(random_state=random_state)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    print(y_train.value_counts())
    print(y_test.value_counts())

    return X_train, X_test, y_train, y_test

## RandomForest 

### 모델링
- 하이퍼 파라미터
    - **criterion**
        - "gini", "entropy", "log_loss"
    - **max_depth**
        - 5, 10, 20
    - **max_features**
        - "sqrt", "log2", None
    - **min_impurity_decrease**
        - .0, .001, .00001
    - **n_estimators**
        - 50, 100, 150

---

- 선택한 모델
    1. ```RandomForestClassifier(criterion="gini",     max_depth=5, max_features=None,   min_impurity_decrease=0.0,     n_estimators=100)```
    2. ```RandomForestClassifier(criterion="entropy",  max_depth=5, max_features=None,   min_impurity_decrease=0.0,     n_estimators=150)```
    3. ```RandomForestClassifier(criterion="gini",     max_depth=5, max_features="sqrt", min_impurity_decrease=0.0,     n_estimators=100)```
    4. ```RandomForestClassifier(criterion="gini",     max_depth=5, max_features="sqrt", min_impurity_decrease=0.00001, n_estimators=50)```
    5. ```RandomForestClassifier(criterion="log_loss", max_depth=5, max_features="sqrt", min_impurity_decrease=0.0,     n_estimators=100)```
    6. ```RandomForestClassifier(criterion="log_loss", max_depth=5, max_features="sqrt", min_impurity_decrease=0.0,     n_estimators=50)```
- 선택한 기준
    - ```precision > .123``` and ```roc_auc > .771```

In [14]:
from itertools import product

from sklearn.ensemble import RandomForestClassifier

# Candidate values for each RandomForest hyperparameter.
n_estimators_list = [50, 100, 150]
criterion_list = ["gini", "entropy", "log_loss"]
max_features_list = ["sqrt", "log2", None]
max_depth_list = [5, 10, 20]
min_impurity_decrease_list = [.0, .001, .00001]

X_train, X_test, y_train, y_test = prepare_data_smote()

# product() visits the combinations in the same order as five nested loops
# (n_estimators outermost, min_impurity_decrease innermost), so row indices
# in the result table are unchanged.
param_grid = list(product(n_estimators_list,
                          criterion_list,
                          max_features_list,
                          max_depth_list,
                          min_impurity_decrease_list))

# Collect the one-row frames and concatenate once at the end; concatenating
# inside the loop copies the whole accumulated table on every iteration.
eval_frames = []
for n_estimators, criterion, max_features, max_depth, min_impurity_decrease in tqdm(
        param_grid, desc="RandomForest grid"):
    rf_model = RandomForestClassifier(n_estimators=n_estimators,
                                      criterion=criterion,
                                      max_features=max_features,
                                      max_depth=max_depth,
                                      min_impurity_decrease=min_impurity_decrease)
    eval_frames.append(make_eval_df(rf_model))

# pd.concat raises on an empty list, so fall back to an empty table with the
# expected columns when a candidate list is empty.
rf_df = (pd.concat(eval_frames, axis=0, ignore_index=True)
         if eval_frames else pd.DataFrame(columns=col_list))

0.0    286764
1.0    286764
Name: HEARTDISEASE, dtype: int64
0.0    71660
1.0     4070
Name: HEARTDISEASE, dtype: int64


n_estimators:   0%|          | 0/3 [00:00<?, ?it/s]

criterion:   0%|          | 0/3 [00:00<?, ?it/s]

max_features:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_features:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_features:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

criterion:   0%|          | 0/3 [00:00<?, ?it/s]

max_features:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_features:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_features:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

criterion:   0%|          | 0/3 [00:00<?, ?it/s]

max_features:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_features:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_features:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

max_depth:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

min_impurity_decrease:   0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
# Persist the full grid-search table (without the DataFrame index) under save_path.
rf_df.to_csv(save_path + "RandomForest_1.csv", index=False)

### 모델 선택

In [16]:
# Reload the saved grid-search table, order it by precision (best first),
# and list the distinct precision values that were reached.
rf_result = (
    pd.read_csv(save_path + "RandomForest_1.csv")
    .sort_values("precision", ascending=False)
)
rf_result["precision"].unique()

array([0.12536913, 0.12530691, 0.12465169, 0.12456491, 0.12409514,
       0.12147657, 0.1214478 , 0.11337919, 0.11334417, 0.11301917,
       0.11293188, 0.11291668, 0.11289095, 0.11287574, 0.11284414,
       0.11283835, 0.11282553, 0.11276457, 0.11271226, 0.11256874,
       0.11252829, 0.11243581, 0.11243305, 0.1123761 , 0.11222703,
       0.11219268, 0.11219233, 0.11216757, 0.11213314, 0.11199744,
       0.11199715, 0.11198575, 0.11198547, 0.11192647, 0.11190187,
       0.11121761, 0.11121376, 0.11120595, 0.11115056, 0.11107174,
       0.1110246 , 0.11101689, 0.11100882, 0.11095014, 0.11088788,
       0.1108721 , 0.11082964, 0.11082538, 0.11080957, 0.11073446,
       0.11065776, 0.11061043, 0.11058857, 0.11046145, 0.11012862,
       0.1100364 , 0.10969193, 0.10967921, 0.10963855, 0.10962254,
       0.10958386, 0.109435  , 0.10941467, 0.10941293, 0.1092669 ,
       0.10922877, 0.10919129, 0.10910456, 0.10909892, 0.10875548,
       0.10857433, 0.10853864, 0.10853737, 0.10835778])

In [23]:
# Keep only the runs that clear both selection cut-offs, then show how many remain.
rf_temp = rf_result.loc[
    rf_result["precision"].gt(.123) & rf_result["roc_auc"].gt(.771)
]
len(rf_temp)

6

In [24]:
# Display the selected runs.
rf_temp

Unnamed: 0,model,params,confusion,accuracy,precision,recall,f1,roc_auc
99,"RandomForestClassifier(max_depth=5, max_features=None)","{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}",[[52112 19548] [1268 2802]],0.725129,0.125369,0.688452,0.212112,0.771558
207,"RandomForestClassifier(criterion='entropy', max_depth=5, max_features=None, n_estimators=150)","{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 150, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}",[[52112 19548] [1268 2802]],0.725129,0.125369,0.688452,0.212112,0.771485
81,RandomForestClassifier(max_depth=5),"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}",[[51555 20105] [1207 2863]],0.718579,0.124652,0.70344,0.211776,0.777142
2,"RandomForestClassifier(max_depth=5, min_impurity_decrease=1e-05, n_estimators=50)","{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 1e-05, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}",[[51555 20105] [1207 2863]],0.718579,0.124652,0.70344,0.211776,0.776938
135,"RandomForestClassifier(criterion='log_loss', max_depth=5)","{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'log_loss', 'max_depth': 5, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}",[[51539 20121] [1207 2863]],0.718368,0.124565,0.70344,0.211651,0.777004
54,"RandomForestClassifier(criterion='log_loss', max_depth=5, n_estimators=50)","{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'log_loss', 'max_depth': 5, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}",[[51332 20328] [1190 2880]],0.715859,0.124095,0.707617,0.211159,0.776555


In [25]:
# Persist the selected runs (without the DataFrame index) under save_path.
rf_temp.to_csv(save_path + "RandomForest_final.csv", index=False)