In [None]:
"""
* 주제 : 와인종류 분류하기
* 사용데이터 : 08_wine.csv


<분석방법>
 - 스케일링 : 모든 스케일러 방식 사용
 - 데이터분류 : 훈련:검증:테스트 = 6 : 2 : 2로 분류하여 사용
 - 튜닝 없이 전체 모델 훈련하여 ~ 평가까지..
 - 하이퍼파라메터 튜닝 후 전체 모델 훈련하여 ~ 평가까지..


<모델별 하이퍼파라메터 변수들>
 * 랜덤포레스트, 엑스트라트리, 그레디언트부스트
  - n_estimators = [50, 100]
    : 트리 갯수 지정(보통 50~1000 사이값 사용)
  - max_depth = [None, 10]
    : 트리가 뻗어나가는 최대깊이 지정(None은 제한없음, 보통 3~10 사이값 사용)
  - min_samples_split = [2, 5]
    : 노드를 분할하기 위한 최소한의 샘플 수 (보통 2~10 사이값 사용)
  - min_samples_leaf = [1, 2, 4]
    : 리프노드(결정노드)의 최소한의 샘플 수 (보통 1~10 사이값 사용)


 * 히스트그레디언트부스트
  - max_iter = [50, 100]
    : 트리 갯수 지정(보통 50~1000 사이값 사용)
  - max_depth = [None, 10]
    : 트리가 뻗어나가는 최대깊이 지정(None은 제한없음, 보통 3~10 사이값 사용)
  - min_samples_leaf = [1, 2, 4]
    : 리프노드(결정노드)의 최소한의 샘플 수 (보통 1~10 사이값 사용)


 * 엑스지부스트
  - n_estimators = [50, 100]
    : 트리 갯수 지정(보통 50~1000 사이값 사용)
  - max_depth = [None, 10]
    : 트리가 뻗어나가는 최대깊이 지정(None은 제한없음, 보통 3~10 사이값 사용)
  - min_child_weight = [1, 2, 4]
    : 리프노드(결정노드)의 최소한의 샘플 수 (보통 1~10 사이값 사용)


<튜닝 자동화 모델(GridSearchCV 클래스)에서 사용할 튜닝 속성>
 * scoring = 정확도 사용("accuracy")
    : 튜닝 자동화 모델(클래스)에 사용할 평가 방법 지정
    : 분류에서는 정확도를 이용

 * refit = 정확도 사용("accuracy")
    : 튜닝 자동화 모델(클래스)에 사용할 모델 선정 기준 지정
    : scoring에서 한가지만 사용되기에 생략가능


 * cv = 5
    : 튜닝 자동화 모델(클래스)에 사용할 "교차검증" 시 사용할 Fold 갯수
    : Fold(폴드) -> 모델 훈련시 사용할 훈련데이터를 내부적으로
                 -> 폴드의 갯수만큼 다시 분류(훈련:검증으로)하여 사용하는 방법


<해석>
 * 모든 처리 과정 각각에 대해서 해석 작성


<최종 결과>
 * 데이터프레임에 아래 항목 저장하여 비교 후 모델 선정 (튜닝 전과 후로 각각 진행)
  - 모델명, 훈련정확도, 검증정확도, (훈련-검증)정확도, 정밀도, 재현율, F1-Score


 * 모델 선정 후, 테스트 데이터로 최종 예측 진행 (최종 선정 모델로 한번 진행)


<개별 제출>
 - 구글드라이브에 제출
 - 제출 파일명 : 0조_이름_앙상블전체훈련및평가.ipynb
"""

In [47]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

plt.rc("font", family="Malgun Gothic")
plt.rcParams["axes.unicode_minus"] = False

from sklearn import set_config
set_config(display="text")

In [48]:
df_wine = pd.read_csv("./data/08_wine.csv")

In [49]:
X = np.array(df_wine.iloc[:,:3])
y = np.array(df_wine.iloc[:,-1])

In [50]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)


X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

(3898, 3) (3898,) (1299, 3) (1299,) (1300, 3) (1300,)


#  튜닝 없이 전체 모델 훈련하여 ~ 평가

In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier

In [52]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [80]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
scaler_list = [StandardScaler(), MinMaxScaler(), RobustScaler()]

df_model = pd.DataFrame()

model_list = [RandomForestClassifier(), ExtraTreesClassifier(), GradientBoostingClassifier(), HistGradientBoostingClassifier(), XGBClassifier()]
              
for ss in scaler_list :

    scaler_nm = ss.__class__.__name__

    # ss = StandardScaler()
    ss.fit(X_train)

    X_train_scaled = ss.transform(X_train)
    X_val_scaled   = ss.transform(X_val)
    X_test_scaled  = ss.transform(X_test)

    for m in model_list :
        print(f"========== [{m.__class__.__name__} / {scaler_nm}] ============")

        model = m

        model.fit(X_train_scaled, y_train)

        train_score = model.score(X_train_scaled, y_train)
        val_score   = model.score(X_val_scaled, y_val)
        test_score  = model.score(X_test_scaled, y_test)
        minus = train_score - val_score

        train_pred = model.predict(X_train_scaled)
        val_pred   = model.predict(X_val_scaled)
        test_pred  = model.predict(X_test_scaled)

        
        train_ace = accuracy_score(y_train, train_pred)
        train_pre = precision_score(y_train, train_pred)
        train_rec = recall_score(y_train, train_pred)
        train_f1 = f1_score(y_train, train_pred)

        val_ace = accuracy_score(y_val, val_pred)
        val_pre = precision_score(y_val, val_pred)
        val_rec = recall_score(y_val, val_pred)
        val_f1 = f1_score(y_val, val_pred)
     
        test_ace = accuracy_score(y_test, test_pred)
        test_pre = precision_score(y_test, test_pred)
        test_rec = recall_score(y_test, test_pred)
        test_f1 = f1_score(y_test, test_pred)

        ### 사용한 모델이름 추출
        model_name = model.__class__.__name__

        column_nm = ["모델명", "스케일러", "훈련", "검증", "테스트","훈련-검증",
                     "train_ace", "train_pre", "train_rec", "train_f1",
                     "val_ace", "val_pre", "val_rec", "val_f1",
                     "test_ace", "test_pre", "test_rec", "test_f1"]
        
        temp_list = [[model_name, scaler_nm, f"{train_score:.4f}", f"{val_score:.4f}", f"{test_score:.4f}", f"{minus:.4f}",
                      f"{train_ace:.4f}", f"{train_pre:.4f}", f"{train_rec:.4f}", f"{train_f1:.4f}",
                      f"{val_ace:.4f}", f"{val_pre:.4f}", f"{val_rec:.4f}", f"{val_f1:.4f}",
                      f"{test_ace:.4f}", f"{test_pre:.4f}", f"{test_rec:.4f}", f"{test_f1:.4f}"]]

        temp_df = pd.DataFrame(data=temp_list, columns=column_nm)
        df_model = pd.concat([df_model, temp_df], ignore_index=True)

df_model



Unnamed: 0,모델명,스케일러,훈련,검증,테스트,훈련-검증,train_ace,train_pre,train_rec,train_f1,val_ace,val_pre,val_rec,val_f1,test_ace,test_pre,test_rec,test_f1
0,RandomForestClassifier,StandardScaler,0.9974,0.8814,0.8708,0.116,0.9974,0.9973,0.9993,0.9983,0.8814,0.9216,0.9197,0.9206,0.8708,0.9151,0.9074,0.9112
1,ExtraTreesClassifier,StandardScaler,0.9974,0.8822,0.8685,0.1152,0.9974,0.9997,0.997,0.9983,0.8822,0.9234,0.9186,0.921,0.8685,0.9011,0.9211,0.911
2,GradientBoostingClassifier,StandardScaler,0.8917,0.8645,0.8585,0.0272,0.8917,0.9226,0.9368,0.9297,0.8645,0.9119,0.9063,0.9091,0.8585,0.9006,0.9063,0.9035
3,HistGradientBoostingClassifier,StandardScaler,0.9402,0.8768,0.8569,0.0634,0.9402,0.9635,0.958,0.9608,0.8768,0.9282,0.9053,0.9166,0.8569,0.909,0.8937,0.9013
4,XGBClassifier,StandardScaler,0.9618,0.873,0.8538,0.0888,0.9618,0.9755,0.9745,0.975,0.873,0.918,0.9114,0.9147,0.8538,0.906,0.8926,0.8993
5,RandomForestClassifier,MinMaxScaler,0.9974,0.883,0.8738,0.1144,0.9974,0.9983,0.9983,0.9983,0.883,0.9235,0.9197,0.9216,0.8738,0.9163,0.9105,0.9134
6,ExtraTreesClassifier,MinMaxScaler,0.9974,0.8853,0.8708,0.1121,0.9974,0.9997,0.997,0.9983,0.8853,0.9211,0.9258,0.9235,0.8708,0.9031,0.9221,0.9125
7,GradientBoostingClassifier,MinMaxScaler,0.8917,0.8645,0.8585,0.0272,0.8917,0.9226,0.9368,0.9297,0.8645,0.9119,0.9063,0.9091,0.8585,0.9006,0.9063,0.9035
8,HistGradientBoostingClassifier,MinMaxScaler,0.9402,0.8768,0.8569,0.0634,0.9402,0.9635,0.958,0.9608,0.8768,0.9282,0.9053,0.9166,0.8569,0.909,0.8937,0.9013
9,XGBClassifier,MinMaxScaler,0.9618,0.873,0.8538,0.0888,0.9618,0.9755,0.9745,0.975,0.873,0.918,0.9114,0.9147,0.8538,0.906,0.8926,0.8993


In [54]:
"""
 - 과소 및 과대 적합이 일어나지 않아야 함
 - 재현율과 f1-score가 모두 높으면 우수한 모델로 평가할 수 있음
 - 재현율이 현저히 낮은 경우에는 모델 선정 시 고민 필요
 - 일반화되고, 정확도 높고, 재현율 높고, f1-score가 높은 모델을 선정
   (여러가지 종합적으로 고려하여 판단해야 함)
"""

'\n - 과소 및 과대 적합이 일어나지 않아야 함\n - 재현율과 f1-score가 모두 높으면 우수한 모델로 평가할 수 있음\n - 재현율이 현저히 낮은 경우에는 모델 선정 시 고민 필요\n - 일반화되고, 정확도 높고, 재현율 높고, f1-score가 높은 모델을 선정\n   (여러가지 종합적으로 고려하여 판단해야 함)\n'

## 과소 및 과대 적합이 일어나지 않아야 함

In [81]:
df_model['훈련-검증'] = pd.to_numeric(df_model['훈련-검증'], errors='coerce')
df_filtered = df_model[df_model['훈련-검증']>0.01]

In [82]:
df_filtered = df_filtered[df_filtered['훈련-검증'] < 0.09]
df_filtered.head(3)

Unnamed: 0,모델명,스케일러,훈련,검증,테스트,훈련-검증,train_ace,train_pre,train_rec,train_f1,val_ace,val_pre,val_rec,val_f1,test_ace,test_pre,test_rec,test_f1
2,GradientBoostingClassifier,StandardScaler,0.8917,0.8645,0.8585,0.0272,0.8917,0.9226,0.9368,0.9297,0.8645,0.9119,0.9063,0.9091,0.8585,0.9006,0.9063,0.9035
3,HistGradientBoostingClassifier,StandardScaler,0.9402,0.8768,0.8569,0.0634,0.9402,0.9635,0.958,0.9608,0.8768,0.9282,0.9053,0.9166,0.8569,0.909,0.8937,0.9013
4,XGBClassifier,StandardScaler,0.9618,0.873,0.8538,0.0888,0.9618,0.9755,0.9745,0.975,0.873,0.918,0.9114,0.9147,0.8538,0.906,0.8926,0.8993


## 일반화가 된 모델 중 재현율이 낮은 모델

In [83]:
df_selected = df_filtered.sort_values(['훈련-검증','train_rec','val_rec','test_rec'],ascending=[True, False, False, False])
df_selected.head(3)

Unnamed: 0,모델명,스케일러,훈련,검증,테스트,훈련-검증,train_ace,train_pre,train_rec,train_f1,val_ace,val_pre,val_rec,val_f1,test_ace,test_pre,test_rec,test_f1
2,GradientBoostingClassifier,StandardScaler,0.8917,0.8645,0.8585,0.0272,0.8917,0.9226,0.9368,0.9297,0.8645,0.9119,0.9063,0.9091,0.8585,0.9006,0.9063,0.9035
7,GradientBoostingClassifier,MinMaxScaler,0.8917,0.8645,0.8585,0.0272,0.8917,0.9226,0.9368,0.9297,0.8645,0.9119,0.9063,0.9091,0.8585,0.9006,0.9063,0.9035
12,GradientBoostingClassifier,RobustScaler,0.8917,0.863,0.8562,0.0288,0.8917,0.9226,0.9368,0.9297,0.863,0.9117,0.9042,0.908,0.8562,0.9003,0.9032,0.9017


## 일반화가 됐고 재현율이 낮은 모델 중 F-1스코어가 가장 높은 모델(최종 모델 선정)

In [85]:
df_selected = df_filtered.sort_values(['훈련-검증', 'train_f1','val_f1','test_f1'],ascending=[True, False, False, False])
df_selected.head(3)

Unnamed: 0,모델명,스케일러,훈련,검증,테스트,훈련-검증,train_ace,train_pre,train_rec,train_f1,val_ace,val_pre,val_rec,val_f1,test_ace,test_pre,test_rec,test_f1
2,GradientBoostingClassifier,StandardScaler,0.8917,0.8645,0.8585,0.0272,0.8917,0.9226,0.9368,0.9297,0.8645,0.9119,0.9063,0.9091,0.8585,0.9006,0.9063,0.9035
7,GradientBoostingClassifier,MinMaxScaler,0.8917,0.8645,0.8585,0.0272,0.8917,0.9226,0.9368,0.9297,0.8645,0.9119,0.9063,0.9091,0.8585,0.9006,0.9063,0.9035
12,GradientBoostingClassifier,RobustScaler,0.8917,0.863,0.8562,0.0288,0.8917,0.9226,0.9368,0.9297,0.863,0.9117,0.9042,0.908,0.8562,0.9003,0.9032,0.9017


## 하이퍼파라메터 튜닝 후 전체 모델 훈련하여 ~ 평가

In [70]:
from sklearn.model_selection import GridSearchCV

In [73]:
### 모델별 하이퍼파라메터 정의하기 ###################

# 1. XGBRegressor
xgb_params = {"n_estimators" : [50, 100], "max_depth" : [None, 10], "min_child_weight" : [1, 2, 4]}

# 2. HistGradientBoosting
hgb_params = {"max_iter" : [50, 100], "max_depth" : [None, 10], "min_samples_leaf" : [1, 2, 4]}

# 3. RandomForest, ExtraTrees, GradentBoosting
rf_et_gb_params = {"n_estimators" : [50, 100], "max_depth" : [None, 10], "min_samples_split" : [2, 5], "min_samples_leaf" : [1, 2, 4]}
                    


####################### GridSearchCV 자체 설정값 ################
### 평가방법 정의 : 회귀에서 MAE, MSE, R2 평가 방법 중에 보통 r2 사용
scoring = ["accuracy"]

### 모델 선정 기준 정의하기 : 평가 결고 중에 선정 기준으로 사용할 평가 방법 지정
# - 회귀에서는 r2(결정계수) 사용
refit = "accuracy"

### GridSearchCV 튜닝 모델에서 교차검증에 사용할 그룹(Fold)의 갯수 지정
# - 3 또는 5 (홀수 값 사용)
cv = 5

In [95]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
scaler_list = [StandardScaler(), MinMaxScaler(), RobustScaler()]

df_model_tuned = pd.DataFrame()

model_list = [RandomForestClassifier(), ExtraTreesClassifier(), GradientBoostingClassifier(), HistGradientBoostingClassifier(), XGBClassifier()]

result_model = []

result_scaler = []
              
for ss in scaler_list :

    scaler_nm = ss.__class__.__name__

    # ss = StandardScaler()
    ss.fit(X_train)

    X_train_scaled = ss.transform(X_train)
    X_val_scaled   = ss.transform(X_val)
    X_test_scaled  = ss.transform(X_test)

    for m in model_list :
        print(f"========== [{m.__class__.__name__} / {scaler_nm}] ============")

        ### 모델 생성------------------
        model = m
        ### 사용한 모델이름 추출
        model_name = model.__class__.__name__

        #################### [GridSearchCV 튜닝 클래스 적용 Start] ###############
        gridParams = {}

        if model_name in ["XGBClassifier"] :
            print("1")
            gridParams = xgb_params
            print(xgb_params)
        elif model_name in ["HistGradientBoostingClassifier"] :
            print("2")
            gridParams = hgb_params
            print(hgb_params)
        else :            
            print("3")
            gridParams = rf_et_gb_params
            print(rf_et_gb_params)

        ### 튜닝 모델(클래스) 생성하기(튜닝 모델 생성하기)
        # - 일반적으로 모델 훈련하는 방법과 동일
        grid_search_model = GridSearchCV(
            ### 튜닝에 사용할 실제 모델 정의하기
            estimator=model,
            ### 위에서 설정한 하이퍼파라메터 딕셔너리 변수 정의
            param_grid=gridParams,
            ### 모델 평가 방법 정의
            scoring=scoring,
            ### 모델 선정 기준 정의
            refit=refit,
            ### 교차검증(CV)에 사용할 Fold 갯수 지정
            cv=cv,
            ### CPU-Core 갯수 정의
            n_jobs=-1
        )
    
        grid_search_model.fit(X_train_scaled, y_train)
        model = grid_search_model.best_estimator_

        train_score = model.score(X_train_scaled, y_train)
        val_score   = model.score(X_val_scaled, y_val)
        test_score  = model.score(X_test_scaled, y_test)
        minus = train_score - val_score

        train_pred = model.predict(X_train_scaled)
        val_pred   = model.predict(X_val_scaled)
        test_pred  = model.predict(X_test_scaled)
        
        train_ace = accuracy_score(y_train, train_pred)
        train_pre = precision_score(y_train, train_pred)
        train_rec = recall_score(y_train, train_pred)
        train_f1 = f1_score(y_train, train_pred)

        val_ace = accuracy_score(y_val, val_pred)
        val_pre = precision_score(y_val, val_pred)
        val_rec = recall_score(y_val, val_pred)
        val_f1 = f1_score(y_val, val_pred)
     
        test_ace = accuracy_score(y_test, test_pred)
        test_pre = precision_score(y_test, test_pred)
        test_rec = recall_score(y_test, test_pred)
        test_f1 = f1_score(y_test, test_pred)

        ### 사용한 모델이름 추출
        model_name = model.__class__.__name__

        column_nm = ["모델명", "스케일러", "훈련", "검증", "테스트","훈련-검증",
                     "train_ace", "train_pre", "train_rec", "train_f1",
                     "val_ace", "val_pre", "val_rec", "val_f1",
                     "test_ace", "test_pre", "test_rec", "test_f1"]
        
        temp_list = [[model_name, scaler_nm, f"{train_score:.4f}", f"{val_score:.4f}", f"{test_score:.4f}", f"{minus:.4f}",
                      f"{train_ace:.4f}", f"{train_pre:.4f}", f"{train_rec:.4f}", f"{train_f1:.4f}",
                      f"{val_ace:.4f}", f"{val_pre:.4f}", f"{val_rec:.4f}", f"{val_f1:.4f}",
                      f"{test_ace:.4f}", f"{test_pre:.4f}", f"{test_rec:.4f}", f"{test_f1:.4f}"]]

        temp_df = pd.DataFrame(data=temp_list, columns=column_nm)
        df_model_tuned = pd.concat([df_model_tuned, temp_df], ignore_index=True)

        result_model.append(model)

        result_scaler.append(ss)

df_model_tuned

3
{'n_estimators': [50, 100], 'max_depth': [None, 10], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2, 4]}
3
{'n_estimators': [50, 100], 'max_depth': [None, 10], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2, 4]}
3
{'n_estimators': [50, 100], 'max_depth': [None, 10], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2, 4]}
2
{'max_iter': [50, 100], 'max_depth': [None, 10], 'min_samples_leaf': [1, 2, 4]}
1
{'n_estimators': [50, 100], 'max_depth': [None, 10], 'min_child_weight': [1, 2, 4]}
3
{'n_estimators': [50, 100], 'max_depth': [None, 10], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2, 4]}
3
{'n_estimators': [50, 100], 'max_depth': [None, 10], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2, 4]}
3
{'n_estimators': [50, 100], 'max_depth': [None, 10], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2, 4]}
2
{'max_iter': [50, 100], 'max_depth': [None, 10], 'min_samples_leaf': [1, 2, 4]}
1
{'n_estimators': [50, 100], 'max_depth': [None, 10], 'min_

Unnamed: 0,모델명,스케일러,훈련,검증,테스트,훈련-검증,train_ace,train_pre,train_rec,train_f1,val_ace,val_pre,val_rec,val_f1,test_ace,test_pre,test_rec,test_f1
0,RandomForestClassifier,StandardScaler,0.9972,0.8822,0.8769,0.115,0.9972,0.9983,0.998,0.9982,0.8822,0.9234,0.9186,0.921,0.8769,0.9158,0.9158,0.9158
1,ExtraTreesClassifier,StandardScaler,0.9833,0.8861,0.8654,0.0973,0.9833,0.9857,0.9926,0.9891,0.8861,0.9078,0.9434,0.9253,0.8654,0.8942,0.9253,0.9095
2,GradientBoostingClassifier,StandardScaler,0.9969,0.8822,0.8638,0.1147,0.9969,0.9977,0.9983,0.998,0.8822,0.9287,0.9125,0.9205,0.8638,0.9107,0.9021,0.9064
3,HistGradientBoostingClassifier,StandardScaler,0.9507,0.8761,0.8592,0.0747,0.9507,0.9748,0.9604,0.9675,0.8761,0.9245,0.9083,0.9164,0.8592,0.9102,0.8958,0.9029
4,XGBClassifier,StandardScaler,0.9915,0.8868,0.8623,0.1047,0.9915,0.9956,0.9933,0.9945,0.8868,0.9265,0.9217,0.9241,0.8623,0.9105,0.9,0.9052
5,RandomForestClassifier,MinMaxScaler,0.9974,0.8853,0.8731,0.1121,0.9974,0.998,0.9987,0.9983,0.8853,0.9272,0.9186,0.9229,0.8731,0.9153,0.9105,0.9129
6,ExtraTreesClassifier,MinMaxScaler,0.9805,0.8861,0.87,0.0944,0.9805,0.9811,0.9936,0.9873,0.8861,0.9111,0.9392,0.9249,0.87,0.8981,0.9274,0.9125
7,GradientBoostingClassifier,MinMaxScaler,0.9869,0.8776,0.8708,0.1093,0.9869,0.9929,0.9899,0.9914,0.8776,0.9229,0.9125,0.9177,0.8708,0.9151,0.9074,0.9112
8,HistGradientBoostingClassifier,MinMaxScaler,0.9507,0.8761,0.8592,0.0747,0.9507,0.9748,0.9604,0.9675,0.8761,0.9245,0.9083,0.9164,0.8592,0.9102,0.8958,0.9029
9,XGBClassifier,MinMaxScaler,0.9915,0.8868,0.8623,0.1047,0.9915,0.9956,0.9933,0.9945,0.8868,0.9265,0.9217,0.9241,0.8623,0.9105,0.9,0.9052


## 과소 및 과대 적합이 일어나지 않아야 함

In [100]:
df_model_tuned['훈련-검증'] = pd.to_numeric(df_model_tuned['훈련-검증'], errors='coerce')
df_filtered = df_model_tuned[df_model_tuned['훈련-검증']>0.01]

In [101]:
df_filtered = df_filtered[df_filtered['훈련-검증'] < 0.09]
df_filtered.head(3)

Unnamed: 0,모델명,스케일러,훈련,검증,테스트,훈련-검증,train_ace,train_pre,train_rec,train_f1,val_ace,val_pre,val_rec,val_f1,test_ace,test_pre,test_rec,test_f1
3,HistGradientBoostingClassifier,StandardScaler,0.9507,0.8761,0.8592,0.0747,0.9507,0.9748,0.9604,0.9675,0.8761,0.9245,0.9083,0.9164,0.8592,0.9102,0.8958,0.9029
8,HistGradientBoostingClassifier,MinMaxScaler,0.9507,0.8761,0.8592,0.0747,0.9507,0.9748,0.9604,0.9675,0.8761,0.9245,0.9083,0.9164,0.8592,0.9102,0.8958,0.9029
13,HistGradientBoostingClassifier,RobustScaler,0.9507,0.8761,0.8592,0.0747,0.9507,0.9748,0.9604,0.9675,0.8761,0.9245,0.9083,0.9164,0.8592,0.9102,0.8958,0.9029


## 일반화가 된 모델 중 재현율이 낮은 모델

In [102]:
df_selected = df_filtered.sort_values(['훈련-검증','train_rec','val_rec','test_rec'],ascending=[True, False, False, False])
df_selected.head(3)

Unnamed: 0,모델명,스케일러,훈련,검증,테스트,훈련-검증,train_ace,train_pre,train_rec,train_f1,val_ace,val_pre,val_rec,val_f1,test_ace,test_pre,test_rec,test_f1
3,HistGradientBoostingClassifier,StandardScaler,0.9507,0.8761,0.8592,0.0747,0.9507,0.9748,0.9604,0.9675,0.8761,0.9245,0.9083,0.9164,0.8592,0.9102,0.8958,0.9029
8,HistGradientBoostingClassifier,MinMaxScaler,0.9507,0.8761,0.8592,0.0747,0.9507,0.9748,0.9604,0.9675,0.8761,0.9245,0.9083,0.9164,0.8592,0.9102,0.8958,0.9029
13,HistGradientBoostingClassifier,RobustScaler,0.9507,0.8761,0.8592,0.0747,0.9507,0.9748,0.9604,0.9675,0.8761,0.9245,0.9083,0.9164,0.8592,0.9102,0.8958,0.9029


## 일반화가 됐고 재현율이 낮은 모델 중 F-1스코어가 가장 높은 모델(최종 모델 선정)

In [103]:
df_selected = df_filtered.sort_values(['훈련-검증', 'train_f1','val_f1','test_f1'],ascending=[True, False, False, False])
df_selected.head(3)

Unnamed: 0,모델명,스케일러,훈련,검증,테스트,훈련-검증,train_ace,train_pre,train_rec,train_f1,val_ace,val_pre,val_rec,val_f1,test_ace,test_pre,test_rec,test_f1
3,HistGradientBoostingClassifier,StandardScaler,0.9507,0.8761,0.8592,0.0747,0.9507,0.9748,0.9604,0.9675,0.8761,0.9245,0.9083,0.9164,0.8592,0.9102,0.8958,0.9029
8,HistGradientBoostingClassifier,MinMaxScaler,0.9507,0.8761,0.8592,0.0747,0.9507,0.9748,0.9604,0.9675,0.8761,0.9245,0.9083,0.9164,0.8592,0.9102,0.8958,0.9029
13,HistGradientBoostingClassifier,RobustScaler,0.9507,0.8761,0.8592,0.0747,0.9507,0.9748,0.9604,0.9675,0.8761,0.9245,0.9083,0.9164,0.8592,0.9102,0.8958,0.9029


In [104]:
df_wine_selected = df_selected[["모델명", "train_ace", "val_ace", "훈련-검증", "train_pre", "train_rec", "train_f1"]]

In [105]:
df_wine_selected

Unnamed: 0,모델명,train_ace,val_ace,훈련-검증,train_pre,train_rec,train_f1
3,HistGradientBoostingClassifier,0.9507,0.8761,0.0747,0.9748,0.9604,0.9675
8,HistGradientBoostingClassifier,0.9507,0.8761,0.0747,0.9748,0.9604,0.9675
13,HistGradientBoostingClassifier,0.9507,0.8761,0.0747,0.9748,0.9604,0.9675


In [106]:
import pickle

save_file = "./model/hgb_model.pickle"

with open(save_file, "wb") as fw :
    pickle.dump(result_model[3], fw)

save_file = "./model/hgb_model_scaler.pickle"
with open(save_file, "wb") as fw :
    pickle.dump(result_scaler[3], fw)

In [107]:
file_nm = "./model/hgb_model.pickle"

with open(file_nm, "rb") as fw :
    new_model = pickle.load(fw)

file_nm = "./model/hgb_model_scaler.pickle"
with open(file_nm, "rb") as fw :
    new_scaler = pickle.load(fw)

new_model, new_scaler

(HistGradientBoostingClassifier(max_depth=10, min_samples_leaf=2),
 StandardScaler())

In [114]:
X_new = pd.DataFrame([[12.0,8.0,3.5]], columns=["alcohol","sugar","pH"])

X_new_scaled = new_scaler.transform(X_new)

y_pred = new_model.predict(X_new_scaled)

print(y_pred[0])

1.0


