- log, index 컬럼 중 의미있는 컬럼 찾기

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# EDA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.decomposition import PCA

# Model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score

# Class Imbalance import
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm

In [3]:
# Seed 고정
seed = 1
np.random.seed(seed)

In [4]:
df_org = pd.read_csv('data/mulit_classification_data.csv')
df_org.shape

(1941, 34)

In [5]:
# 14번째부터 27번째 컬럼 삭제
columns_to_drop = [i for i in range(14, 27)]
df = df_org.drop(columns=df_org.columns[columns_to_drop])
df

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,42,50,270900,270944,267,17,44,24220,76,108,...,1,0,80,1,0,0,0,0,0,0
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,1,0,80,1,0,0,0,0,0,0
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,1,0,100,1,0,0,0,0,0,0
3,853,860,369370,369415,176,13,45,18996,99,126,...,0,1,290,1,0,0,0,0,0,0
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,0,1,185,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1936,249,277,325780,325796,273,54,22,35033,119,141,...,0,1,40,0,0,0,0,0,0,1
1937,144,175,340581,340598,287,44,24,34599,112,133,...,0,1,40,0,0,0,0,0,0,1
1938,145,174,386779,386794,292,40,22,37572,120,140,...,0,1,40,0,0,0,0,0,0,1
1939,137,170,422497,422528,419,97,47,52715,117,140,...,0,1,40,0,0,0,0,0,0,1


In [6]:
df = pd.concat([df, df_org[['Empty_Index']]], axis=1)
df

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,TypeOfSteel_A400,Steel_Plate_Thickness,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults,Empty_Index
0,42,50,270900,270944,267,17,44,24220,76,108,...,0,80,1,0,0,0,0,0,0,0.2415
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,0,80,1,0,0,0,0,0,0,0.3793
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,0,100,1,0,0,0,0,0,0,0.3426
3,853,860,369370,369415,176,13,45,18996,99,126,...,1,290,1,0,0,0,0,0,0,0.4413
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,1,185,1,0,0,0,0,0,0,0.4486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1936,249,277,325780,325796,273,54,22,35033,119,141,...,1,40,0,0,0,0,0,0,1,0.3906
1937,144,175,340581,340598,287,44,24,34599,112,133,...,1,40,0,0,0,0,0,0,1,0.4554
1938,145,174,386779,386794,292,40,22,37572,120,140,...,1,40,0,0,0,0,0,0,1,0.3287
1939,137,170,422497,422528,419,97,47,52715,117,140,...,1,40,0,0,0,0,0,0,1,0.5904


In [7]:
encoding_list = ['TypeOfSteel_A300', 'TypeOfSteel_A400']

# 오디날 인코딩 수행
df['Type_of_Steel'] = df[encoding_list].idxmax(axis=1).apply(lambda x: encoding_list.index(x))

# 오디날 인코딩된 컬럼 삭제
df = df.drop(encoding_list, axis=1)

# 결과 출력
print(df['Type_of_Steel'])

0       0
1       0
2       0
3       1
4       1
       ..
1936    1
1937    1
1938    1
1939    1
1940    0
Name: Type_of_Steel, Length: 1941, dtype: int64


In [8]:
target_list = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# 오디날 인코딩 수행
df['Target'] = df[target_list].idxmax(axis=1).apply(lambda x: target_list.index(x))

# 오디날 인코딩된 컬럼 삭제
df = df.drop(target_list, axis=1)

# 결과 출력
print(df['Target'])

0       0
1       0
2       0
3       0
4       0
       ..
1936    6
1937    6
1938    6
1939    6
1940    6
Name: Target, Length: 1941, dtype: int64


In [9]:
# 행들을 뒤죽박죽으로 섞기
df = df.sample(frac=1, random_state=42)
print(df['Target'])

1605    6
1502    6
70      0
976     5
1052    5
       ..
1130    5
1294    6
860     4
1459    6
1126    5
Name: Target, Length: 1941, dtype: int64


### 데이터셋 분리 (train / val / test)

In [10]:
# target / feature 분리

target = 'Target'
x = df.drop(target, axis=1)
y = df[target]

print(f'x shape : {x.shape}')
print(f'y shape : {y.shape}')

x shape : (1941, 14)
y shape : (1941,)


In [11]:
## 데이터셋 분리 (train / val / test)# train / val / test 분리

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1)

print(f'train data : x{x_train.shape}, y{y_train.shape}')
print(f'val data : x{x_val.shape}, y{y_val.shape}')
print(f'test data : x{x_test.shape}, y{y_test.shape}')

train data : x(1241, 14), y(1241,)
val data : x(311, 14), y(311,)
test data : x(389, 14), y(389,)


## Model 선택

In [12]:
# 손실값 계산 함수 (수정 중)
def compute_loss(y_true, y_pred):
    M = len(y_true)
    N = 1 if isinstance(y_pred, (int, float)) else np.array(y_pred).shape[1]
    square_sum = np.sum(np.square(y_true - np.array(y_pred)))
    loss = square_sum / (M * N)
    
    return loss

# 정확도 반환 함수
def eval_models(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    
    return accuracy

In [13]:
def train_val_score(model):
    y_train_pred = model.predict(x_train)
    y_val_pred = model.predict(x_val)

    model_eval = pd.DataFrame(index=['Accuracy', 'Loss'], columns=['Train', 'Val'])
    model_eval['Train'] = eval_models(y_train, y_train_pred)
    model_eval['Val'] = eval_models(y_val, y_val_pred)
    
    display(model_eval)
    return model_eval

In [14]:
# 기준모델
base = y_train.mode()[0]
baseline = [base] * len(y_train)

# 모델별 score
eval_table = pd.DataFrame(index=['Accuracy'], 
                          columns=['Baseline', 'RandomForest', 'XGB'])
eval_table['Baseline'] = eval_models(y_train, baseline)
eval_table

Unnamed: 0,Baseline,RandomForest,XGB
Accuracy,0.355359,,


In [15]:
# RandomForestClassifier

randomforest = RandomForestClassifier()

randomforest.fit(x_train, y_train)

In [16]:
# train-val score
train_val_score(randomforest)

# 모델별 score
y_val_pred = randomforest.predict(x_val)
eval_table['RandomForest'] = eval_models(y_val, y_val_pred)
eval_table

Unnamed: 0,Train,Val
Accuracy,1.0,0.78135
Loss,1.0,0.78135


Unnamed: 0,Baseline,RandomForest,XGB
Accuracy,0.355359,0.78135,


In [17]:
xgb = XGBClassifier()

xgb.fit(x_train, y_train)

In [18]:
# train-val score
train_val_score(xgb)

# 모델별 score
y_val_pred = xgb.predict(x_val)
eval_table['XGB'] = eval_models(y_val, y_val_pred)
eval_table

Unnamed: 0,Train,Val
Accuracy,1.0,0.836013
Loss,1.0,0.836013


Unnamed: 0,Baseline,RandomForest,XGB
Accuracy,0.355359,0.78135,0.836013


## Class Imbalance 해결

In [19]:
u_sampler = RandomUnderSampler(random_state=1)
x_train_u, y_train_u = u_sampler.fit_resample(x_train, y_train)

o_sampler = RandomOverSampler(random_state=1)
x_train_o, y_train_o = o_sampler.fit_resample(x_train, y_train)

smote = SMOTE(random_state=1)
x_train_s, y_train_s = smote.fit_resample(x_train, y_train)

In [20]:
def sampling_score(model):
    
    print(f'MODEL : {model}')
    
    x_samples = [x_train_u, x_train_o, x_train_s]
    y_samples = [y_train_u, y_train_o, y_train_s]
    sample_eval = pd.DataFrame(index=['Accuracy'], columns=['Under', 'Over', 'SMOTE'])
    
    for i, x, y in zip(range(3), x_samples, y_samples):
        model.fit(x, y)
        y_val_pred = model.predict(x_val)
        
        sample_eval[sample_eval.columns[i]] = eval_models(y_val, y_val_pred)

    return sample_eval

In [21]:
sampling_score(randomforest)


MODEL : RandomForestClassifier()


Unnamed: 0,Under,Over,SMOTE
Accuracy,0.598071,0.778135,0.771704


In [22]:
sampling_score(xgb)

MODEL : XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...)


Unnamed: 0,Under,Over,SMOTE
Accuracy,0.617363,0.842444,0.800643


In [23]:
# 컨퓨전 리포트 출력
xgb.fit(x_train_o, y_train_o)
y_val_pred = xgb.predict(x_val)
report = classification_report(y_val, y_val_pred)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.46      0.60        28
           1       0.91      0.91      0.91        22
           2       1.00      0.95      0.97        58
           3       1.00      1.00      1.00        12
           4       0.89      1.00      0.94         8
           5       0.80      0.79      0.79        75
           6       0.77      0.88      0.82       108

    accuracy                           0.84       311
   macro avg       0.89      0.86      0.86       311
weighted avg       0.85      0.84      0.84       311



In [24]:
y_test_pred = xgb.predict(x_test)
report = classification_report(y_test, y_test_pred)
print(report)

              precision    recall  f1-score   support

           0       0.64      0.52      0.57        27
           1       0.94      0.98      0.96        46
           2       0.96      0.97      0.96        89
           3       0.92      1.00      0.96        11
           4       1.00      0.82      0.90        11
           5       0.67      0.58      0.62        81
           6       0.71      0.79      0.75       124

    accuracy                           0.80       389
   macro avg       0.83      0.81      0.82       389
weighted avg       0.79      0.80      0.79       389



RandomOverSampler 증강 갯수 조절

In [25]:
# 성능이 오히려 떨어져서 쓰지 않기로
'''
sampling_strategy_dict = {1:20000}

ros = RandomOverSampler(sampling_strategy=sampling_strategy_dict)
x_train_over, y_train_over = ros.fit_resample(x_train, y_train)
print(x_train_over.shape, y_train_over.shape)
'''

'\nsampling_strategy_dict = {1:20000}\n\nros = RandomOverSampler(sampling_strategy=sampling_strategy_dict)\nx_train_over, y_train_over = ros.fit_resample(x_train, y_train)\nprint(x_train_over.shape, y_train_over.shape)\n'

In [26]:
'''
xgb.fit(x_train_over, y_train_over)
y_val_pred = xgb.predict(x_val)
report = classification_report(y_val, y_val_pred)
print(report)
'''

'\nxgb.fit(x_train_over, y_train_over)\ny_val_pred = xgb.predict(x_val)\nreport = classification_report(y_val, y_val_pred)\nprint(report)\n'

SMOTETomek 이용

In [27]:
# 성능이 더 떨어져서 쓰지 않기로
'''
from imblearn.combine import SMOTETomek

sampling_strategy_dict = {0: 5000, 5: 2000, 6: 2000} # 레이블당 증강할 갯수 지정
x_resampled, y_resampled = SMOTETomek(sampling_strategy=sampling_strategy_dict).fit_resample(x_train, y_train) #데이터 증강
# x_resampled, y_resampled = SMOTETomek().fit_resample(x_train, y_train)
print(x_resampled.shape, y_resampled.shape)
'''

'\nfrom imblearn.combine import SMOTETomek\n\nsampling_strategy_dict = {0: 5000, 5: 2000, 6: 2000} # 레이블당 증강할 갯수 지정\nx_resampled, y_resampled = SMOTETomek(sampling_strategy=sampling_strategy_dict).fit_resample(x_train, y_train) #데이터 증강\n# x_resampled, y_resampled = SMOTETomek().fit_resample(x_train, y_train)\nprint(x_resampled.shape, y_resampled.shape)\n'

In [28]:
'''
xgb.fit(x_resampled, y_resampled)
y_val_pred = xgb.predict(x_val)
report = classification_report(y_val, y_val_pred)
print(report)
'''

# 1,2,3,4 레이블 recall에서 모두 좋은 성적을 내는 RandomOverSampler가 더 적당해보인다

'\nxgb.fit(x_resampled, y_resampled)\ny_val_pred = xgb.predict(x_val)\nreport = classification_report(y_val, y_val_pred)\nprint(report)\n'

In [29]:
'''
y_test_pred = xgb.predict(x_test)
report = classification_report(y_test, y_test_pred)
print(report)
'''

'\ny_test_pred = xgb.predict(x_test)\nreport = classification_report(y_test, y_test_pred)\nprint(report)\n'

# 예측 확률 뽑아보기

In [97]:
def get_mismatched_indices(y_true, y_pred):
    mismatched_indices = np.where(y_true != y_pred)[0]
    return mismatched_indices.tolist()

def get_matched_indices(y_true, y_pred):
    mismatched_indices = np.where(y_true == y_pred)[0]
    return mismatched_indices.tolist()

# 다른 부분의 인덱스 추출
mismatched_indices = get_mismatched_indices(y_val, y_val_pred)
matched_indices = get_matched_indices(y_val, y_val_pred)

# 결과 확인
print("맞은 것:", matched_indices)
print("틀린 것:", mismatched_indices)


맞은 것: [0, 1, 2, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 37, 38, 39, 40, 41, 42, 44, 46, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 87, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 157, 158, 159, 160, 161, 162, 163, 164, 166, 167, 168, 169, 170, 171, 173, 174, 175, 176, 177, 179, 180, 181, 182, 185, 186, 188, 189, 190, 191, 192, 194, 195, 196, 197, 198, 200, 201, 202, 203, 205, 206, 207, 208, 209, 210, 212, 213, 214, 215, 216, 218, 219, 220, 222, 223, 225, 226, 227, 229, 230, 231, 232, 234, 235, 236, 237, 238, 239, 241, 242, 243, 245, 246, 247, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260

In [98]:
predictions_proba = randomforest.predict_proba(x_val)
predictions_proba = pd.DataFrame(predictions_proba, columns=target_list)
predictions_proba

Unnamed: 0,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0.14,0.13,0.02,0.01,0.07,0.18,0.45
1,0.00,0.00,0.99,0.00,0.00,0.00,0.01
2,0.10,0.01,0.00,0.01,0.01,0.60,0.27
3,0.39,0.00,0.00,0.00,0.00,0.24,0.37
4,0.13,0.00,0.00,0.00,0.01,0.59,0.27
...,...,...,...,...,...,...,...
306,0.06,0.00,0.02,0.00,0.00,0.79,0.13
307,0.10,0.01,0.02,0.00,0.00,0.05,0.82
308,0.00,0.00,0.00,0.99,0.00,0.00,0.01
309,0.02,0.08,0.48,0.00,0.00,0.05,0.37


In [99]:
# 맞은 것들의 예측 확률 비교
max_values = predictions_proba.apply(lambda row: row.max(), axis=1)
max_values.iloc[matched_indices]

0      0.45
1      0.99
2      0.60
4      0.59
5      0.81
       ... 
305    0.55
307    0.82
308    0.99
309    0.48
310    0.82
Length: 262, dtype: float64

In [100]:
x_val = pd.DataFrame(x_val)
x_val.iloc[matched_indices].describe()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,Steel_Plate_Thickness,Empty_Index,Type_of_Steel
count,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0
mean,569.480916,618.580153,1523575.0,1523622.0,2120.667939,120.896947,83.015267,239310.9,83.087786,130.591603,1466.717557,82.709924,0.410732,0.610687
std,517.690019,491.029148,1397627.0,1397621.0,4419.584853,211.95205,131.819334,516756.0,33.515764,20.210321,145.592239,58.22206,0.132428,0.488528
min,0.0,6.0,9228.0,9246.0,12.0,4.0,4.0,1537.0,7.0,71.0,1306.0,40.0,0.0714,0.0
25%,53.25,196.75,484364.2,484428.8,87.5,15.0,13.0,9795.5,59.5,124.0,1358.0,40.0,0.3202,0.0
50%,442.0,492.0,1127812.0,1127830.0,171.5,25.0,24.0,18187.0,89.0,127.0,1367.0,70.0,0.40985,1.0
75%,1050.5,1065.5,2079438.0,2079452.0,1064.75,93.25,90.75,118728.0,104.0,140.0,1652.0,97.5,0.502025,1.0
max,1688.0,1694.0,7655518.0,7655567.0,25473.0,1050.0,684.0,3061597.0,195.0,252.0,1698.0,300.0,0.7906,1.0


In [205]:
x_val.iloc[mismatched_indices]

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,Steel_Plate_Thickness,Empty_Index,Type_of_Steel
69,1610,1618,1944129,1944138,56,12,9,4674,71,101,1656,100,0.2222,0
985,204,226,921087,921104,105,37,20,13505,100,148,1368,69,0.7193,0
1281,1603,1613,1995544,1995586,308,25,42,22991,55,94,1656,80,0.2667,0
1090,1038,1047,603596,603608,80,12,12,9136,100,133,1358,40,0.2593,1
136,1251,1256,458923,458941,70,15,18,8051,106,127,1356,40,0.2222,1
158,1166,1185,2258648,2258662,123,33,17,15858,116,143,1708,100,0.5376,0
1416,905,916,1046618,1046624,41,12,7,4162,88,117,1687,100,0.3788,0
152,19,28,808173,808199,119,26,26,14471,97,143,1358,70,0.4915,0
1661,340,352,573781,573792,103,14,11,11535,85,133,1694,80,0.2197,0
62,626,635,2071945,2071956,76,14,11,8063,84,126,1362,120,0.2323,0


In [102]:
y_val_compare = pd.DataFrame(y_val)
y_val_compare['Pred'] = y_val_pred
y_val_compare = y_val_compare.iloc[mismatched_indices]


Unnamed: 0,Target,Pred
69,0,5
985,5,6
1281,6,5
1090,5,6
136,0,6
158,1,6
1416,6,5
152,0,1
1661,6,5
62,0,6


In [103]:
y_val_compare[y_val_compare['Target'] == 0]

Unnamed: 0,Target,Pred
69,0,5
136,0,6
152,0,1
62,0,6
124,0,6
25,0,6
27,0,6
49,0,5
109,0,5
129,0,6


In [104]:
y_val_compare[y_val_compare['Target'] == 5]

Unnamed: 0,Target,Pred
985,5,6
1090,5,6
1012,5,4
1087,5,6
867,5,6
952,5,6
1003,5,6
928,5,6
869,5,6
1078,5,6


In [105]:
y_val_compare[y_val_compare['Target'] == 6]

Unnamed: 0,Target,Pred
1281,6,5
1416,6,5
1661,6,5
1347,6,0
1452,6,5
1628,6,5
1409,6,5
1438,6,5
1796,6,5
1641,6,5


# 하이퍼파라미터 튜닝

In [447]:
# 하이퍼파라미터 range 설정
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],  # 결정 트리 개수
    'max_depth': [3, 5, 7, 9],  # 트리의 최대 깊이
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # 학습률
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],  # 각 트리마다 사용되는 샘플 비율
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],  # 각 트리마다 사용되는 특성 비율
}

# 랜덤 서치를 사용하여 하이퍼파라미터 튜닝 (20초 소요)
random_search = RandomizedSearchCV(xgb, param_distributions=param_dist, n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
random_search.fit(x_train_o, y_train_o)

# 최적의 하이퍼파라미터 출력
print("최적의 하이퍼파라미터:", random_search.best_params_)

# 검증용 데이터로 모델 평가
y_val_pred = random_search.predict(x_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("검증 정확도:", val_accuracy)

최적의 하이퍼파라미터: {'subsample': 0.7, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.2, 'colsample_bytree': 0.8}
검증 정확도: 0.7845659163987139
