# 제품 이상여부 판별 프로젝트


### 데이터 읽어오기


In [1]:
import os
from pprint import pprint

import optuna
import numpy as np
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
import pandas as pd

# Probability cut-off for the positive class and the seed shared by every
# split / sampler / model in this notebook.
THRESHOLD, RANDOM_STATE = 0.3, 110

# Load the train and test tables from CSV (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/final_train_data_ver2.csv",
        "./data/final_test_data_ver2.csv",
    )
)

In [3]:
# Columns prepended to each of the dam, fill1 and fill2 subsets.
var_dam_fill = [
    "Equipment_same_num",
    "PalletID_Collect_Result_encoded",
    "Production_Qty_Collect_Result_encoded",
    "WorkMode Collect Result",
    "Receip_n_suffix_3",
    "time_gap_All",
]

In [4]:
# Columns shared by every subset.
# Feature columns listed for correlation checks.
var_all_corr = ["model_suffix_encoded", "cleaned_workorder_encoded"]

# Train subsets: the label column followed by the shared features.
var_all_train = ["target", *var_all_corr]

# Test subsets: 'Set ID' followed by the same columns as the train list.
var_all_test = ["Set ID", *var_all_train]

In [5]:
# Pick every column whose name contains '_Dam'.
dam_variables = [column for column in train_data.columns if '_Dam' in column]

# Train subset: shared dam/fill columns, common train columns, then the Dam columns.
final_columns_train = [*var_dam_fill, *var_all_train, *dam_variables]
train_data_dam = train_data[final_columns_train]

# Test subset: same layout, using the common test columns instead.
final_columns_test = [*var_dam_fill, *var_all_test, *dam_variables]
test_data_dam = test_data[final_columns_test]

In [6]:
# Pick every column whose name contains '_Fill1'.
fill1_variables = [column for column in train_data.columns if '_Fill1' in column]

# Train subset: shared dam/fill columns, common train columns, then the Fill1 columns.
final_columns_train = [*var_dam_fill, *var_all_train, *fill1_variables]
train_data_fill1 = train_data[final_columns_train]

# Test subset: same layout, using the common test columns instead.
final_columns_test = [*var_dam_fill, *var_all_test, *fill1_variables]
test_data_fill1 = test_data[final_columns_test]

In [7]:
# Pick every column whose name contains '_Fill2'.
fill2_variables = [column for column in train_data.columns if '_Fill2' in column]

# Train subset: shared dam/fill columns, common train columns, then the Fill2 columns.
final_columns_train = [*var_dam_fill, *var_all_train, *fill2_variables]
train_data_fill2 = train_data[final_columns_train]

# Test subset: same layout, using the common test columns instead.
final_columns_test = [*var_dam_fill, *var_all_test, *fill2_variables]
test_data_fill2 = test_data[final_columns_test]

In [8]:
# Pick every column whose name contains '_AutoClave'.
autoclave_variables = [column for column in train_data.columns if '_AutoClave' in column]

# Train subset: common train columns followed by the AutoClave columns
# (the shared dam/fill columns are not included here).
final_columns_train = [*var_all_train, *autoclave_variables]
train_data_autoclave = train_data[final_columns_train]

# Test subset: same layout, using the common test columns instead.
final_columns_test = [*var_all_test, *autoclave_variables]
test_data_autoclave = test_data[final_columns_test]

In [9]:
# Column count of the full train frame and of each derived subset.
num_columns_train_data = len(train_data.columns)
num_columns_train_data_dam = len(train_data_dam.columns)
num_columns_train_data_autoclave = len(train_data_autoclave.columns)
num_columns_train_data_fill1 = len(train_data_fill1.columns)
num_columns_train_data_fill2 = len(train_data_fill2.columns)

# Same counts for the test frames.
num_columns_test_data = len(test_data.columns)
num_columns_test_data_dam = len(test_data_dam.columns)
num_columns_test_data_autoclave = len(test_data_autoclave.columns)
num_columns_test_data_fill1 = len(test_data_fill1.columns)
num_columns_test_data_fill2 = len(test_data_fill2.columns)

# Emit the whole report in one write; the text is the same, one entry per line.
print(
    "\n".join(
        [
            "----train data-----",
            f"train_data DataFrame의 칼럼 수: {num_columns_train_data}",
            f"train_data_dam DataFrame의 칼럼 수: {num_columns_train_data_dam}",
            f"train_data_autoclave DataFrame의 칼럼 수: {num_columns_train_data_autoclave}",
            f"train_data_fill1 DataFrame의 칼럼 수: {num_columns_train_data_fill1}",
            f"train_data_fill2 DataFrame의 칼럼 수: {num_columns_train_data_fill2}",
            "----test data-----",
            f"test_data DataFrame의 칼럼 수: {num_columns_test_data}",
            f"test_data_dam DataFrame의 칼럼 수: {num_columns_test_data_dam}",
            f"test_data_autoclave DataFrame의 칼럼 수: {num_columns_test_data_autoclave}",
            f"test_data_fill1 DataFrame의 칼럼 수: {num_columns_test_data_fill1}",
            f"test_data_fill2 DataFrame의 칼럼 수: {num_columns_test_data_fill2}",
        ]
    )
)

----train data-----
train_data DataFrame의 칼럼 수: 44
train_data_dam DataFrame의 칼럼 수: 25
train_data_autoclave DataFrame의 칼럼 수: 8
train_data_fill1 DataFrame의 칼럼 수: 16
train_data_fill2 DataFrame의 칼럼 수: 16
----test data-----
test_data DataFrame의 칼럼 수: 45
test_data_dam DataFrame의 칼럼 수: 26
test_data_autoclave DataFrame의 칼럼 수: 9
test_data_fill1 DataFrame의 칼럼 수: 17
test_data_fill2 DataFrame의 칼럼 수: 17


---

## Optuna

임계값(threshold)을 0.3으로 고정한 상태에서 튜닝을 진행한 결과

In [16]:
# Re-declared for the tuning section: shared seed and positive-class cut-off.
RANDOM_STATE, THRESHOLD = 110, 0.3

In [17]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def objectiveCatBoost(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective: fit a CatBoost classifier and score it on the validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        x_tr: Training features.
        y_tr: Training labels holding 'Normal' / 'AbNormal' strings; must
            support ``.map`` with a dict (e.g. a pandas Series).
        x_val: Validation features.
        y_val: Validation labels in the same string form as ``y_tr``.

    Returns:
        Binary F1 score on the validation split, where a row is predicted
        positive ('AbNormal') when its positive-class probability is at
        least the module-level ``THRESHOLD``.
    """
    # Encode the string labels as integers; 'AbNormal' is the positive class.
    label_codes = {'Normal': 0, 'AbNormal': 1}
    y_tr = y_tr.map(label_codes)
    y_val = y_val.map(label_codes)

    # Sampled first because it decides whether min_data_in_leaf applies.
    grow_policy = trial.suggest_categorical(
        'grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']
    )

    param = {
        'iterations': trial.suggest_int('iterations', 800, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2),
        'depth': trial.suggest_int('depth', 4, 13),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10, log=True),
        'random_strength': trial.suggest_float('random_strength', 0, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.1, 10),
        'border_count': trial.suggest_int('border_count', 128, 300),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10),
        'grow_policy': grow_policy,

        'random_seed': RANDOM_STATE,
        'eval_metric': 'F1',
        'logging_level': 'Silent',
        'boosting_type': 'Plain'
    }

    # The CatBoost parameter reference lists min_data_in_leaf as usable only
    # with the Depthwise and Lossguide growing policies, so it is sampled
    # (and passed to the model) only for those two policies.
    if grow_policy != 'SymmetricTree':
        param['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 1, 5)

    model = CatBoostClassifier(**param)
    model.fit(x_tr, y_tr)

    # Probability of the positive class, then hard labels at THRESHOLD.
    pred_proba = model.predict_proba(x_val)[:, 1]
    pred = (pred_proba >= THRESHOLD).astype(int)

    return f1_score(y_val, pred, average="binary")

# Hold out 20% of the Fill2 training subset for validation.
x_train, x_val, y_train, y_val = train_test_split(
    train_data_fill2.drop(columns="target"),
    train_data_fill2["target"],
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)


def _catboost_objective(trial):
    """Bind the fixed train/validation split to the CatBoost objective."""
    return objectiveCatBoost(trial, x_train, y_train, x_val, y_val)


# Hyperparameter search: maximise the objective with a seeded TPE sampler.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(_catboost_objective, n_trials=400)

print('Best trial: score {}, \nparams {}'.format(study.best_value, study.best_params))

[I 2024-08-26 04:19:01,832] A new study created in memory with name: no-name-029e0791-6fea-4ad6-aaa9-5fb7b9760925
[I 2024-08-26 04:19:07,326] Trial 0 finished with value: 0.17073170731707316 and parameters: {'iterations': 848, 'learning_rate': 0.13339879944049993, 'depth': 7, 'l2_leaf_reg': 1.099256962998379, 'random_strength': 8.185937574992959, 'bagging_temperature': 6.85638497930364, 'border_count': 338, 'scale_pos_weight': 6.735006454525373, 'grow_policy': 'SymmetricTree'}. Best is trial 0 with value: 0.17073170731707316.
[I 2024-08-26 04:19:14,101] Trial 1 finished with value: 0.20552147239263804 and parameters: {'iterations': 1371, 'learning_rate': 0.031403208613589216, 'depth': 3, 'l2_leaf_reg': 0.5027927228085797, 'random_strength': 5.961831701274088, 'bagging_temperature': 0.31183600257184896, 'border_count': 227, 'scale_pos_weight': 1.698550183782988, 'grow_policy': 'Lossguide'}. Best is trial 1 with value: 0.20552147239263804.
[I 2024-08-26 04:19:25,387] Trial 2 finished wit

[I 2024-08-26 04:27:52,239] Trial 20 finished with value: 0.1871101871101871 and parameters: {'iterations': 2079, 'learning_rate': 0.07833476292419154, 'depth': 6, 'l2_leaf_reg': 1.6980906710840935, 'random_strength': 4.4814478390215955, 'bagging_temperature': 1.4265802934728589, 'border_count': 106, 'scale_pos_weight': 3.4054211411074755, 'grow_policy': 'Depthwise'}. Best is trial 13 with value: 0.21834061135371177.
[I 2024-08-26 04:28:00,075] Trial 21 finished with value: 0.20467836257309943 and parameters: {'iterations': 1589, 'learning_rate': 0.021368534455665816, 'depth': 3, 'l2_leaf_reg': 0.546617532933024, 'random_strength': 6.069496254444898, 'bagging_temperature': 0.7929899753984693, 'border_count': 213, 'scale_pos_weight': 1.8951360921994538, 'grow_policy': 'Lossguide'}. Best is trial 13 with value: 0.21834061135371177.
[I 2024-08-26 04:28:09,111] Trial 22 finished with value: 0.21629629629629632 and parameters: {'iterations': 1482, 'learning_rate': 0.025533039996572054, 'dep

[I 2024-08-26 04:32:34,419] Trial 40 finished with value: 0.20049504950495048 and parameters: {'iterations': 2395, 'learning_rate': 0.013724576502456363, 'depth': 8, 'l2_leaf_reg': 0.6111157966141966, 'random_strength': 1.608776753751552, 'bagging_temperature': 5.977290700746135, 'border_count': 199, 'scale_pos_weight': 2.4359122475658586, 'grow_policy': 'Lossguide'}. Best is trial 33 with value: 0.22539229671897287.
[I 2024-08-26 04:32:38,767] Trial 41 finished with value: 0.1606686788948224 and parameters: {'iterations': 1272, 'learning_rate': 0.031196780242549134, 'depth': 4, 'l2_leaf_reg': 2.0204240300695555, 'random_strength': 7.314359704186016, 'bagging_temperature': 2.788361441269437, 'border_count': 218, 'scale_pos_weight': 9.758876338884903, 'grow_policy': 'SymmetricTree'}. Best is trial 33 with value: 0.22539229671897287.
[I 2024-08-26 04:32:43,146] Trial 42 finished with value: 0.1991465149359886 and parameters: {'iterations': 1023, 'learning_rate': 0.03772938680013555, 'dep

[I 2024-08-26 04:36:12,088] Trial 60 finished with value: 0.19768934531450577 and parameters: {'iterations': 3177, 'learning_rate': 0.02234427456431744, 'depth': 11, 'l2_leaf_reg': 0.10006261875806695, 'random_strength': 11.7201737958028, 'bagging_temperature': 4.891080186126107, 'border_count': 264, 'scale_pos_weight': 2.274827915733778, 'grow_policy': 'Lossguide'}. Best is trial 33 with value: 0.22539229671897287.
[I 2024-08-26 04:36:32,882] Trial 61 finished with value: 0.18156028368794327 and parameters: {'iterations': 2906, 'learning_rate': 0.02775649424903602, 'depth': 10, 'l2_leaf_reg': 0.1864100164513773, 'random_strength': 0.8086245162581711, 'bagging_temperature': 4.860511823388056, 'border_count': 308, 'scale_pos_weight': 1.7651929124580004, 'grow_policy': 'Lossguide'}. Best is trial 33 with value: 0.22539229671897287.
[I 2024-08-26 04:36:48,890] Trial 62 finished with value: 0.19587628865979384 and parameters: {'iterations': 2241, 'learning_rate': 0.053112844819440445, 'dep

[I 2024-08-26 04:40:43,558] Trial 80 finished with value: 0.19565217391304346 and parameters: {'iterations': 2354, 'learning_rate': 0.010671076651608534, 'depth': 8, 'l2_leaf_reg': 0.10685866788835727, 'random_strength': 10.627749495516671, 'bagging_temperature': 0.7829432592613319, 'border_count': 206, 'scale_pos_weight': 3.5635156942048285, 'grow_policy': 'Depthwise'}. Best is trial 33 with value: 0.22539229671897287.
[I 2024-08-26 04:40:48,803] Trial 81 finished with value: 0.21 and parameters: {'iterations': 1233, 'learning_rate': 0.03126819952637053, 'depth': 6, 'l2_leaf_reg': 0.2054564801725654, 'random_strength': 7.198744088576087, 'bagging_temperature': 0.11456814524896475, 'border_count': 185, 'scale_pos_weight': 2.3365973594949354, 'grow_policy': 'SymmetricTree'}. Best is trial 33 with value: 0.22539229671897287.
[I 2024-08-26 04:40:54,519] Trial 82 finished with value: 0.21660649819494587 and parameters: {'iterations': 1356, 'learning_rate': 0.019366219070503326, 'depth': 6,

[I 2024-08-26 04:42:39,915] Trial 100 finished with value: 0.21696658097686378 and parameters: {'iterations': 1601, 'learning_rate': 0.014528521737301906, 'depth': 5, 'l2_leaf_reg': 0.15648367544846806, 'random_strength': 8.558371157515827, 'bagging_temperature': 8.972482288987703, 'border_count': 98, 'scale_pos_weight': 4.392005080187333, 'grow_policy': 'SymmetricTree'}. Best is trial 87 with value: 0.23766816143497757.
[I 2024-08-26 04:42:45,171] Trial 101 finished with value: 0.2092267706302794 and parameters: {'iterations': 1411, 'learning_rate': 0.03605142864114026, 'depth': 5, 'l2_leaf_reg': 0.13386253803838927, 'random_strength': 9.466068784119264, 'bagging_temperature': 7.64431749136204, 'border_count': 79, 'scale_pos_weight': 3.834189546861555, 'grow_policy': 'SymmetricTree'}. Best is trial 87 with value: 0.23766816143497757.
[I 2024-08-26 04:42:50,108] Trial 102 finished with value: 0.2079701120797011 and parameters: {'iterations': 1188, 'learning_rate': 0.02365025788415418, 

[I 2024-08-26 04:44:02,825] Trial 120 finished with value: 0.2288077188146106 and parameters: {'iterations': 970, 'learning_rate': 0.0164043542768111, 'depth': 4, 'l2_leaf_reg': 0.18088254351886782, 'random_strength': 11.687362897978376, 'bagging_temperature': 7.99798402756077, 'border_count': 140, 'scale_pos_weight': 3.4392254563311324, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:44:06,135] Trial 121 finished with value: 0.2386206896551724 and parameters: {'iterations': 970, 'learning_rate': 0.016196765702989654, 'depth': 4, 'l2_leaf_reg': 0.1822962928040701, 'random_strength': 11.718916391805204, 'bagging_temperature': 7.997553949591617, 'border_count': 141, 'scale_pos_weight': 3.439431288830275, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:44:08,723] Trial 122 finished with value: 0.19721577726218098 and parameters: {'iterations': 758, 'learning_rate': 0.00546483987501487

[I 2024-08-26 04:44:55,021] Trial 140 finished with value: 0.2311377245508982 and parameters: {'iterations': 1068, 'learning_rate': 0.005784239379072839, 'depth': 3, 'l2_leaf_reg': 0.24552957914240905, 'random_strength': 10.418702942237681, 'bagging_temperature': 8.061226317541664, 'border_count': 105, 'scale_pos_weight': 3.738509898603372, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:44:57,399] Trial 141 finished with value: 0.22831050228310504 and parameters: {'iterations': 696, 'learning_rate': 0.025185682246894193, 'depth': 4, 'l2_leaf_reg': 0.2667015875753152, 'random_strength': 11.124570925042542, 'bagging_temperature': 8.911959459207994, 'border_count': 97, 'scale_pos_weight': 3.610794449821383, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:45:00,831] Trial 142 finished with value: 0.22999327505043715 and parameters: {'iterations': 1026, 'learning_rate': 0.0169080665783

[I 2024-08-26 04:45:58,188] Trial 160 finished with value: 0.21451509312780986 and parameters: {'iterations': 1096, 'learning_rate': 0.025575909674112572, 'depth': 3, 'l2_leaf_reg': 0.18014819754821332, 'random_strength': 11.849849026106018, 'bagging_temperature': 7.409855296390104, 'border_count': 101, 'scale_pos_weight': 3.6775097278958397, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:46:01,339] Trial 161 finished with value: 0.2383838383838384 and parameters: {'iterations': 1027, 'learning_rate': 0.008597980544006638, 'depth': 3, 'l2_leaf_reg': 0.19921509443646238, 'random_strength': 11.215511657603106, 'bagging_temperature': 8.345386413412864, 'border_count': 86, 'scale_pos_weight': 3.48117229757425, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:46:04,207] Trial 162 finished with value: 0.23028391167192427 and parameters: {'iterations': 922, 'learning_rate': 0.012365115931

[I 2024-08-26 04:48:27,897] Trial 180 finished with value: 0.2268370607028754 and parameters: {'iterations': 870, 'learning_rate': 0.012026474908466377, 'depth': 4, 'l2_leaf_reg': 0.1551442687911059, 'random_strength': 11.962383671576035, 'bagging_temperature': 7.185429462753534, 'border_count': 138, 'scale_pos_weight': 3.1532205103647044, 'grow_policy': 'Depthwise'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:48:30,182] Trial 181 finished with value: 0.23104693140794222 and parameters: {'iterations': 612, 'learning_rate': 0.008305131326300126, 'depth': 4, 'l2_leaf_reg': 0.19071746922714441, 'random_strength': 11.01783867204407, 'bagging_temperature': 7.954086327559297, 'border_count': 94, 'scale_pos_weight': 3.5129931408777226, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:48:33,616] Trial 182 finished with value: 0.23238925199709512 and parameters: {'iterations': 974, 'learning_rate': 0.0161791377399738, 

[I 2024-08-26 04:49:33,583] Trial 200 finished with value: 0.22515101592531575 and parameters: {'iterations': 1087, 'learning_rate': 0.00867661288488338, 'depth': 5, 'l2_leaf_reg': 0.221660949230779, 'random_strength': 10.616352506869763, 'bagging_temperature': 9.356763484041437, 'border_count': 90, 'scale_pos_weight': 3.937674094470202, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:49:36,587] Trial 201 finished with value: 0.23783031988873438 and parameters: {'iterations': 871, 'learning_rate': 0.011032828081168832, 'depth': 4, 'l2_leaf_reg': 0.20730139562131125, 'random_strength': 11.467259549798866, 'bagging_temperature': 8.327427013886059, 'border_count': 70, 'scale_pos_weight': 3.3776872393489823, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:49:39,590] Trial 202 finished with value: 0.23474801061007955 and parameters: {'iterations': 879, 'learning_rate': 0.012097865999776

[I 2024-08-26 04:51:12,235] Trial 220 finished with value: 0.23640319071791155 and parameters: {'iterations': 1091, 'learning_rate': 0.014112087867369137, 'depth': 3, 'l2_leaf_reg': 0.18465615021173612, 'random_strength': 10.58320453623661, 'bagging_temperature': 7.656671364517336, 'border_count': 88, 'scale_pos_weight': 3.324112920069162, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:51:16,047] Trial 221 finished with value: 0.22478386167146974 and parameters: {'iterations': 1252, 'learning_rate': 0.013890005648934355, 'depth': 3, 'l2_leaf_reg': 0.18532314387241897, 'random_strength': 10.486737392282292, 'bagging_temperature': 7.548428886207995, 'border_count': 87, 'scale_pos_weight': 3.332161341721762, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:51:19,344] Trial 222 finished with value: 0.23641304347826086 and parameters: {'iterations': 1076, 'learning_rate': 0.010745506676

[I 2024-08-26 04:52:20,302] Trial 240 finished with value: 0.222052067381317 and parameters: {'iterations': 1110, 'learning_rate': 0.023708385848441246, 'depth': 3, 'l2_leaf_reg': 0.19153827757281244, 'random_strength': 10.99159239841867, 'bagging_temperature': 8.16974663296405, 'border_count': 83, 'scale_pos_weight': 3.2729293716221575, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:52:23,305] Trial 241 finished with value: 0.22294022617124393 and parameters: {'iterations': 956, 'learning_rate': 0.012302705771373487, 'depth': 3, 'l2_leaf_reg': 0.20616314404621464, 'random_strength': 11.787153691199181, 'bagging_temperature': 7.336484243314569, 'border_count': 93, 'scale_pos_weight': 3.05760959177019, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:52:26,528] Trial 242 finished with value: 0.23076923076923073 and parameters: {'iterations': 1042, 'learning_rate': 0.0162434921286115

[I 2024-08-26 04:53:40,023] Trial 260 finished with value: 0.23099850968703425 and parameters: {'iterations': 1254, 'learning_rate': 0.01567493933068075, 'depth': 5, 'l2_leaf_reg': 0.15522110888540527, 'random_strength': 10.572768441308417, 'bagging_temperature': 8.585342976090315, 'border_count': 114, 'scale_pos_weight': 3.3504172092906894, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:53:43,187] Trial 261 finished with value: 0.2391725921137686 and parameters: {'iterations': 1017, 'learning_rate': 0.010900877757786188, 'depth': 3, 'l2_leaf_reg': 0.1670889738803438, 'random_strength': 10.852697978247647, 'bagging_temperature': 8.180119326818138, 'border_count': 98, 'scale_pos_weight': 3.5537170066287906, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:53:46,924] Trial 262 finished with value: 0.225825130283729 and parameters: {'iterations': 1003, 'learning_rate': 0.0101441986165

[I 2024-08-26 04:54:59,959] Trial 280 finished with value: 0.22705051602390006 and parameters: {'iterations': 898, 'learning_rate': 0.008677260139295505, 'depth': 4, 'l2_leaf_reg': 0.1254664120368273, 'random_strength': 10.351595857185522, 'bagging_temperature': 5.258362324702411, 'border_count': 85, 'scale_pos_weight': 4.050505353166065, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:55:03,172] Trial 281 finished with value: 0.23612903225806453 and parameters: {'iterations': 954, 'learning_rate': 0.008677934004806327, 'depth': 4, 'l2_leaf_reg': 0.12356261494078016, 'random_strength': 10.461667993010959, 'bagging_temperature': 5.349568250768916, 'border_count': 93, 'scale_pos_weight': 3.5939354357709914, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:55:06,903] Trial 282 finished with value: 0.23437500000000003 and parameters: {'iterations': 1010, 'learning_rate': 0.0082685731620

[I 2024-08-26 04:56:12,802] Trial 300 finished with value: 0.15294609277058085 and parameters: {'iterations': 1090, 'learning_rate': 0.005009501776635087, 'depth': 6, 'l2_leaf_reg': 0.7179278585286799, 'random_strength': 10.66312690131364, 'bagging_temperature': 8.193207495349812, 'border_count': 94, 'scale_pos_weight': 9.97156020158904, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:56:16,339] Trial 301 finished with value: 0.22605363984674326 and parameters: {'iterations': 953, 'learning_rate': 0.020373826576693167, 'depth': 5, 'l2_leaf_reg': 0.16046519809906312, 'random_strength': 10.270550130471728, 'bagging_temperature': 7.882695548863645, 'border_count': 75, 'scale_pos_weight': 3.7055134246840797, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:56:19,483] Trial 302 finished with value: 0.22790697674418603 and parameters: {'iterations': 1004, 'learning_rate': 0.01141927728353

[I 2024-08-26 04:57:57,067] Trial 320 finished with value: 0.2219873150105708 and parameters: {'iterations': 1092, 'learning_rate': 0.011398965665058916, 'depth': 3, 'l2_leaf_reg': 0.16075880996571298, 'random_strength': 10.947646107851725, 'bagging_temperature': 8.137512836938502, 'border_count': 85, 'scale_pos_weight': 4.098110319543926, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:58:01,339] Trial 321 finished with value: 0.23118696664080682 and parameters: {'iterations': 1159, 'learning_rate': 0.005136586249816276, 'depth': 5, 'l2_leaf_reg': 0.17752177650872128, 'random_strength': 10.752183482058, 'bagging_temperature': 8.275203469825291, 'border_count': 98, 'scale_pos_weight': 3.239289138263469, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:58:04,551] Trial 322 finished with value: 0.23177842565597667 and parameters: {'iterations': 931, 'learning_rate': 0.0085080457010623

[I 2024-08-26 04:58:53,683] Trial 340 finished with value: 0.21387283236994223 and parameters: {'iterations': 648, 'learning_rate': 0.15227576181734648, 'depth': 3, 'l2_leaf_reg': 0.17501760146558534, 'random_strength': 11.530562673651284, 'bagging_temperature': 5.470328886853444, 'border_count': 89, 'scale_pos_weight': 3.4043570436464274, 'grow_policy': 'Depthwise'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:58:56,183] Trial 341 finished with value: 0.22816679779701024 and parameters: {'iterations': 794, 'learning_rate': 0.014593720390670977, 'depth': 3, 'l2_leaf_reg': 0.13483369081737961, 'random_strength': 11.750669262234352, 'bagging_temperature': 4.975296817409518, 'border_count': 100, 'scale_pos_weight': 3.161681123072831, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 04:58:58,951] Trial 342 finished with value: 0.22954977805960683 and parameters: {'iterations': 883, 'learning_rate': 0.01119890358657367

[I 2024-08-26 05:00:04,935] Trial 360 finished with value: 0.23313782991202345 and parameters: {'iterations': 913, 'learning_rate': 0.008691491478061978, 'depth': 3, 'l2_leaf_reg': 0.10530966835244288, 'random_strength': 11.161459055170061, 'bagging_temperature': 7.058331248811092, 'border_count': 88, 'scale_pos_weight': 3.3695336938960656, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 05:00:07,999] Trial 361 finished with value: 0.22847682119205298 and parameters: {'iterations': 986, 'learning_rate': 0.022347781777576937, 'depth': 3, 'l2_leaf_reg': 0.40359093653955946, 'random_strength': 11.526712031872206, 'bagging_temperature': 5.051493700662351, 'border_count': 96, 'scale_pos_weight': 3.0723838710195404, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 05:00:10,641] Trial 362 finished with value: 0.22366097704532076 and parameters: {'iterations': 844, 'learning_rate': 0.011837968379

[I 2024-08-26 05:01:57,067] Trial 380 finished with value: 0.23333333333333334 and parameters: {'iterations': 1115, 'learning_rate': 0.007681524213697887, 'depth': 4, 'l2_leaf_reg': 0.10021122117490561, 'random_strength': 10.700350519325061, 'bagging_temperature': 4.491838174954221, 'border_count': 100, 'scale_pos_weight': 3.2891666989048627, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 05:01:59,763] Trial 381 finished with value: 0.22885572139303478 and parameters: {'iterations': 857, 'learning_rate': 0.0051808694394642665, 'depth': 3, 'l2_leaf_reg': 0.17001545338574883, 'random_strength': 11.231493213401976, 'bagging_temperature': 8.052545559186916, 'border_count': 80, 'scale_pos_weight': 3.5243394863555753, 'grow_policy': 'SymmetricTree'}. Best is trial 117 with value: 0.24251069900142652.
[I 2024-08-26 05:02:04,807] Trial 382 finished with value: 0.2198053806525472 and parameters: {'iterations': 991, 'learning_rate': 0.0118419112

Best trial: score 0.24251069900142652, 
params {'iterations': 987, 'learning_rate': 0.011942535987477916, 'depth': 4, 'l2_leaf_reg': 0.17913310604185315, 'random_strength': 11.61512820207189, 'bagging_temperature': 7.9524408302966885, 'border_count': 143, 'scale_pos_weight': 3.361391569468003, 'grow_policy': 'SymmetricTree'}


1. 0.24251069900142652

params {'iterations': 987, 'learning_rate': 0.011942535987477916, 'depth': 4, 'l2_leaf_reg': 0.17913310604185315, 'random_strength': 11.61512820207189, 'bagging_temperature': 7.9524408302966885, 'border_count': 143, 'scale_pos_weight': 3.361391569468003, 'grow_policy': 'SymmetricTree'}

In [13]:
# Registry of estimators built with their default settings, keyed by the
# short names that train_and_evaluate_model accepts.
models = dict(
    et=ExtraTreesClassifier(),
    rf=RandomForestClassifier(),
    cat=CatBoostClassifier(),
    lgbm=LGBMClassifier(),
    xgb=XGBClassifier(),
    dt=DecisionTreeClassifier(),
)

def train_and_evaluate_model(model_name, data, **params):
    """Train a registered model on a hold-out split of ``data`` and print its metrics.

    Args:
        model_name: Key into the module-level ``models`` registry.
        data: DataFrame with the feature columns plus a ``target`` column
            holding 'Normal' / 'AbNormal' labels.
        **params: Hyperparameters forwarded to the estimator's ``set_params``.

    Returns:
        None. The F1 score, confusion matrix, accuracy, precision and recall
        on the validation split are printed. An unsupported ``model_name``
        prints a message and returns without training.

    Raises:
        ValueError: If ``target`` contains a label other than 'Normal' or
            'AbNormal'.
    """
    if model_name not in models:
        print(f"{model_name}은(는) 지원되지 않는 모델입니다.")
        return

    # Find the global variable name bound to `data` for the report header.
    # This runs before training so a failed lookup cannot waste a fit, skips
    # underscore-prefixed names (e.g. notebook output-cache aliases) and falls
    # back to a placeholder instead of raising IndexError when `data` is not
    # bound to any global.
    data_name = next(
        (
            name
            for name, value in list(globals().items())
            if value is data and not name.startswith('_')
        ),
        '<unknown>',
    )

    # Encode the labels; 'AbNormal' is the positive class. Unmapped labels
    # would become NaN, so they are rejected explicitly.
    target = data["target"].map({'Normal': 0, 'AbNormal': 1})
    if target.isna().any():
        raise ValueError(
            "target must contain only 'Normal' / 'AbNormal' labels"
        )

    # Hold out 20% of the rows for validation.
    x_train, x_val, y_train, y_val = train_test_split(
        data.drop("target", axis=1),
        target,
        test_size=0.2,
        shuffle=True,
        random_state=RANDOM_STATE,
    )

    # NOTE: this is the shared instance stored in `models`; set_params and fit
    # mutate it in place, so hyperparameters set by an earlier call stay in
    # effect unless this call overrides them.
    model = models[model_name]
    model.set_params(**params)
    model.fit(x_train, y_train)

    # Probability of the positive class, then hard labels at THRESHOLD.
    y_val_pred_proba = model.predict_proba(x_val)[:, 1]
    y_val_pred = (y_val_pred_proba >= THRESHOLD).astype(int)

    # Validation metrics (computed once).
    f1 = f1_score(y_val, y_val_pred, average="binary")
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    conf_matrix = confusion_matrix(y_val, y_val_pred)

    # Report.
    print(f'{model_name} 모델이 {data_name} 데이터로 학습한 결과:')
    print(f'F1 Score: {f1}')
    print('---')
    print('Confusion Matrix:')
    print(conf_matrix)
    print('---')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print('\n')

In [14]:
# Evaluate CatBoost on the Fill2 subset with a fixed set of hyperparameters.
# The original call contained `min_data_in_leaf = ,` with no value, which is a
# syntax error. The argument is dropped rather than guessed: grow_policy is
# 'SymmetricTree', and the CatBoost parameter reference lists min_data_in_leaf
# as usable only with the Depthwise and Lossguide policies.
train_and_evaluate_model(
    'cat', train_data_fill2,
    iterations=481,
    learning_rate=0.018742270357007457,
    depth=5,
    l2_leaf_reg=1.0871571324663387,
    random_strength=3.49632241801363,
    bagging_temperature=5.717049796462913,
    border_count=183,
    scale_pos_weight=3.4406776189795383,
    grow_policy='SymmetricTree',

    random_state=RANDOM_STATE,
    eval_metric='F1',
    logging_level='Silent',
    boosting_type='Plain',
)

cat 모델이 train_data_fill2 데이터로 학습한 결과:
F1 Score: 0.24322446143154972
---
Confusion Matrix:
[[6838  824]
 [ 265  175]]
---
Accuracy: 0.8655887435201185
Precision: 0.17517517517517517
Recall: 0.3977272727272727


