# 제품 이상여부 판별 프로젝트


### 데이터 읽어오기


In [1]:
import os
from pprint import pprint

import optuna
import numpy as np
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
import pandas as pd

# Seed reused by the model, the train/validation split and the Optuna sampler.
RANDOM_STATE = 110

# Load the train and test tables (train first, then test) from ./data.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train_data_0827.csv',
        './data/test_data_0827.csv',
    )
)

In [3]:
# Columns shared by the dam, fill1 and fill2 feature subsets.
var_dam_fill = [
    "Receip_No_encoded",
    "Equipment_same_num",
    "PalletID_Collect_Result_encoded",
    "Production_Qty_Collect_Result",
    "WorkMode Collect Result",
]

In [4]:
# Columns shared by every feature subset.
# Train: the 'target' label plus two encoded columns.
var_all_train = [
    "target",
    "model_suffix_encoded",
    "cleaned_workorder_encoded",
]

# Test: the same columns, preceded by the 'Set ID' column.
var_all_test = ["Set ID", *var_all_train]

In [5]:
# Collect every column whose name contains '_Dam'.
# The names are taken from the train table and reused for the test table.
dam_variables = [col for col in train_data.columns if '_Dam' in col]

# Train subset: dam/fill common columns + global train columns + dam columns.
final_columns_train = [*var_dam_fill, *var_all_train, *dam_variables]
train_data_dam = train_data[final_columns_train]

# Test subset: same layout, using the global test columns instead.
final_columns_test = [*var_dam_fill, *var_all_test, *dam_variables]
test_data_dam = test_data[final_columns_test]

In [6]:
# Collect every column whose name contains '_Fill1'.
# The names are taken from the train table and reused for the test table.
fill1_variables = [col for col in train_data.columns if '_Fill1' in col]

# Train subset: dam/fill common columns + global train columns + fill1 columns.
final_columns_train = [*var_dam_fill, *var_all_train, *fill1_variables]
train_data_fill1 = train_data[final_columns_train]

# Test subset: same layout, using the global test columns instead.
final_columns_test = [*var_dam_fill, *var_all_test, *fill1_variables]
test_data_fill1 = test_data[final_columns_test]

In [7]:
# Collect every column whose name contains '_Fill2'.
# The names are taken from the train table and reused for the test table.
fill2_variables = [col for col in train_data.columns if '_Fill2' in col]

# Train subset: dam/fill common columns + global train columns + fill2 columns.
final_columns_train = [*var_dam_fill, *var_all_train, *fill2_variables]
train_data_fill2 = train_data[final_columns_train]

# Test subset: same layout, using the global test columns instead.
final_columns_test = [*var_dam_fill, *var_all_test, *fill2_variables]
test_data_fill2 = test_data[final_columns_test]

In [8]:
# Collect every column whose name contains '_AutoClave'.
# The names are taken from the train table and reused for the test table.
autoclave_variables = [col for col in train_data.columns if '_AutoClave' in col]

# Train subset: global train columns + autoclave columns
# (the dam/fill common columns are not included here).
final_columns_train = [*var_all_train, *autoclave_variables]
train_data_autoclave = train_data[final_columns_train]

# Test subset: same layout, using the global test columns instead.
final_columns_test = [*var_all_test, *autoclave_variables]
test_data_autoclave = test_data[final_columns_test]

In [9]:
# Column count of the full train table and of each train subset.
num_columns_train_data = len(train_data.columns)
num_columns_train_data_dam = len(train_data_dam.columns)
num_columns_train_data_autoclave = len(train_data_autoclave.columns)
num_columns_train_data_fill1 = len(train_data_fill1.columns)
num_columns_train_data_fill2 = len(train_data_fill2.columns)

# Column count of the full test table and of each test subset.
num_columns_test_data = len(test_data.columns)
num_columns_test_data_dam = len(test_data_dam.columns)
num_columns_test_data_autoclave = len(test_data_autoclave.columns)
num_columns_test_data_fill1 = len(test_data_fill1.columns)
num_columns_test_data_fill2 = len(test_data_fill2.columns)

# Report the counts, one line per DataFrame (train first, then test).
print(
    "----train data-----",
    f"train_data DataFrame의 칼럼 수: {num_columns_train_data}",
    f"train_data_dam DataFrame의 칼럼 수: {num_columns_train_data_dam}",
    f"train_data_autoclave DataFrame의 칼럼 수: {num_columns_train_data_autoclave}",
    f"train_data_fill1 DataFrame의 칼럼 수: {num_columns_train_data_fill1}",
    f"train_data_fill2 DataFrame의 칼럼 수: {num_columns_train_data_fill2}",
    sep="\n",
)
print(
    "----test data-----",
    f"test_data DataFrame의 칼럼 수: {num_columns_test_data}",
    f"test_data_dam DataFrame의 칼럼 수: {num_columns_test_data_dam}",
    f"test_data_autoclave DataFrame의 칼럼 수: {num_columns_test_data_autoclave}",
    f"test_data_fill1 DataFrame의 칼럼 수: {num_columns_test_data_fill1}",
    f"test_data_fill2 DataFrame의 칼럼 수: {num_columns_test_data_fill2}",
    sep="\n",
)

----train data-----
train_data DataFrame의 칼럼 수: 40
train_data_dam DataFrame의 칼럼 수: 23
train_data_autoclave DataFrame의 칼럼 수: 8
train_data_fill1 DataFrame의 칼럼 수: 14
train_data_fill2 DataFrame의 칼럼 수: 14
----test data-----
test_data DataFrame의 칼럼 수: 41
test_data_dam DataFrame의 칼럼 수: 24
test_data_autoclave DataFrame의 칼럼 수: 9
test_data_fill1 DataFrame의 칼럼 수: 15
test_data_fill2 DataFrame의 칼럼 수: 15


---

## Optuna

임계값(threshold)을 0.3으로 맞춘 상태에서 튜닝을 진행한 결과

In [10]:
# Seed reused by the model, the train/validation split and the Optuna sampler.
RANDOM_STATE: int = 110
# Probability cut-off: a sample is predicted as class 1 when its class-1
# probability is greater than or equal to this value.
THRESHOLD: float = 0.3

In [11]:
# Encode the label as integers: 'Normal' -> 0, 'AbNormal' -> 1.
# Series.map turns every value missing from the mapping into NaN, so mapping
# an already-encoded column (e.g. when this cell is run twice) would wipe the
# labels. The conversion is therefore skipped once the column is numeric.
if not pd.api.types.is_numeric_dtype(train_data['target']):
    encoded_target = train_data['target'].map({'Normal': 0, 'AbNormal': 1})
    unmapped = encoded_target.isna()
    if unmapped.any():
        # Fail here with the offending values instead of passing NaN labels
        # on to the split / model fitting further down.
        unexpected = train_data.loc[unmapped, 'target'].unique().tolist()
        raise ValueError(
            f"Unexpected value(s) in 'target' (expected 'Normal' or 'AbNormal'): {unexpected}"
        )
    train_data['target'] = encoded_target

# Decision threshold applied to the predicted class-1 probability.
THRESHOLD = 0.3

def objectiveRandomForestClassifier(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective: fit a random forest and return its validation F1.

    Hyper-parameters are sampled from ``trial``, the forest is fitted on the
    training split, and the class-1 probabilities predicted for the validation
    split are binarised with the module-level ``THRESHOLD`` before scoring.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        x_tr: Training features passed to ``RandomForestClassifier.fit``.
        y_tr: Training labels (assumed to be encoded as 0/1 so that column 1
            of ``predict_proba`` is the positive class).
        x_val: Validation features.
        y_val: Validation labels.

    Returns:
        Binary F1 score of the thresholded predictions on the validation split.
    """
    param = {
        'n_estimators' : trial.suggest_int('n_estimators', 800, 3000),
        'max_depth' : trial.suggest_int('max_depth', 20, 100),
        'min_samples_split' : trial.suggest_int('min_samples_split', 2, 16),
        'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 1, 8),
        # Single-choice categoricals: fixed values that are still recorded
        # in the trial parameters.
        'criterion' : trial.suggest_categorical("criterion", ["entropy",]),
        'class_weight' : trial.suggest_categorical("class_weight", ["balanced"]),
        'random_state': RANDOM_STATE
    }

    # n_jobs=-1 builds and evaluates the trees on all available cores; it is
    # kept out of `param` because it is not a tuned hyper-parameter.
    # NOTE(review): the seed still fixes the fitted trees; confirm that the
    # extra memory/CPU use is acceptable on the training machine.
    model = RandomForestClassifier(**param, n_jobs=-1)
    model.fit(x_tr, y_tr)
    pred_proba = model.predict_proba(x_val)[:, 1]  # probability of class 1
    pred = (pred_proba >= THRESHOLD).astype(int)  # apply the decision threshold

    score = f1_score(y_val, pred, average="binary")

    return score

# Train/validation split: 20% of the rows are held out, and every column of
# train_data except 'target' is used as a feature.
# NOTE(review): the split is not stratified; if the classes are imbalanced,
# consider passing stratify=train_data["target"].
x_train, x_val, y_train, y_val = train_test_split(
    train_data.drop(columns="target"),
    train_data["target"],
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Hyper-parameter search: maximise the validation F1 with a seeded TPE sampler.
tpe_sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)
study.optimize(
    lambda trial: objectiveRandomForestClassifier(trial, x_train, y_train, x_val, y_val),
    n_trials=300,
)

# Report the best score and the parameters that produced it.
best_trial = study.best_trial
print('Best trial: score {}, \nparams {}'.format(best_trial.value, best_trial.params))

[I 2024-08-28 02:53:00,685] A new study created in memory with name: no-name-ee781b96-69a4-413f-b1fc-7f1f8090a4e5
[I 2024-08-28 02:54:20,364] Trial 0 finished with value: 0.18971542685971043 and parameters: {'n_estimators': 1055, 'max_depth': 73, 'min_samples_split': 7, 'min_samples_leaf': 5, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.18971542685971043.
[I 2024-08-28 02:56:42,169] Trial 1 finished with value: 0.1843506759524785 and parameters: {'n_estimators': 2295, 'max_depth': 75, 'min_samples_split': 16, 'min_samples_leaf': 6, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.18971542685971043.
[I 2024-08-28 02:58:23,110] Trial 2 finished with value: 0.19744058500914077 and parameters: {'n_estimators': 1928, 'max_depth': 26, 'min_samples_split': 7, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 2 with value: 0.19744058500914077.
[I 2024-08-28 02:59:19,105] Trial 3 fini

[I 2024-08-28 04:02:42,134] Trial 29 finished with value: 0.19258496395468588 and parameters: {'n_estimators': 2470, 'max_depth': 76, 'min_samples_split': 15, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 10 with value: 0.22811059907834103.
[I 2024-08-28 04:04:11,366] Trial 30 finished with value: 0.21297836938435938 and parameters: {'n_estimators': 1223, 'max_depth': 94, 'min_samples_split': 7, 'min_samples_leaf': 2, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 10 with value: 0.22811059907834103.
[I 2024-08-28 04:06:10,163] Trial 31 finished with value: 0.22584692597239647 and parameters: {'n_estimators': 2133, 'max_depth': 89, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 10 with value: 0.22811059907834103.
[I 2024-08-28 04:08:56,719] Trial 32 finished with value: 0.2255125284738041 and parameters: {'n_estimators': 2042, 'max_depth': 88, 'min_samples_s

[I 2024-08-28 04:48:54,585] Trial 58 finished with value: 0.2124600638977636 and parameters: {'n_estimators': 810, 'max_depth': 91, 'min_samples_split': 10, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 52 with value: 0.23023255813953486.
[I 2024-08-28 04:49:52,663] Trial 59 finished with value: 0.21444201312910285 and parameters: {'n_estimators': 1076, 'max_depth': 69, 'min_samples_split': 3, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 52 with value: 0.23023255813953486.
[I 2024-08-28 04:50:42,811] Trial 60 finished with value: 0.22682445759368833 and parameters: {'n_estimators': 899, 'max_depth': 78, 'min_samples_split': 7, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 52 with value: 0.23023255813953486.
[I 2024-08-28 04:51:38,398] Trial 61 finished with value: 0.22890173410404624 and parameters: {'n_estimators': 981, 'max_depth': 98, 'min_samples_spli

[I 2024-08-28 05:17:16,482] Trial 87 finished with value: 0.22778473091364204 and parameters: {'n_estimators': 1051, 'max_depth': 82, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 52 with value: 0.23023255813953486.
[I 2024-08-28 05:18:02,634] Trial 88 finished with value: 0.2180376610505451 and parameters: {'n_estimators': 801, 'max_depth': 63, 'min_samples_split': 3, 'min_samples_leaf': 2, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 52 with value: 0.23023255813953486.
[I 2024-08-28 05:19:05,416] Trial 89 finished with value: 0.18210238374364984 and parameters: {'n_estimators': 922, 'max_depth': 87, 'min_samples_split': 4, 'min_samples_leaf': 7, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 52 with value: 0.23023255813953486.
[I 2024-08-28 05:20:29,665] Trial 90 finished with value: 0.21547799696509864 and parameters: {'n_estimators': 1281, 'max_depth': 69, 'min_samples_spli

[I 2024-08-28 05:47:22,374] Trial 116 finished with value: 0.23023255813953486 and parameters: {'n_estimators': 956, 'max_depth': 87, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 05:48:17,589] Trial 117 finished with value: 0.220708446866485 and parameters: {'n_estimators': 930, 'max_depth': 78, 'min_samples_split': 3, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 05:49:04,791] Trial 118 finished with value: 0.2291407222914072 and parameters: {'n_estimators': 800, 'max_depth': 87, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 05:49:51,470] Trial 119 finished with value: 0.2193675889328063 and parameters: {'n_estimators': 803, 'max_depth': 81, 'min_samples_sp

[I 2024-08-28 06:20:15,147] Trial 145 finished with value: 0.22863741339491916 and parameters: {'n_estimators': 1000, 'max_depth': 85, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 06:21:14,531] Trial 146 finished with value: 0.22929936305732485 and parameters: {'n_estimators': 1054, 'max_depth': 77, 'min_samples_split': 6, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 06:22:14,236] Trial 147 finished with value: 0.22905620360551432 and parameters: {'n_estimators': 1060, 'max_depth': 78, 'min_samples_split': 6, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 06:23:06,778] Trial 148 finished with value: 0.22811059907834103 and parameters: {'n_estimators': 930, 'max_depth': 75, 'min_sam

[I 2024-08-28 06:50:14,748] Trial 174 finished with value: 0.22929936305732485 and parameters: {'n_estimators': 1003, 'max_depth': 86, 'min_samples_split': 6, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 06:51:36,193] Trial 175 finished with value: 0.22706422018348624 and parameters: {'n_estimators': 851, 'max_depth': 90, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 06:52:49,127] Trial 176 finished with value: 0.2276595744680851 and parameters: {'n_estimators': 963, 'max_depth': 94, 'min_samples_split': 6, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 06:53:47,566] Trial 177 finished with value: 0.22784810126582278 and parameters: {'n_estimators': 1029, 'max_depth': 97, 'min_sampl

[I 2024-08-28 07:20:49,167] Trial 203 finished with value: 0.22863741339491916 and parameters: {'n_estimators': 927, 'max_depth': 90, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 07:22:19,399] Trial 204 finished with value: 0.2291407222914072 and parameters: {'n_estimators': 965, 'max_depth': 87, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 07:23:52,781] Trial 205 finished with value: 0.22784810126582278 and parameters: {'n_estimators': 1027, 'max_depth': 92, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 07:25:06,587] Trial 206 finished with value: 0.22784810126582278 and parameters: {'n_estimators': 907, 'max_depth': 83, 'min_sample

[I 2024-08-28 07:47:42,160] Trial 232 finished with value: 0.18871692461308037 and parameters: {'n_estimators': 950, 'max_depth': 97, 'min_samples_split': 5, 'min_samples_leaf': 5, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 07:48:21,053] Trial 233 finished with value: 0.22916666666666666 and parameters: {'n_estimators': 1014, 'max_depth': 94, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 07:48:56,510] Trial 234 finished with value: 0.2296983758700696 and parameters: {'n_estimators': 937, 'max_depth': 99, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 07:49:34,157] Trial 235 finished with value: 0.22863741339491916 and parameters: {'n_estimators': 988, 'max_depth': 82, 'min_sample

[I 2024-08-28 08:05:39,408] Trial 261 finished with value: 0.22837370242214533 and parameters: {'n_estimators': 909, 'max_depth': 90, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 08:06:15,612] Trial 262 finished with value: 0.22790202342917998 and parameters: {'n_estimators': 965, 'max_depth': 93, 'min_samples_split': 6, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 08:06:47,878] Trial 263 finished with value: 0.22706422018348624 and parameters: {'n_estimators': 875, 'max_depth': 88, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 107 with value: 0.23144654088050312.
[I 2024-08-28 08:07:26,778] Trial 264 finished with value: 0.22758620689655173 and parameters: {'n_estimators': 1055, 'max_depth': 81, 'min_sampl

[I 2024-08-28 08:26:45,648] Trial 290 finished with value: 0.22584692597239647 and parameters: {'n_estimators': 2240, 'max_depth': 67, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 288 with value: 0.23543990086741018.
[I 2024-08-28 08:27:22,777] Trial 291 finished with value: 0.21821631878557876 and parameters: {'n_estimators': 1017, 'max_depth': 25, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 288 with value: 0.23543990086741018.
[I 2024-08-28 08:28:04,242] Trial 292 finished with value: 0.22443890274314213 and parameters: {'n_estimators': 1098, 'max_depth': 34, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 288 with value: 0.23543990086741018.
[I 2024-08-28 08:28:41,551] Trial 293 finished with value: 0.22971285892634205 and parameters: {'n_estimators': 989, 'max_depth': 82, 'min_sam

Best trial: score 0.23543990086741018, 
params {'n_estimators': 1047, 'max_depth': 39, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}


Best trial: score 0.23543990086741018, 
params {'n_estimators': 1047, 'max_depth': 39, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}