# 제품 이상여부 판별 프로젝트


### 데이터 읽어오기


In [17]:
import os
from pprint import pprint

import optuna
import numpy as np
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [18]:
import pandas as pd

RANDOM_STATE = 110

train_data = pd.read_csv('./data/train_data_0827.csv')
test_data = pd.read_csv('./data/test_data_0827.csv')

In [19]:
# dam, fill1, fill2 공통 변수
var_dam_fill = [
    'Receip_No_encoded',
    'Equipment_same_num',
    'PalletID_Collect_Result_encoded',
    'Production_Qty_Collect_Result',
    'WorkMode Collect Result'
]

In [20]:
# 전체 공통 변수
### train
var_all_train = [
    'target',
    'model_suffix_encoded',
    'cleaned_workorder_encoded'
]

### test
var_all_test = [
    'Set ID',
    'target',
    'model_suffix_encoded',
    'cleaned_workorder_encoded'
]

In [21]:
# '_Dam'을 포함하는 변수 선택
dam_variables = [var for var in train_data.columns if '_Dam' in var]

# train
final_columns_train = var_dam_fill + var_all_train + dam_variables
train_data_dam = train_data[final_columns_train]

# test 
final_columns_test = var_dam_fill + var_all_test + dam_variables
test_data_dam = test_data[final_columns_test]

In [22]:
# '_Fill1'을 포함하는 변수 선택
fill1_variables = [var for var in train_data.columns if '_Fill1' in var]

# train
final_columns_train = var_dam_fill + var_all_train + fill1_variables
train_data_fill1 = train_data[final_columns_train]

# test 
final_columns_test = var_dam_fill + var_all_test + fill1_variables
test_data_fill1 = test_data[final_columns_test]

In [23]:
# '_Fill2'을 포함하는 변수 선택
fill2_variables = [var for var in train_data.columns if '_Fill2' in var]

# train
final_columns_train = var_dam_fill + var_all_train + fill2_variables
train_data_fill2 = train_data[final_columns_train]

# test 
final_columns_test = var_dam_fill + var_all_test + fill2_variables
test_data_fill2 = test_data[final_columns_test]

In [24]:
# '_AutoClave'을 포함하는 변수 선택
autoclave_variables = [var for var in train_data.columns if '_AutoClave' in var]

# train
final_columns_train = var_all_train + autoclave_variables
train_data_autoclave = train_data[final_columns_train]

# test 
final_columns_test = var_all_test + autoclave_variables
test_data_autoclave = test_data[final_columns_test]

In [25]:
# 각 DataFrame의 칼럼 수 계산
num_columns_train_data = train_data.shape[1]
num_columns_train_data_dam = train_data_dam.shape[1]
num_columns_train_data_autoclave = train_data_autoclave.shape[1]
num_columns_train_data_fill1 = train_data_fill1.shape[1]
num_columns_train_data_fill2 = train_data_fill2.shape[1]

num_columns_test_data = test_data.shape[1]
num_columns_test_data_dam = test_data_dam.shape[1]
num_columns_test_data_autoclave = test_data_autoclave.shape[1]
num_columns_test_data_fill1 = test_data_fill1.shape[1]
num_columns_test_data_fill2 = test_data_fill2.shape[1]

# 각 DataFrame의 칼럼 수 출력
print("----train data-----")
print(f"train_data DataFrame의 칼럼 수: {num_columns_train_data}")
print(f"train_data_dam DataFrame의 칼럼 수: {num_columns_train_data_dam}")
print(f"train_data_autoclave DataFrame의 칼럼 수: {num_columns_train_data_autoclave}")
print(f"train_data_fill1 DataFrame의 칼럼 수: {num_columns_train_data_fill1}")
print(f"train_data_fill2 DataFrame의 칼럼 수: {num_columns_train_data_fill2}")
print("----test data-----")
print(f"test_data DataFrame의 칼럼 수: {num_columns_test_data}")
print(f"test_data_dam DataFrame의 칼럼 수: {num_columns_test_data_dam}")
print(f"test_data_autoclave DataFrame의 칼럼 수: {num_columns_test_data_autoclave}")
print(f"test_data_fill1 DataFrame의 칼럼 수: {num_columns_test_data_fill1}")
print(f"test_data_fill2 DataFrame의 칼럼 수: {num_columns_test_data_fill2}")

----train data-----
train_data DataFrame의 칼럼 수: 40
train_data_dam DataFrame의 칼럼 수: 23
train_data_autoclave DataFrame의 칼럼 수: 8
train_data_fill1 DataFrame의 칼럼 수: 14
train_data_fill2 DataFrame의 칼럼 수: 14
----test data-----
test_data DataFrame의 칼럼 수: 41
test_data_dam DataFrame의 칼럼 수: 24
test_data_autoclave DataFrame의 칼럼 수: 9
test_data_fill1 DataFrame의 칼럼 수: 15
test_data_fill2 DataFrame의 칼럼 수: 15


---

## Optuna

스레스홀드 0.3으로 맞춘상태에서 튜닝 진행한 것

In [26]:
RANDOM_STATE = 110
THRESHOLD = 0.3

In [27]:
# 'Normal'과 'AbNormal'을 숫자로 변환
train_data_fill1['target'] = train_data_fill1['target'].map({'Normal': 0, 'AbNormal': 1})

# 스레드홀드 설정
THRESHOLD = 0.3

def objectiveRandomForestClassifier(trial, x_tr, y_tr, x_val, y_val):
    param = {
        'n_estimators' : trial.suggest_int('n_estimators', 800, 3000),
        'max_depth' : trial.suggest_int('max_depth', 20, 100),
        'min_samples_split' : trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 1, 8),
        'criterion' : trial.suggest_categorical("criterion", ["entropy",]),
        'class_weight' : trial.suggest_categorical("class_weight", ["balanced"]),
        'random_state': RANDOM_STATE
    }
    
    model = RandomForestClassifier(**param)
    model.fit(x_tr, y_tr)
    pred_proba = model.predict_proba(x_val)[:, 1]  # 양성 클래스 확률
    pred = (pred_proba >= THRESHOLD).astype(int)  # 스레드홀드에 따른 예측
    
    score = f1_score(y_val, pred, average="binary")
    
    return score

# 데이터셋 분할
x_train, x_val, y_train, y_val = train_test_split(
    train_data_fill1.drop("target", axis=1),
    train_data_fill1["target"],
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# 하이퍼 파라미터 튜닝
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
study.optimize(lambda trial: objectiveRandomForestClassifier(trial, x_train, y_train, x_val, y_val), n_trials=300)

print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_fill1['target'] = train_data_fill1['target'].map({'Normal': 0, 'AbNormal': 1})
[I 2024-08-28 02:50:23,312] A new study created in memory with name: no-name-207bd1da-91f9-40b4-99ae-b5cb5d4d58cd
[I 2024-08-28 02:50:48,095] Trial 0 finished with value: 0.17314095449500552 and parameters: {'n_estimators': 1055, 'max_depth': 73, 'min_samples_split': 9, 'min_samples_leaf': 5, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.17314095449500552.
[I 2024-08-28 02:51:37,238] Trial 1 finished with value: 0.16571256402530884 and parameters: {'n_estimators': 2295, 'max_depth': 75, 'min_samples_split': 20, 'min_samples_leaf': 6, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 0

[I 2024-08-28 03:34:53,315] Trial 28 finished with value: 0.19417475728155342 and parameters: {'n_estimators': 2473, 'max_depth': 100, 'min_samples_split': 13, 'min_samples_leaf': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 10 with value: 0.19554848966613672.
[I 2024-08-28 03:36:19,225] Trial 29 finished with value: 0.18795888399412627 and parameters: {'n_estimators': 2448, 'max_depth': 76, 'min_samples_split': 8, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 10 with value: 0.19554848966613672.
[I 2024-08-28 03:37:33,728] Trial 30 finished with value: 0.19298245614035087 and parameters: {'n_estimators': 2106, 'max_depth': 87, 'min_samples_split': 10, 'min_samples_leaf': 2, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 10 with value: 0.19554848966613672.
[I 2024-08-28 03:38:37,249] Trial 31 finished with value: 0.1967725143154607 and parameters: {'n_estimators': 1800, 'max_depth': 95, 'min_samples

[I 2024-08-28 04:09:52,121] Trial 57 finished with value: 0.18027210884353742 and parameters: {'n_estimators': 1698, 'max_depth': 81, 'min_samples_split': 3, 'min_samples_leaf': 4, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 55 with value: 0.1978021978021978.
[I 2024-08-28 04:10:48,954] Trial 58 finished with value: 0.18745493871665464 and parameters: {'n_estimators': 1541, 'max_depth': 65, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 55 with value: 0.1978021978021978.
[I 2024-08-28 04:11:36,880] Trial 59 finished with value: 0.1967725143154607 and parameters: {'n_estimators': 1356, 'max_depth': 70, 'min_samples_split': 5, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 55 with value: 0.1978021978021978.
[I 2024-08-28 04:12:25,325] Trial 60 finished with value: 0.19750519750519752 and parameters: {'n_estimators': 1176, 'max_depth': 85, 'min_samples_split

[I 2024-08-28 04:33:06,159] Trial 86 finished with value: 0.18835370237239393 and parameters: {'n_estimators': 891, 'max_depth': 55, 'min_samples_split': 4, 'min_samples_leaf': 2, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 81 with value: 0.19947780678851176.
[I 2024-08-28 04:33:46,244] Trial 87 finished with value: 0.19958202716823406 and parameters: {'n_estimators': 1071, 'max_depth': 61, 'min_samples_split': 4, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 04:34:26,455] Trial 88 finished with value: 0.18135593220338983 and parameters: {'n_estimators': 1105, 'max_depth': 61, 'min_samples_split': 7, 'min_samples_leaf': 4, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 04:35:00,667] Trial 89 finished with value: 0.19812304483837329 and parameters: {'n_estimators': 921, 'max_depth': 51, 'min_samples_spl

[I 2024-08-28 04:53:54,424] Trial 115 finished with value: 0.19163292847503377 and parameters: {'n_estimators': 1086, 'max_depth': 56, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 04:54:38,583] Trial 116 finished with value: 0.19781363872982824 and parameters: {'n_estimators': 1217, 'max_depth': 70, 'min_samples_split': 5, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 04:55:12,952] Trial 117 finished with value: 0.19738903394255877 and parameters: {'n_estimators': 942, 'max_depth': 65, 'min_samples_split': 4, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 04:55:56,072] Trial 118 finished with value: 0.19002695417789756 and parameters: {'n_estimators': 1148, 'max_depth': 54, 'min_sample

[I 2024-08-28 05:12:51,078] Trial 144 finished with value: 0.1978021978021978 and parameters: {'n_estimators': 983, 'max_depth': 62, 'min_samples_split': 4, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 05:13:24,658] Trial 145 finished with value: 0.19832985386221294 and parameters: {'n_estimators': 932, 'max_depth': 60, 'min_samples_split': 4, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 05:13:57,207] Trial 146 finished with value: 0.18808327351040918 and parameters: {'n_estimators': 875, 'max_depth': 56, 'min_samples_split': 4, 'min_samples_leaf': 2, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 05:14:50,457] Trial 147 finished with value: 0.1992696922274387 and parameters: {'n_estimators': 1011, 'max_depth': 49, 'min_samples_sp

[I 2024-08-28 05:38:21,016] Trial 173 finished with value: 0.1992696922274387 and parameters: {'n_estimators': 1003, 'max_depth': 57, 'min_samples_split': 5, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 05:39:18,900] Trial 174 finished with value: 0.19822639540949402 and parameters: {'n_estimators': 1045, 'max_depth': 52, 'min_samples_split': 4, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 05:40:13,227] Trial 175 finished with value: 0.19843342036553524 and parameters: {'n_estimators': 955, 'max_depth': 66, 'min_samples_split': 5, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 05:41:14,715] Trial 176 finished with value: 0.19801980198019803 and parameters: {'n_estimators': 1148, 'max_depth': 59, 'min_samples

[I 2024-08-28 06:02:16,065] Trial 202 finished with value: 0.1992696922274387 and parameters: {'n_estimators': 1007, 'max_depth': 60, 'min_samples_split': 5, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 06:02:49,224] Trial 203 finished with value: 0.19718309859154928 and parameters: {'n_estimators': 910, 'max_depth': 50, 'min_samples_split': 5, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 06:04:07,693] Trial 204 finished with value: 0.19532467532467532 and parameters: {'n_estimators': 1952, 'max_depth': 61, 'min_samples_split': 5, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 06:05:05,742] Trial 205 finished with value: 0.19958202716823406 and parameters: {'n_estimators': 1071, 'max_depth': 54, 'min_samples

[I 2024-08-28 06:28:05,727] Trial 231 finished with value: 0.1992696922274387 and parameters: {'n_estimators': 1031, 'max_depth': 51, 'min_samples_split': 4, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 06:29:04,085] Trial 232 finished with value: 0.19937369519832987 and parameters: {'n_estimators': 1070, 'max_depth': 53, 'min_samples_split': 5, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 06:29:53,387] Trial 233 finished with value: 0.1978021978021978 and parameters: {'n_estimators': 977, 'max_depth': 52, 'min_samples_split': 4, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 06:30:46,728] Trial 234 finished with value: 0.19801980198019803 and parameters: {'n_estimators': 1060, 'max_depth': 50, 'min_samples_

[I 2024-08-28 06:53:35,861] Trial 260 finished with value: 0.16599040944300994 and parameters: {'n_estimators': 882, 'max_depth': 66, 'min_samples_split': 17, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 06:54:27,706] Trial 261 finished with value: 0.1990570979570456 and parameters: {'n_estimators': 984, 'max_depth': 47, 'min_samples_split': 3, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 06:55:25,082] Trial 262 finished with value: 0.1826965305226175 and parameters: {'n_estimators': 1062, 'max_depth': 58, 'min_samples_split': 11, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 06:56:15,673] Trial 263 finished with value: 0.1974921630094044 and parameters: {'n_estimators': 941, 'max_depth': 50, 'min_samples_s

[I 2024-08-28 07:16:04,564] Trial 289 finished with value: 0.17197924388435878 and parameters: {'n_estimators': 986, 'max_depth': 43, 'min_samples_split': 4, 'min_samples_leaf': 5, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 07:16:54,593] Trial 290 finished with value: 0.19634464751958222 and parameters: {'n_estimators': 901, 'max_depth': 41, 'min_samples_split': 4, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 07:17:47,513] Trial 291 finished with value: 0.19958202716823406 and parameters: {'n_estimators': 1073, 'max_depth': 46, 'min_samples_split': 3, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 87 with value: 0.19958202716823406.
[I 2024-08-28 07:18:41,550] Trial 292 finished with value: 0.19738903394255877 and parameters: {'n_estimators': 1090, 'max_depth': 44, 'min_samples

Best trial: score 0.19958202716823406, 
params {'n_estimators': 1071, 'max_depth': 61, 'min_samples_split': 4, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}


Best trial: score 0.19958202716823406, 
params {'n_estimators': 1071, 'max_depth': 61, 'min_samples_split': 4, 'min_samples_leaf': 3, 'criterion': 'entropy', 'class_weight': 'balanced'}