# 제품 이상여부 판별 프로젝트


### 데이터 읽어오기


In [1]:
import os
from pprint import pprint

import optuna
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

THRESHOLD = 0.3
RANDOM_STATE = 110

train_data = pd.read_csv("./data/train_data_ver4.csv")
test_data = pd.read_csv("./data/test_data_ver4.csv")

In [3]:
# dam, fill1, fill2 공통 변수
var_dam_fill = [
    'PalletID_Collect_Result',
    'Production_Qty_Collect_Result',
    'Receip_No_encoded'
]

In [4]:
# 전체 공통 변수
### correlation 확인을 위한 변수 리스트
var_all_corr = [
    'model_suffix_encoded',
    'cleaned_workorder_encoded'
]

### train
var_all_train = [
    'target',
    'model_receip_combined_encoded',
    'cleaned_workorder_encoded',
    'time_gap_All'
]

### test
var_all_test = [
    'Set ID',
    'target',
    'model_receip_combined_encoded',
    'cleaned_workorder_encoded',
    'time_gap_All'
]

In [5]:
# '_Dam'을 포함하는 변수 선택
dam_variables = [var for var in train_data.columns if '_Dam' in var]

# train
final_columns_train = var_dam_fill + var_all_train + dam_variables
train_data_dam = train_data[final_columns_train]

# test 
final_columns_test = var_dam_fill + var_all_test + dam_variables
test_data_dam = test_data[final_columns_test]

In [6]:
# '_Fill1'을 포함하는 변수 선택
fill1_variables = [var for var in train_data.columns if '_Fill1' in var]

# train
final_columns_train = var_dam_fill + var_all_train + fill1_variables
train_data_fill1 = train_data[final_columns_train]

# test 
final_columns_test = var_dam_fill + var_all_test + fill1_variables
test_data_fill1 = test_data[final_columns_test]

In [7]:
# '_Fill2'을 포함하는 변수 선택
fill2_variables = [var for var in train_data.columns if '_Fill2' in var]

# train
final_columns_train = var_dam_fill + var_all_train + fill2_variables
train_data_fill2 = train_data[final_columns_train]

# test 
final_columns_test = var_dam_fill + var_all_test + fill2_variables
test_data_fill2 = test_data[final_columns_test]

In [8]:
# '_AutoClave'을 포함하는 변수 선택
autoclave_variables = [var for var in train_data.columns if '_AutoClave' in var]

# train
final_columns_train = var_all_train + autoclave_variables
train_data_autoclave = train_data[final_columns_train]

# test 
final_columns_test = var_all_test + autoclave_variables
test_data_autoclave = test_data[final_columns_test]

In [9]:
# 각 DataFrame의 칼럼 수 계산
num_columns_train_data = train_data.shape[1]
num_columns_train_data_dam = train_data_dam.shape[1]
num_columns_train_data_autoclave = train_data_autoclave.shape[1]
num_columns_train_data_fill1 = train_data_fill1.shape[1]
num_columns_train_data_fill2 = train_data_fill2.shape[1]

num_columns_test_data = test_data.shape[1]
num_columns_test_data_dam = test_data_dam.shape[1]
num_columns_test_data_autoclave = test_data_autoclave.shape[1]
num_columns_test_data_fill1 = test_data_fill1.shape[1]
num_columns_test_data_fill2 = test_data_fill2.shape[1]

# 각 DataFrame의 칼럼 수 출력
print("----train data-----")
print(f"train_data DataFrame의 칼럼 수: {num_columns_train_data}")
print(f"train_data_dam DataFrame의 칼럼 수: {num_columns_train_data_dam}")
print(f"train_data_autoclave DataFrame의 칼럼 수: {num_columns_train_data_autoclave}")
print(f"train_data_fill1 DataFrame의 칼럼 수: {num_columns_train_data_fill1}")
print(f"train_data_fill2 DataFrame의 칼럼 수: {num_columns_train_data_fill2}")
print("----test data-----")
print(f"test_data DataFrame의 칼럼 수: {num_columns_test_data}")
print(f"test_data_dam DataFrame의 칼럼 수: {num_columns_test_data_dam}")
print(f"test_data_autoclave DataFrame의 칼럼 수: {num_columns_test_data_autoclave}")
print(f"test_data_fill1 DataFrame의 칼럼 수: {num_columns_test_data_fill1}")
print(f"test_data_fill2 DataFrame의 칼럼 수: {num_columns_test_data_fill2}")

----train data-----
train_data DataFrame의 칼럼 수: 39
train_data_dam DataFrame의 칼럼 수: 20
train_data_autoclave DataFrame의 칼럼 수: 9
train_data_fill1 DataFrame의 칼럼 수: 14
train_data_fill2 DataFrame의 칼럼 수: 14
----test data-----
test_data DataFrame의 칼럼 수: 40
test_data_dam DataFrame의 칼럼 수: 21
test_data_autoclave DataFrame의 칼럼 수: 10
test_data_fill1 DataFrame의 칼럼 수: 15
test_data_fill2 DataFrame의 칼럼 수: 15


---

## Optuna

스레스홀드 0.3으로 맞춘상태에서 튜닝 진행한 것

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# 'Normal'과 'AbNormal'을 숫자로 변환
train_data_fill2['target'] = train_data_fill2['target'].map({'Normal': 0, 'AbNormal': 1})

# 스레드홀드 설정
THRESHOLD = 0.3


def objectiveLGBM_dart(trial, x_tr, y_tr, x_val, y_val):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
        'num_leaves': trial.suggest_int('num_leaves', 500, 3000),
        'max_depth': trial.suggest_int('max_depth', 10, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'min_child_samples': trial.suggest_int('min_child_samples', 3, 300),
        
        'boosting_type': 'dart',  # 'boosting'를 'boosting_type'으로 수정
        'random_state': RANDOM_STATE,
        'verbose': -1
    }
       
    model = LGBMClassifier(**param)
    model.fit(x_tr, y_tr)
    pred_proba = model.predict_proba(x_val)[:, 1]  # 양성 클래스 확률
    pred = (pred_proba >= THRESHOLD).astype(int)  # 스레드홀드에 따른 예측
    
    score = f1_score(y_val, pred, average="binary")
    
    return score

# 데이터셋 분할
x_train, x_val, y_train, y_val = train_test_split(
    train_data_fill2.drop("target", axis=1),
    train_data_fill2["target"],
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# 하이퍼 파라미터 튜닝
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
study.optimize(lambda trial: objectiveLGBM_dart(trial, x_train, y_train, x_val, y_val), n_trials=300)

print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_fill2['target'] = train_data_fill2['target'].map({'Normal': 0, 'AbNormal': 1})
[I 2024-09-28 23:07:54,744] A new study created in memory with name: no-name-cdc4ea64-97ef-4752-bfbb-628e95467159
[I 2024-09-28 23:08:41,502] Trial 0 finished with value: 0.24408014571949002 and parameters: {'n_estimators': 790, 'num_leaves': 2146, 'max_depth': 119, 'learning_rate': 0.06166547557397882, 'min_child_samples': 205}. Best is trial 0 with value: 0.24408014571949002.
[I 2024-09-28 23:14:14,349] Trial 1 finished with value: 0.19911504424778761 and parameters: {'n_estimators': 2206, 'num_leaves': 2888, 'max_depth': 195, 'learning_rate': 0.05178141716690417, 'min_child_samples': 28}. Best is trial 0 with value: 0.2440801457194900

[I 2024-09-29 00:03:14,408] Trial 29 finished with value: 0.21939586645469 and parameters: {'n_estimators': 2701, 'num_leaves': 2655, 'max_depth': 207, 'learning_rate': 0.04618696128522181, 'min_child_samples': 257}. Best is trial 15 with value: 0.24581005586592178.
[I 2024-09-29 00:05:40,671] Trial 30 finished with value: 0.22222222222222224 and parameters: {'n_estimators': 1662, 'num_leaves': 2931, 'max_depth': 151, 'learning_rate': 0.03273690408542684, 'min_child_samples': 98}. Best is trial 15 with value: 0.24581005586592178.
[I 2024-09-29 00:07:18,435] Trial 31 finished with value: 0.23752151462994836 and parameters: {'n_estimators': 1330, 'num_leaves': 1672, 'max_depth': 91, 'learning_rate': 0.05952411254006535, 'min_child_samples': 222}. Best is trial 15 with value: 0.24581005586592178.
[I 2024-09-29 00:08:14,889] Trial 32 finished with value: 0.23247232472324722 and parameters: {'n_estimators': 895, 'num_leaves': 1282, 'max_depth': 75, 'learning_rate': 0.055226362355429805, 'mi

In [11]:
# 스레드홀드 설정
THRESHOLD = 0.3

# 모델 설정 및 하이퍼파라미터
models = {
    'et': ExtraTreesClassifier(),
    'rf': RandomForestClassifier(),
    'cat': CatBoostClassifier(),
    'lgbm': LGBMClassifier(),
    'xgb': XGBClassifier(),
    'dt': DecisionTreeClassifier()
}

def train_and_evaluate_model(model_name, data, **params):
    if model_name not in models:
        print(f"{model_name}은(는) 지원되지 않는 모델입니다.")
        return
    
    # 데이터셋 분할
    x_train, x_val, y_train, y_val = train_test_split(
        data.drop("target", axis=1),
        data["target"].map({'Normal': 0, 'AbNormal': 1}),
        test_size=0.2,
        shuffle=True,
        random_state=RANDOM_STATE,
    )

    # 모델 선택
    model = models[model_name]

    # 하이퍼파라미터 설정
    model.set_params(**params)

    # 모델 학습
    model.fit(x_train, y_train)

    # 데이터 이름을 자동으로 추출하기 위한 래퍼 함수
    data_name = [name for name in globals() if globals()[name] is data][0]

    # 예측
    y_val_pred_proba = model.predict_proba(x_val)[:, 1]  # 양성 클래스 확률
    y_val_pred = (y_val_pred_proba >= THRESHOLD).astype(int)  # 스레드홀드에 따른 예측

    # 평가지표 계산
    f1 = f1_score(y_val, y_val_pred, average="binary")
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    conf_matrix = confusion_matrix(y_val, y_val_pred)
    
    # 결과 출력
    print(f'{model_name} 모델이 {data_name} 데이터로 학습한 결과:')
    print(f'F1 Score: {f1}')
    print('---')
    print('Confusion Matrix:')
    print(conf_matrix)
    print('---')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print('\n')

In [12]:
train_and_evaluate_model(
    'xgb', train_data_fill2,
    n_estimators = 1998, 
    learning_rate = 0.030898693059763598, 
    max_depth = 8, 
    alpha = 0.0017554538174868774, 
    gamma = 0.0007257577447593802, 
    reg_alpha = 0.7581280398368035, 
    reg_lambda = 0.5872331353519633, 
    colsample_bytree = 0.56275606593282, 
    subsample = 0.8342870707789082,
    objective = 'binary:logistic',
    tree_method = 'exact',
    random_state=RANDOM_STATE
)

xgb 모델이 train_data_fill2 데이터로 학습한 결과:
F1 Score: 0.24233983286908078
---
Confusion Matrix:
[[7471  191]
 [ 353   87]]
---
Accuracy: 0.9328560849173043
Precision: 0.3129496402877698
Recall: 0.19772727272727272




xgb 모델이 train_data_fill2 데이터로 학습한 결과:

F1 Score: 0.24233983286908078

Confusion Matrix:
[[7471  191]
 [ 353   87]]

Accuracy: 0.9328560849173043
Precision: 0.3129496402877698
Recall: 0.19772727272727272




In [13]:
# # 2-5. Fill2
# train_model_Fill2 = train_and_evaluate_model(
#     'xgb', train_data_fill2
#     , n_estimators = 488
#     , learning_rate = 0.27456156507923796
#     , max_depth = 18
#     , alpha = 0.001345329538356762
#     , gamma = 0.001271261094255318
#     , reg_alpha = 0.8757519133030134
#     , reg_lambda = 0.08373579326505055
#     , colsample_bytree = 0.8186279659279335
#     , subsample = 0.24909941675865316
#     , objective = 'binary:logistic'
#     , tree_method = 'exact'
#     , random_state=RANDOM_STATE
# )

.

.