# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred_proba, threshold=0.5):
    """Print classification metrics for thresholded probability scores.

    Args:
        y_test: True labels. The metric calls below use scikit-learn's
            default settings, which expect a binary target.
        y_pred_proba: Predicted scores for the positive class. Any
            array-like is accepted (list, ndarray, pandas Series).
        threshold: Scores greater than or equal to this value are labelled
            1, all others 0. Defaults to 0.5.

    Returns:
        None. The confusion matrix, accuracy, precision, recall and F1
        score are printed to standard output.
    """
    # Coerce to an ndarray first: comparing a plain Python list with a float
    # raises TypeError, so list inputs would otherwise crash here.
    y_pred = (np.asarray(y_pred_proba) >= threshold).astype(int)

    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Confusion Matrix:\n", confusion)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

### 데이터 읽어오기


In [2]:
import pandas as pd

# Probability cut-off constant.
# NOTE(review): nothing in the visible part of this notebook reads THRESHOLD;
# confirm where it is meant to be consumed.
THRESHOLD = 0.3
# Seed reused for the decision tree, AdaBoost, the train/validation split and
# the Optuna sampler further down in this notebook.
RANDOM_STATE = 110

# Load the training and test tables from CSV files under ./data (paths are
# relative to the current working directory).
train_data = pd.read_csv("./data/train_data_0825.csv")
test_data = pd.read_csv("./data/test_data_0825.csv")

---

## 3. 모델 학습

### 모델 정의

Optuna를 사용하여 AdaBoost 분류기의 하이퍼파라미터를 탐색한다.

In [3]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Encode the string labels as integers: 'Normal' -> 0, 'AbNormal' -> 1.
# The dtype guard keeps this cell safe to re-run: mapping an already numeric
# column again would match no dictionary key and turn every label into NaN.
if not pd.api.types.is_numeric_dtype(train_data['target']):
    train_data['target'] = train_data['target'].map({'Normal': 0, 'AbNormal': 1})

# Decision threshold setting.
# NOTE(review): the tuning objective below scores with model.predict(), which
# does not read this value; confirm where the threshold is meant to apply.
THRESHOLD = 0.3

def objectiveAdaBoost(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective: fit one AdaBoost candidate and score it with F1.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        x_tr: Training features passed to ``fit``.
        y_tr: Training labels passed to ``fit``.
        x_val: Validation features passed to ``predict``.
        y_val: Validation labels the predictions are scored against.

    Returns:
        The binary-averaged F1 score of the fitted model's ``predict``
        output on the validation split.
    """
    # Tree parameters are sampled first, in this exact order, so that a seeded
    # sampler draws the same sequence of values as before.
    tree_params = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
    }
    weak_learner = DecisionTreeClassifier(random_state=RANDOM_STATE, **tree_params)

    # Boosting parameters are sampled after the tree parameters.
    n_rounds = trial.suggest_int('n_estimators', 50, 500)
    step_size = trial.suggest_float('learning_rate', 0.001, 0.3)

    booster = AdaBoostClassifier(
        n_estimators=n_rounds,
        learning_rate=step_size,
        estimator=weak_learner,
        random_state=RANDOM_STATE,
    )
    booster.fit(x_tr, y_tr)

    val_labels = booster.predict(x_val)
    return f1_score(y_val, val_labels, average="binary")

# Hold out 20% of the shuffled training table as a validation split; the seed
# keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    train_data.drop(columns="target"),
    train_data["target"],
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Hyperparameter search: maximise the objective's F1 score with a seeded TPE
# sampler over 200 trials.
_tpe_sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(direction='maximize', sampler=_tpe_sampler)


def _run_trial(trial):
    """Bind the train/validation split to the AdaBoost objective."""
    return objectiveAdaBoost(trial, x_train, y_train, x_val, y_val)


study.optimize(_run_trial, n_trials=200)

# Report the best score together with the parameters that produced it.
_best_trial = study.best_trial
print('Best trial: score {}, \nparams {}'.format(_best_trial.value, _best_trial.params))


[I 2024-08-26 08:33:59,493] A new study created in memory with name: no-name-be1869fd-3d7c-4756-986f-83c7ca8d2c0a
[I 2024-08-26 08:34:13,962] Trial 0 finished with value: 0.15918367346938775 and parameters: {'max_depth': 2, 'min_samples_split': 14, 'min_samples_leaf': 8, 'max_features': 0.6515043233998075, 'n_estimators': 356, 'learning_rate': 0.20505647563755436}. Best is trial 0 with value: 0.15918367346938775.
[I 2024-08-26 08:34:24,977] Trial 1 finished with value: 0.16697588126159557 and parameters: {'max_depth': 10, 'min_samples_split': 14, 'min_samples_leaf': 11, 'max_features': 0.1775586959624329, 'n_estimators': 224, 'learning_rate': 0.087842652266996}. Best is trial 1 with value: 0.16697588126159557.
[I 2024-08-26 08:34:27,109] Trial 2 finished with value: 0.15289256198347106 and parameters: {'max_depth': 2, 'min_samples_split': 2, 'min_samples_leaf': 9, 'max_features': 0.5433318093400571, 'n_estimators': 59, 'learning_rate': 0.16829351079096685}. Best is trial 1 with value: 

KeyboardInterrupt: 

Trial 104 finished with value: 0.21328671328671328  
parameters: {'max_depth': 4, 'min_samples_split': 15, 'min_samples_leaf': 17, 'max_features': 0.7009684986301644,  
'n_estimators': 473, 'learning_rate': 0.25112141981467107}.