# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred_proba, threshold=0.5):
    """Print binary-classification metrics for thresholded scores.

    Args:
        y_test: True binary labels.
        y_pred_proba: Predicted positive-class scores/probabilities. Any
            array-like is accepted (list, tuple, ndarray, pandas Series).
        threshold: Cutoff applied to ``y_pred_proba``; values greater than
            or equal to it are labeled 1, the rest 0. Defaults to 0.5.

    Returns:
        None. The confusion matrix, accuracy, precision, recall and F1 score
        are printed to stdout.
    """
    # Coerce to an ndarray first so plain Python lists/tuples can be compared
    # element-wise against the threshold (a bare list >= float raises TypeError).
    y_pred = (np.asarray(y_pred_proba) >= threshold).astype(int)

    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Confusion Matrix:\n", confusion)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

### 데이터 읽어오기


In [8]:
import pandas as pd

# Notebook-wide settings.
RANDOM_STATE = 110  # seed passed to the splitter, the model and the sampler
THRESHOLD = 0.3  # cutoff applied to the positive-class probability

# Read the train and test CSV files (paths are relative to this notebook).
train_data = pd.read_csv(
    filepath_or_buffer="../../../data/train_data_0817.csv",
)
test_data = pd.read_csv(
    filepath_or_buffer="../../../data/test_data_0817.csv",
)

---

## 3. 모델 학습

### 모델 정의

Optuna를 이용한 하이퍼파라미터 탐색

In [9]:
# import optuna
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import f1_score

# # 'Normal'과 'AbNormal'을 숫자로 변환
# train_data['target'] = train_data['target'].map({'Normal': 0, 'AbNormal': 1})

# # 스레드홀드 설정
# THRESHOLD = 0.3

# def objectiveRF(trial, x_tr, y_tr, x_val, y_val):
#     param = {
#     'n_estimators' : trial.suggest_int('n_estimators', 1000, 3000),
#     'max_depth' : trial.suggest_int('max_depth', 30, 100),
#     'min_samples_split' : trial.suggest_int('min_samples_split', 2, 20),
#     'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 1, 20),
#     'criterion' : trial.suggest_categorical("criterion", ["gini", "entropy",]),
#     'class_weight' : trial.suggest_categorical("class_weight", ["balanced"]),
#     'random_state': RANDOM_STATE
#     }
       
#     model = RandomForestClassifier(**param)
#     model.fit(x_tr, y_tr)
#     pred_proba = model.predict_proba(x_val)[:, 1]  # 양성 클래스 확률
#     pred = (pred_proba >= THRESHOLD).astype(int)  # 스레드홀드에 따른 예측
    
#     score = f1_score(y_val, pred, average="binary")
    
#     return score

# # 데이터셋 분할
# x_train, x_val, y_train, y_val = train_test_split(
#     train_data.drop("target", axis=1),
#     train_data["target"],
#     test_size=0.2,
#     shuffle=True,
#     random_state=RANDOM_STATE,
# )

# # 하이퍼 파라미터 튜닝
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
# study.optimize(lambda trial: objectiveRF(trial, x_train, y_train, x_val, y_val), n_trials=50)

# print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))


Trial 22 finished with value: 0.23575129533678754  
parameters: {'n_estimators': 1555, 'max_depth': 53, 'min_samples_split': 4,  
'min_samples_leaf': 1, 'criterion': 'gini', 'class_weight': 'balanced'}  

In [10]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Encode the string labels as integers: 'Normal' -> 0, 'AbNormal' -> 1.
# The dtype guard makes this cell safe to re-run: mapping an already-encoded
# 0/1 column through the dict again would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(train_data['target']):
    train_data['target'] = train_data['target'].map({'Normal': 0, 'AbNormal': 1})

# Decision threshold applied to the positive-class probability
THRESHOLD = 0.3

def objectiveRF(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective: validation F1 of a random forest at THRESHOLD.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        x_tr: Training features.
        y_tr: Training labels.
        x_val: Validation features.
        y_val: Validation labels.

    Returns:
        The binary F1 score of the thresholded validation predictions.
    """
    # Hyperparameters are sampled in the same order as the search space is
    # declared: n_estimators, max_depth, min_samples_split, min_samples_leaf,
    # criterion, class_weight.
    n_estimators = trial.suggest_int('n_estimators', 1000, 3000)
    max_depth = trial.suggest_int('max_depth', 30, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy",])
    class_weight = trial.suggest_categorical("class_weight", ["balanced"])

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        criterion=criterion,
        class_weight=class_weight,
        random_state=RANDOM_STATE,
    )
    forest.fit(x_tr, y_tr)

    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = forest.predict_proba(x_val)[:, 1]
    # Label a sample positive when its probability reaches the global cutoff.
    predicted = (positive_proba >= THRESHOLD).astype(int)

    return f1_score(y_val, predicted, average="binary")

# Hold out 20% of the rows (shuffled, fixed seed) as the validation set.
x_train, x_val, y_train, y_val = train_test_split(
    train_data.drop(columns="target"),
    train_data["target"],
    random_state=RANDOM_STATE,
    shuffle=True,
    test_size=0.2,
)

# Hyperparameter search: a seeded TPE sampler maximizing the value returned
# by objectiveRF on the held-out split.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    direction='maximize',
)
study.optimize(
    lambda trial: objectiveRF(trial, x_train, y_train, x_val, y_val),
    n_trials=500,
)

# Report the best trial's score and parameters.
print('Best trial: score {}, \nparams {}'.format(study.best_value, study.best_params))


[I 2024-08-20 21:55:13,836] A new study created in memory with name: no-name-10a8e19c-2272-410c-963b-4b7cb6315753
[I 2024-08-20 21:56:25,133] Trial 0 finished with value: 0.18280467445742907 and parameters: {'n_estimators': 1232, 'max_depth': 76, 'min_samples_split': 9, 'min_samples_leaf': 7, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.18280467445742907.
[I 2024-08-20 21:59:43,215] Trial 1 finished with value: 0.21986353297952996 and parameters: {'n_estimators': 2910, 'max_depth': 75, 'min_samples_split': 11, 'min_samples_leaf': 1, 'criterion': 'gini', 'class_weight': 'balanced'}. Best is trial 1 with value: 0.21986353297952996.
[I 2024-08-20 22:01:15,689] Trial 2 finished with value: 0.1930379746835443 and parameters: {'n_estimators': 1270, 'max_depth': 30, 'min_samples_split': 9, 'min_samples_leaf': 5, 'criterion': 'entropy', 'class_weight': 'balanced'}. Best is trial 1 with value: 0.21986353297952996.
[I 2024-08-20 22:02:27,613] Trial 3 finishe

.