# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [6]:
import os
from pprint import pprint

import optuna
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [7]:


def get_clf_eval(y_test, y_pred_proba, threshold=0.5):
    """Print binary-classification metrics for thresholded probabilities.

    Args:
        y_test: True binary labels.
        y_pred_proba: Positive-class scores; any array-like (list, NumPy
            array, pandas Series) is accepted.
        threshold: Scores greater than or equal to this value are labelled
            positive (1); everything else is labelled 0.

    Returns:
        None. The confusion matrix, accuracy, precision, recall and F1 score
        are printed.
    """
    # Coerce to an ndarray first so that plain Python lists can be compared
    # element-wise against the threshold instead of raising TypeError.
    y_pred = (np.asarray(y_pred_proba) >= threshold).astype(int)

    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Confusion Matrix:\n", confusion)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

### 데이터 읽어오기


In [8]:
# Seed shared by every split / model in this notebook.
RANDOM_STATE = 110

# Load the training and test CSV files.
# NOTE(review): both paths are absolute and machine-specific.
train_data = pd.read_csv(
    filepath_or_buffer="C:/Users/KimDongyoung/Desktop/git_LGaimers5/Lg_aimers5/data/train_data_0817.csv"
)
test_data = pd.read_csv(
    filepath_or_buffer="C:/Users/KimDongyoung/Desktop/git_LGaimers5/Lg_aimers5/data/test_data_0817.csv"
)

---

## 2. 데이터 분할

In [9]:
# Hold out 20% of train_data for validation; stratifying on "target" keeps
# the label proportions the same in both parts.
df_train, df_val = train_test_split(
    train_data,
    stratify=train_data["target"],
    test_size=0.2,
    random_state=RANDOM_STATE,
    shuffle=True,
)

def print_stats(df: pd.DataFrame):
    """Print how many rows of ``df`` are Normal / AbNormal and their ratio.

    Args:
        df: Frame with a ``"target"`` column whose values are compared
            against the strings ``"Normal"`` and ``"AbNormal"``.

    Returns:
        None. One summary line is printed. The ratio is AbNormal / Normal;
        when there are no Normal rows it is reported as ``inf`` (some
        AbNormal rows) or ``nan`` (no rows of either class) instead of
        raising ZeroDivisionError.
    """
    target = df["target"]
    # int(...) keeps the counts as plain Python ints so the ratio below is
    # formatted as a built-in float.
    num_normal = int((target == "Normal").sum())
    num_abnormal = int((target == "AbNormal").sum())

    if num_normal:
        ratio = num_abnormal / num_normal
    else:
        # No Normal rows: the ratio is undefined, so report it without crashing.
        ratio = float("inf") if num_abnormal else float("nan")

    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {ratio}")


# Header line followed by one summary line per split (train, then validation).
# NOTE(review): the header reads "Abnormal, Normal" while print_stats lists
# Normal first and also prints a ratio — confirm the intended layout.
print("  ", "Abnormal", "Normal", sep="\t")
print_stats(df_train)
print_stats(df_val)

  	Abnormal	Normal
  Total: Normal: 30524, AbNormal: 1880 ratio: 0.06159087930808544
  Total: Normal: 7632, AbNormal: 470 ratio: 0.061582809224318656


## 3. 모델 학습

### 모델 정의

Optuna를 이용한 XGBoost 하이퍼파라미터 탐색 (아래 셀은 주석 처리되어 있으며, 탐색 결과만 남겨 둠)

In [10]:
# import optuna
# from xgboost import XGBClassifier

# # 'Normal'과 'AbNormal'을 숫자로 변환
# train_data['target'] = train_data['target'].map({'Normal': 0, 'AbNormal': 1})

# # 스레드홀드 설정
# THRESHOLD = 0.3

# def objectiveXGB(trial, x_tr, y_tr, x_val, y_val):
#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 1200, 3200)
#         , 'learning_rate': trial.suggest_float('learning_rate', 0.002, 0.2)
#         , 'max_depth': trial.suggest_int('max_depth', 3, 10)

#         , 'alpha': trial.suggest_float('alpha', 0.00001, 0.01, log=True)
#         , 'gamma': trial.suggest_float('gamma', 0.00001, 0.01, log=True)

#         , 'reg_alpha' : trial.suggest_float('reg_alpha', 0.01, 1)
#         , 'reg_lambda' : trial.suggest_float('reg_lambda', 0.01, 1)
        
#         , 'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.01, 1)
#         , 'subsample' : trial.suggest_float('subsample', 0.03, 1)
#         , 'objective': 'binary:logistic'  # 이진 분류
#         , 'tree_method' : "exact"        
#         , 'random_state': RANDOM_STATE
#     }
       
#     model = XGBClassifier(**param)
#     model.fit(x_tr, y_tr)
#     pred_proba = model.predict_proba(x_val)[:, 1]  # 양성 클래스 확률
#     pred = (pred_proba >= THRESHOLD).astype(int)  # 스레드홀드에 따른 예측
    
#     score = f1_score(y_val, pred, average="binary")
    
#     return score

# # 데이터셋 분할
# x_train, x_val, y_train, y_val = train_test_split(
#     train_data.drop("target", axis=1),
#     train_data["target"],
#     test_size=0.2,
#     shuffle=True,
#     random_state=RANDOM_STATE,
# )

# # 하이퍼 파라미터 튜닝
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
# study.optimize(lambda trial: objectiveXGB(trial, x_train, y_train, x_val, y_val), n_trials=500)

# print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))


Best trial: score 0.26099290780141843

params {'n_estimators': 2287, 'learning_rate': 0.046904208411195795, 'max_depth': 8, 'alpha': 1.9343531171735368e-05, 'gamma': 0.002118564280859176, 'reg_alpha': 0.6827713868263061, 'reg_lambda': 0.05035980721174918, 'colsample_bytree': 0.8959193125044248, 'subsample': 0.43471952905681815}

In [11]:
# Probability cut-off: positive-class probabilities at or above this value
# are labelled 1 when predictions are evaluated.
THRESHOLD = 0.3

# Registry of candidate classifiers, keyed by the short name accepted by
# train_and_evaluate_model. Every estimator is built with library defaults.
models = dict(
    et=ExtraTreesClassifier(),
    rf=RandomForestClassifier(),
    cat=CatBoostClassifier(),
    lgbm=LGBMClassifier(),
    xgb=XGBClassifier(),
    dt=DecisionTreeClassifier(),
)

def train_and_evaluate_model(model_name, data, **params):
    """Train one registered classifier on a hold-out split and print its metrics.

    Args:
        model_name: Key into the module-level ``models`` registry.
        data: DataFrame with feature columns plus a ``"target"`` column whose
            values are ``'Normal'`` / ``'AbNormal'`` (mapped to 0 / 1 here).
        **params: Hyperparameters applied to the estimator via ``set_params``.

    Returns:
        None. The F1 score, confusion matrix, accuracy, precision and recall
        on the validation part are printed. An unknown ``model_name`` prints
        a message and returns early.
    """
    if model_name not in models:
        print(f"{model_name}은(는) 지원되지 않는 모델입니다.")
        return

    # Imported locally so this function does not rely on the import cell.
    from sklearn.base import clone

    # Best-effort lookup of the global variable name bound to `data`, used
    # only for the report header. Fall back to a generic label instead of
    # raising IndexError when `data` is not bound to any global name.
    data_name = next(
        (name for name, value in globals().items() if value is data),
        "data",
    )

    # 80/20 hold-out split with labels encoded as Normal -> 0, AbNormal -> 1.
    # NOTE(review): unlike the df_train/df_val split in this file, no
    # `stratify=` is passed here; kept as is so results stay comparable.
    x_train, x_val, y_train, y_val = train_test_split(
        data.drop("target", axis=1),
        data["target"].map({'Normal': 0, 'AbNormal': 1}),
        test_size=0.2,
        shuffle=True,
        random_state=RANDOM_STATE,
    )

    # Work on a fresh, unfitted copy so the hyperparameters of this call do
    # not stick to the shared instance in `models` and leak into later calls.
    model = clone(models[model_name])
    model.set_params(**params)

    model.fit(x_train, y_train)

    # Positive-class probabilities, then labels at the global THRESHOLD.
    y_val_pred_proba = model.predict_proba(x_val)[:, 1]
    y_val_pred = (y_val_pred_proba >= THRESHOLD).astype(int)

    # Evaluation metrics on the validation part.
    f1 = f1_score(y_val, y_val_pred, average="binary")
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    conf_matrix = confusion_matrix(y_val, y_val_pred)

    # Report.
    print(f'{model_name} 모델이 {data_name} 데이터로 학습한 결과:')
    print(f'F1 Score: {f1}')
    print('---')
    print('Confusion Matrix:')
    print(conf_matrix)
    print('---')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print('\n')

In [12]:
# Re-train XGBoost with the best-trial hyperparameters reported by the Optuna
# search recorded in this notebook, and print its validation metrics.
# NOTE(review): both `alpha` and `reg_alpha` are supplied; XGBoost documents
# `alpha` as an alias of `reg_alpha` — confirm which value takes effect.
train_and_evaluate_model(
    'xgb',
    train_data,
    **{
        'n_estimators': 2287,
        'learning_rate': 0.046904208411195795,
        'max_depth': 8,
        'alpha': 1.9343531171735368e-05,
        'gamma': 0.002118564280859176,
        'reg_alpha': 0.6827713868263061,
        'reg_lambda': 0.05035980721174918,
        'colsample_bytree': 0.8959193125044248,
        'subsample': 0.43471952905681815,
        'objective': 'binary:logistic',  # binary classification
        'tree_method': "exact",
        'random_state': RANDOM_STATE,
    },
)

xgb 모델이 train_data 데이터로 학습한 결과:
F1 Score: 0.26099290780141843
---
Confusion Matrix:
[[7489  173]
 [ 348   92]]
---
Accuracy: 0.93569489015058
Precision: 0.3471698113207547
Recall: 0.20909090909090908




In [13]:
# import optuna
# from xgboost import XGBClassifier

# # 'Normal'과 'AbNormal'을 숫자로 변환
# train_data['target'] = train_data['target'].map({'Normal': 0, 'AbNormal': 1})

# # 스레드홀드 설정
# THRESHOLD = 0.3

# def objectiveXGB(trial, x_tr, y_tr, x_val, y_val):
#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 1200, 3800)
#         , 'learning_rate': trial.suggest_float('learning_rate', 0.002, 0.2)
#         , 'max_depth': trial.suggest_int('max_depth', 5, 16)

#         , 'alpha': trial.suggest_float('alpha', 0.00001, 0.01, log=True)
#         , 'gamma': trial.suggest_float('gamma', 0.00001, 0.01, log=True)

#         , 'reg_alpha' : trial.suggest_float('reg_alpha', 0.01, 1)
#         , 'reg_lambda' : trial.suggest_float('reg_lambda', 0.01, 1)
        
#         , 'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.01, 1)
#         , 'subsample' : trial.suggest_float('subsample', 0.03, 1)
#         , 'objective': 'binary:logistic'  # 이진 분류
#         , 'tree_method' : "exact"        
#         , 'random_state': RANDOM_STATE
#     }
       
#     model = XGBClassifier(**param)
#     model.fit(x_tr, y_tr)
#     pred_proba = model.predict_proba(x_val)[:, 1]  # 양성 클래스 확률
#     pred = (pred_proba >= THRESHOLD).astype(int)  # 스레드홀드에 따른 예측
    
#     score = f1_score(y_val, pred, average="binary")
    
#     return score

# # 데이터셋 분할
# x_train, x_val, y_train, y_val = train_test_split(
#     train_data.drop("target", axis=1),
#     train_data["target"],
#     test_size=0.2,
#     shuffle=True,
#     random_state=RANDOM_STATE,
# )

# # 하이퍼 파라미터 튜닝
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
# study.optimize(lambda trial: objectiveXGB(trial, x_train, y_train, x_val, y_val), n_trials=150)

# print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))


Best trial: score 0.2568306010928962,  
params {'n_estimators': 3373, 'learning_rate': 0.05751842095099548, 'max_depth': 10,  
'alpha': 6.72190235825414e-05, 'gamma': 0.0004303579118014147, 'reg_alpha': 0.7600677164324523,  
'reg_lambda': 0.03096872260029274, 'colsample_bytree': 0.7935724641416979, 'subsample': 0.35716001338953424}

.