# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred_proba, threshold=0.5):
    # 확률을 기준으로 예측 레이블 생성
    y_pred = (y_pred_proba >= threshold).astype(int)  # 0.5 이상의 확률을 양성으로 간주

    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Confusion Matrix:\n", confusion)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

### 데이터 읽어오기


In [6]:
RANDOM_STATE = 110

# csv 불러오기
train_data = pd.read_csv('C:/Users/KimDongyoung/Desktop/git_LGaimers5/Lg_aimers5/data/train_data_0817.csv')
test_data  = pd.read_csv('C:/Users/KimDongyoung/Desktop/git_LGaimers5/Lg_aimers5/data/test_data_0817.csv')

---

## 데이터 분할

In [7]:
df_train, df_val = train_test_split(
    train_data,
    test_size=0.2,
    stratify=train_data["target"],
    shuffle=True,
    random_state=RANDOM_STATE,
)

def print_stats(df: pd.DataFrame):
    num_normal = len(df[df["target"] == "Normal"])
    num_abnormal = len(df[df["target"] == "AbNormal"])

    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {num_abnormal/num_normal}")


# Print statistics
print(f"  \tAbnormal\tNormal")
print_stats(df_train)
print_stats(df_val)

  	Abnormal	Normal
  Total: Normal: 30524, AbNormal: 1880 ratio: 0.06159087930808544
  Total: Normal: 7632, AbNormal: 470 ratio: 0.061582809224318656


## 3. 모델 학습

### 모델 정의

optuna

In [8]:
import optuna
from sklearn.ensemble import ExtraTreesClassifier 

# 'Normal'과 'AbNormal'을 숫자로 변환
train_data['target'] = train_data['target'].map({'Normal': 0, 'AbNormal': 1})

# 스레드홀드 설정
THRESHOLD = 0.3

def objectiveExtraTreesClassifier(trial, x_tr, y_tr, x_val, y_val):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 2500, 3500)
        , 'max_depth': trial.suggest_int('max_depth', 50, 70)
        , 'min_samples_split': trial.suggest_int('min_samples_split', 2, 10)
        , 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 7)
        , 'criterion': trial.suggest_categorical('criterion', ['entropy'])
        , 'bootstrap': trial.suggest_categorical('bootstrap', [False])
        , 'random_state': RANDOM_STATE
    }
    
    model = ExtraTreesClassifier(**param)
    model.fit(x_tr, y_tr)
    pred = model.predict(x_val)
    score = f1_score(y_val, pred, average="binary")
    
    return score

# 데이터셋 분할
x_train, x_val, y_train, y_val = train_test_split(
    train_data.drop("target", axis=1),
    train_data["target"],
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# 하이퍼 파라미터 튜닝
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
study.optimize(lambda trial: objectiveExtraTreesClassifier(trial, x_train, y_train, x_val, y_val), n_trials=500)

print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))


[I 2024-08-22 22:35:03,484] A new study created in memory with name: no-name-21f93f6c-ce62-489a-803f-af46367e0c08
[I 2024-08-22 22:36:10,694] Trial 0 finished with value: 0.16216216216216214 and parameters: {'n_estimators': 2616, 'max_depth': 63, 'min_samples_split': 5, 'min_samples_leaf': 5, 'criterion': 'entropy', 'bootstrap': False}. Best is trial 0 with value: 0.16216216216216214.
[I 2024-08-22 22:37:32,076] Trial 1 finished with value: 0.16216216216216214 and parameters: {'n_estimators': 3180, 'max_depth': 64, 'min_samples_split': 10, 'min_samples_leaf': 5, 'criterion': 'entropy', 'bootstrap': False}. Best is trial 0 with value: 0.16216216216216214.
[I 2024-08-22 22:38:56,397] Trial 2 finished with value: 0.1810699588477366 and parameters: {'n_estimators': 3013, 'max_depth': 51, 'min_samples_split': 5, 'min_samples_leaf': 3, 'criterion': 'entropy', 'bootstrap': False}. Best is trial 2 with value: 0.1810699588477366.
[I 2024-08-22 22:40:08,276] Trial 3 finished with value: 0.173553

KeyboardInterrupt: 

Best trial: score 0.2034548944337812,  
params {'n_estimators': 2884, 'max_depth': 56, 'min_samples_split': 3, 'min_samples_leaf': 1,   
'criterion': 'entropy', 'bootstrap': False}

In [None]:
# import optuna
# from sklearn.ensemble import ExtraTreesClassifier 

# # 'Normal'과 'AbNormal'을 숫자로 변환
# train_data['target'] = train_data['target'].map({'Normal': 0, 'AbNormal': 1})

# # 스레드홀드 설정
# THRESHOLD = 0.3

# def objectiveExtraTreesClassifier(trial, x_tr, y_tr, x_val, y_val):
#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 1000, 3500)
#         , 'max_depth': trial.suggest_int('max_depth', 20, 80)
#         , 'min_samples_split': trial.suggest_int('min_samples_split', 2, 10)
#         , 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4)
#         , 'criterion': trial.suggest_categorical('criterion', ['entropy'])
#         , 'bootstrap': trial.suggest_categorical('bootstrap', [False])
#         , 'random_state': RANDOM_STATE
#     }
    
#     model = ExtraTreesClassifier(**param)
#     model.fit(x_tr, y_tr)
#     pred = model.predict(x_val)
#     score = f1_score(y_val, pred, average="binary")
    
#     return score

# # 데이터셋 분할
# x_train, x_val, y_train, y_val = train_test_split(
#     train_data.drop("target", axis=1),
#     train_data["target"],
#     test_size=0.2,
#     shuffle=True,
#     random_state=RANDOM_STATE,
# )

# # 하이퍼 파라미터 튜닝
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
# study.optimize(lambda trial: objectiveExtraTreesClassifier(trial, x_train, y_train, x_val, y_val), n_trials=100)

# print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))


Trial 51 finished with value: 0.20384615384615384 and parameters: {'n_estimators': 2204, 'max_depth': 42,  
'min_samples_split': 3, 'min_samples_leaf': 1, 'criterion': 'entropy', 'bootstrap': False}.

.