# 제품 이상여부 판별 프로젝트


In [1]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

### 데이터 읽어오기


In [3]:
import pandas as pd

# Seed reused for the train/validation split and the model so runs are repeatable.
RANDOM_STATE = 110

## -- Replace with the actual data file paths before running -- ##
# NOTE: later cells read `train_data` and `test_data`, so these loads must be
# enabled (with valid paths) before the dataset-creation cell is executed.
# train_data = pd.read_csv("../../../data/train_data_0816.csv")
# test_data = pd.read_csv("../../../data/test_data_0816.csv")

In [4]:
# Common (process-independent) columns appended to every process-specific dataset.
# Training side: no 'Set ID' column.
com_variables_train = [
    'target',
    'Model.Suffix',
    'Workorder',
    'WorkMode Collect Result',
    'Dispenser_1',
    'Dispenser_2',
    'Receip_No_Collect_Result',
    'Production_Qty_Collect_Result',
    'Judge_Value_OK',
    'Workorder_0.9',
    'Workorder_0.6',
]

# Test side: same columns plus 'Set ID' right after 'target'.
com_variables_test = [
    'target',
    'Set ID',
    'Model.Suffix',
    'Workorder',
    'WorkMode Collect Result',
    'Dispenser_1',
    'Dispenser_2',
    'Receip_No_Collect_Result',
    'Production_Qty_Collect_Result',
    'Judge_Value_OK',
    'Workorder_0.9',
    'Workorder_0.6',
]

In [5]:
# Select the columns of one process and combine them with the common variables.
def create_dataset(train_data, test_data, process_name, com_variables_train, com_variables_test):
    """Build the train/test frames for a single process.

    Columns whose name contains ``process_name`` are looked up on
    ``train_data`` and that same column set is used for both outputs, followed
    by the respective common-variable columns.

    Args:
        train_data: Source training DataFrame.
        test_data: Source test DataFrame.
        process_name: Substring identifying the process columns (e.g. '_Dam').
        com_variables_train: Common column names to append for the train frame.
        com_variables_test: Common column names to append for the test frame.

    Returns:
        Tuple ``(train_dataset, test_dataset)``. Both are independent copies,
        so assigning to their columns does not touch the source frames.

    Raises:
        KeyError: If a selected column is missing from ``train_data`` or
            ``test_data``.
    """
    # Process columns are taken from the training frame only.
    process_columns = list(train_data.filter(like=process_name).columns)

    # dict.fromkeys drops repeated labels while keeping first-seen order, so a
    # process_name that also matches a common variable (e.g. 'Workorder')
    # does not produce duplicate column labels in the result.
    train_columns = list(dict.fromkeys(process_columns + list(com_variables_train)))
    test_columns = list(dict.fromkeys(process_columns + list(com_variables_test)))

    # Explicit copies: callers assign to columns of the returned frames
    # (e.g. re-encoding 'target'), which should neither warn about chained
    # assignment nor depend on the source frame.
    train_dataset = train_data[train_columns].copy()
    test_dataset = test_data[test_columns].copy()

    return train_dataset, test_dataset

# Common variables: reuse the module-level com_variables_train /
# com_variables_test lists defined in the earlier cell.

# Build one (train, test) pair per process; `train_data` and `test_data`
# must already be loaded.
train_data_dam, test_data_dam = create_dataset(
    train_data=train_data,
    test_data=test_data,
    process_name='_Dam',
    com_variables_train=com_variables_train,
    com_variables_test=com_variables_test,
)
train_data_fill1, test_data_fill1 = create_dataset(
    train_data=train_data,
    test_data=test_data,
    process_name='_Fill1',
    com_variables_train=com_variables_train,
    com_variables_test=com_variables_test,
)
train_data_fill2, test_data_fill2 = create_dataset(
    train_data=train_data,
    test_data=test_data,
    process_name='_Fill2',
    com_variables_train=com_variables_train,
    com_variables_test=com_variables_test,
)
train_data_autoclave, test_data_autoclave = create_dataset(
    train_data=train_data,
    test_data=test_data,
    process_name='_AutoClave',
    com_variables_train=com_variables_train,
    com_variables_test=com_variables_test,
)

---

## Optuna

임계값(threshold)을 0.3으로 고정한 상태에서 튜닝을 진행한 결과

In [None]:
# NOTE: this Optuna tuning cell is kept for reference only; every statement
# below is commented out.
# # Convert the 'Normal' / 'AbNormal' labels to integers
# train_data_fill1['target'] = train_data_fill1['target'].map({'Normal': 0, 'AbNormal': 1})

# # Decision threshold applied to the predicted positive-class probability
# THRESHOLD = 0.3

# def objectiveDecisionTree(trial, x_tr, y_tr, x_val, y_val):
#     param = {
#         'max_depth': trial.suggest_int('max_depth', 2, 100)
#         , 'min_samples_split': trial.suggest_int('min_samples_split', 2, 50)
#         , 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20)
#         , 'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
#         , 'splitter': trial.suggest_categorical('splitter', ['best', 'random'])
#         , 'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy'])
#         , 'random_state': RANDOM_STATE
#     }
       
#     model = DecisionTreeClassifier(**param)
#     model.fit(x_tr, y_tr)
#     pred_proba = model.predict_proba(x_val)[:, 1]  # positive-class probability
#     pred = (pred_proba >= THRESHOLD).astype(int)  # class prediction after applying the threshold
    
#     score = f1_score(y_val, pred, average="binary")
    
#     return score

# # Split the dataset into train / validation parts
# x_train, x_val, y_train, y_val = train_test_split(
#     train_data_fill1.drop("target", axis=1),
#     train_data_fill1["target"],
#     test_size=0.2,
#     shuffle=True,
#     random_state=RANDOM_STATE,
# )

# # Hyperparameter tuning
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
# study.optimize(lambda trial: objectiveDecisionTree(trial, x_train, y_train, x_val, y_val), n_trials=3000)

# print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))


Trial 13 finished with value: 0.21323529411764705  
parameters: {'max_depth': 25, 'min_samples_split': 14, 'min_samples_leaf': 5, 'max_features':  
None, 'splitter': 'random', 'criterion': 'entropy'}. Best is trial 13 with value: 0.21323529411764705

In [13]:
from sklearn.tree import DecisionTreeClassifier

# Decision threshold on the predicted probability of the positive class
# (AbNormal, encoded as 1): predict 1 when the probability is >= THRESHOLD.
THRESHOLD = 0.3

# Best hyperparameters reported by the Optuna study (trial 13).
best_params = {
    'max_depth': 25,
    'min_samples_split': 14,
    'min_samples_leaf': 5,
    'max_features': None,
    'splitter': 'random',
    'criterion': 'entropy',
    'random_state': RANDOM_STATE
}

# Create the model.
model = DecisionTreeClassifier(**best_params)

# Encode the target here instead of relying on the (commented-out) tuning cell
# having been executed earlier in the session. If the column is already
# numeric it is used unchanged, so an already-encoded target behaves as before.
fill1_features = train_data_fill1.drop("target", axis=1)
fill1_target = train_data_fill1["target"]
if not pd.api.types.is_numeric_dtype(fill1_target):
    fill1_target = fill1_target.map({'Normal': 0, 'AbNormal': 1})
    if fill1_target.isna().any():
        raise ValueError("target contains labels other than 'Normal' / 'AbNormal'")

# Same split settings as the tuning run, so the validation set is identical.
x_train, x_val, y_train, y_val = train_test_split(
    fill1_features,
    fill1_target,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Fit the model.
model.fit(x_train, y_train)

# Predict on the validation data. The probability column of class 1 is looked
# up through model.classes_ rather than assumed to sit at position 1.
positive_class_index = list(model.classes_).index(1)
y_val_pred_proba = model.predict_proba(x_val)[:, positive_class_index]  # positive-class probability
y_val_pred = (y_val_pred_proba >= THRESHOLD).astype(int)  # class prediction after applying the threshold

# Evaluation metrics.
f1 = f1_score(y_val, y_val_pred, average="binary")
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)

# Confusion matrix (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_val, y_val_pred)

# Report the results.
print(f'F1 Score: {f1}')
print('---')
print('Confusion Matrix:')
print(conf_matrix)  # print the confusion matrix
print('---')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

F1 Score: 0.21323529411764705
---
Confusion Matrix:
[[7373  289]
 [ 353   87]]
---
Accuracy: 0.9207603060972599
Precision: 0.23138297872340424
Recall: 0.19772727272727272


동일한 검증 분할(random_state=110)과 임계값 0.3을 사용했기 때문에, Optuna의 최고 F1-score와 해당 하이퍼파라미터로 다시 학습한 모델의 검증 F1-score가 같은 수치(0.2132)로 재현된다.

.