# 제품 이상여부 판별 프로젝트


### 데이터 읽어오기


In [1]:
import pandas as pd

# Seed shared by the train/validation split and every estimator.
RANDOM_STATE = 110
# Probability cut-off used when turning predicted probabilities into labels.
THRESHOLD = 0.3

# Load the train and test CSV files.
# NOTE(review): both paths are absolute and machine-specific.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "C:/Users/KimDongyoung/Desktop/git_LGaimers5/Lg_aimers5/Lg_aimers5-1/data/train_data_0816.csv",
        "C:/Users/KimDongyoung/Desktop/git_LGaimers5/Lg_aimers5/Lg_aimers5-1/data/test_data_0816.csv",
    )
)

In [2]:
# Common variable lists: columns appended to every per-process dataset.

# Training side.
com_variables_train = [
    'target',
    'Model.Suffix',
    'Workorder',
    'WorkMode Collect Result',
    'Dispenser_1',
    'Dispenser_2',
    'Receip_No_Collect_Result',
    'Production_Qty_Collect_Result',
    'Judge_Value_OK',
    'Workorder_0.9',
    'Workorder_0.6',
]

# Test side: same columns, with 'Set ID' inserted right after 'target'.
com_variables_test = [
    'target',
    'Set ID',
    'Model.Suffix',
    'Workorder',
    'WorkMode Collect Result',
    'Dispenser_1',
    'Dispenser_2',
    'Receip_No_Collect_Result',
    'Production_Qty_Collect_Result',
    'Judge_Value_OK',
    'Workorder_0.9',
    'Workorder_0.6',
]

In [3]:
# Filter columns by process name, then combine them with the common variables
def create_dataset(train_data, test_data, process_name, com_variables_train, com_variables_test):
    """Build the train/test frames for a single process.

    The process-specific columns are those whose name contains
    ``process_name``; they are looked up on ``train_data`` only and the same
    names are then selected from ``test_data``.

    Args:
        train_data: Full training DataFrame.
        test_data: Full test DataFrame; it must contain every selected column.
        process_name: Substring identifying the process columns (e.g. '_Dam').
        com_variables_train: Extra column names appended for the train frame.
        com_variables_test: Extra column names appended for the test frame.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of independent DataFrame
        copies, process columns first, then the common variables.

    Raises:
        KeyError: If a selected column is missing from either frame
            (raised by pandas during column selection).
    """
    # Columns whose name contains the process name
    process_columns = list(train_data.filter(like=process_name).columns)

    # dict.fromkeys keeps first-seen order while dropping repeated names, so a
    # common variable that also matches process_name is not selected twice.
    final_columns_train = list(dict.fromkeys(process_columns + list(com_variables_train)))
    final_columns_test = list(dict.fromkeys(process_columns + list(com_variables_test)))

    # Explicit copies: later in-place edits must not touch (or warn about)
    # the source frames.
    train_dataset = train_data[final_columns_train].copy()
    test_dataset = test_data[final_columns_test].copy()

    return train_dataset, test_dataset

# Common variables: com_variables_train / com_variables_test are the lists
# defined in the earlier cell.


def _datasets_for(process_suffix):
    """Return the (train, test) pair for the process named by *process_suffix*."""
    return create_dataset(
        train_data,
        test_data,
        process_suffix,
        com_variables_train,
        com_variables_test,
    )


# Create one train/test pair per process
train_data_dam, test_data_dam = _datasets_for('_Dam')
train_data_fill1, test_data_fill1 = _datasets_for('_Fill1')
train_data_fill2, test_data_fill2 = _datasets_for('_Fill2')
train_data_autoclave, test_data_autoclave = _datasets_for('_AutoClave')

---

In [4]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score

# Threshold setting: predicted positive-class probabilities at or above this
# value are labelled 1 in train_and_evaluate_model. This repeats the
# assignment made in the first cell with the same value.
THRESHOLD = 0.3


def train_and_evaluate_model(model_name, data):
    """Fit the named model on a hold-out split of *data* and print its metrics.

    The frame is split 80/20 (shuffled, seeded with ``RANDOM_STATE``). The
    estimator stored in ``models[model_name]`` is fitted in place on the
    training part, and the validation part is scored after thresholding the
    positive-class probability at ``THRESHOLD``.

    Args:
        model_name: Key into the module-level ``models`` dict.
        data: DataFrame with a ``target`` column holding 'Normal' /
            'AbNormal'; every other column is used as a feature.

    Returns:
        None. F1, confusion matrix, accuracy, precision and recall are
        printed. For an unknown ``model_name`` a message is printed and the
        function returns early.
    """
    if model_name not in models:
        print(f"{model_name}은(는) 지원되지 않는 모델입니다.")
        return

    # Split the frame supplied by the caller, so the `data` argument actually
    # decides which dataset is evaluated.
    features = data.drop("target", axis=1)
    # Encode the labels as integers: Normal -> 0, AbNormal -> 1
    labels = data["target"].map({'Normal': 0, 'AbNormal': 1})
    x_train, x_val, y_train, y_val = train_test_split(
        features,
        labels,
        test_size=0.2,
        shuffle=True,
        random_state=RANDOM_STATE,
    )

    # Select the model
    model = models[model_name]

    # Train the model
    model.fit(x_train, y_train)

    # Predict
    y_val_pred_proba = model.predict_proba(x_val)[:, 1]  # positive-class probability
    y_val_pred = (y_val_pred_proba >= THRESHOLD).astype(int)  # label by threshold

    # Compute the evaluation metrics
    f1 = f1_score(y_val, y_val_pred, average="binary")
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    conf_matrix = confusion_matrix(y_val, y_val_pred)

    # Print the results
    print(f'{model_name} 결과:')
    print(f'F1 Score: {f1}')
    print('---')
    print('Confusion Matrix:')
    print(conf_matrix)
    print('---')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print('\n')

In [5]:
# Model configuration and hyperparameters.
# Registry mapping a model name to a pre-configured estimator instance;
# train_and_evaluate_model looks models up here by name and fits the stored
# instance directly. Every estimator is seeded with RANDOM_STATE.
# NOTE(review): the hyperparameter values look like the output of an automated
# search — confirm their origin before hand-editing them.
models = {
    'ExtraTreesClassifier': ExtraTreesClassifier(
        n_estimators=1645,
        max_depth=32,
        min_samples_split=8,
        min_samples_leaf=1,
        criterion='entropy',
        bootstrap=False,
        random_state=RANDOM_STATE
    ),

    # The only entry here that sets class_weight.
    'RandomForestClassifier': RandomForestClassifier(
        n_estimators=1005,
        max_depth=82,
        min_samples_split=3,
        min_samples_leaf=1,
        criterion='gini',
        class_weight='balanced',
        random_state=RANDOM_STATE
    ),

    # Training log output is silenced via logging_level.
    'CatBoostClassifier': CatBoostClassifier(
        iterations=757,
        learning_rate=0.050198665725373286,
        depth=9,
        l2_leaf_reg=1.4245312044811413,
        random_strength=9.361327508234833,
        bagging_temperature=4.80588064825688,
        border_count=99,
        scale_pos_weight=1.1403833705864026,
        random_seed=RANDOM_STATE,
        eval_metric='F1',
        logging_level='Silent',
        boosting_type='Plain'
    ),

    # verbose=-1 keeps LightGBM quiet during fitting.
    'LGBMClassifier': LGBMClassifier(
        n_estimators=691,
        num_leaves=555,
        max_depth=74,
        learning_rate=0.0519,
        min_child_samples=59,
        boosting_type='dart',
        random_state=RANDOM_STATE,
        verbose=-1
    ),

    # NOTE(review): XGBoost documents `alpha` as an alias of `reg_alpha`; both
    # are passed below with different values — confirm which one is intended
    # and which one actually takes effect.
    'XGBClassifier': XGBClassifier(
        n_estimators=2213,
        learning_rate=0.11735028445102921,
        max_depth=5,
        alpha=0.008053553080773119,
        gamma=0.0006061064433044652,
        reg_alpha=0.5380410453451789,
        reg_lambda=0.8278020434040504,
        colsample_bytree=0.8672296781322193,
        subsample=0.3694461716611997,
        random_state=RANDOM_STATE
    ),

    'DecisionTreeClassifier': DecisionTreeClassifier(
        max_depth=19,
        min_samples_split=10,
        min_samples_leaf=3,
        max_features=None,
        splitter='best',
        criterion='gini',
        random_state=RANDOM_STATE
    )
}

# Usage example
# train_and_evaluate_model('LGBMClassifier', train_data_autoclave)
train_and_evaluate_model(
    model_name='DecisionTreeClassifier',
    data=train_data_autoclave,
)

DecisionTreeClassifier 결과:
F1 Score: 0.21443736730360935
---
Confusion Matrix:
[[7261  401]
 [ 339  101]]
---
Accuracy: 0.9086645272772155
Precision: 0.20119521912350596
Recall: 0.22954545454545455


