# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os
from pprint import pprint

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

### 데이터 읽어오기


In [19]:
import pandas as pd

RANDOM_STATE = 110

train_data = pd.read_csv("C:/Users/KimDongyoung/Desktop/git_LGaimers5/Lg_aimers5/Lg_aimers5-1/data/train_data_0816.csv")
test_data = pd.read_csv("C:/Users/KimDongyoung/Desktop/git_LGaimers5/Lg_aimers5/Lg_aimers5-1/data/test_data_0816.csv")

In [20]:
# 공통 변수 리스트
com_variables_train = [
    'target', 'Model.Suffix', 'Workorder', 'WorkMode Collect Result'
    , 'Dispenser_1', 'Dispenser_2', 'Receip_No_Collect_Result'
    , 'Production_Qty_Collect_Result', 'Judge_Value_OK'
    , 'Workorder_0.9', 'Workorder_0.6'
]

com_variables_test = [
    'target', 'Set ID', 'Model.Suffix', 'Workorder'
    , 'WorkMode Collect Result', 'Dispenser_1'
    , 'Dispenser_2', 'Receip_No_Collect_Result'
    , 'Production_Qty_Collect_Result', 'Judge_Value_OK'
    , 'Workorder_0.9', 'Workorder_0.6'
]

In [21]:
# 공정 이름 필터링 후 공통 변수와 결합
def create_dataset(train_data, test_data, process_name, com_variables_train, com_variables_test):
    # 열 이름 필터링
    Process_Desc_col = train_data.filter(like=process_name).columns
    
    # train 데이터셋 생성
    final_columns_train = list(Process_Desc_col) + com_variables_train
    train_dataset = train_data[final_columns_train]
    
    # test 데이터셋 생성
    final_columns_test = list(Process_Desc_col) + com_variables_test
    test_dataset = test_data[final_columns_test]
    
    return train_dataset, test_dataset

# 공통 변수 정의
## com_variables_train = [...]  -> 이전 코드에서 정의한 변수 사용
## com_variables_test = [...]   -> 이전 코드에서 정의한 변수 사용

# 데이터셋 생성
train_data_dam, test_data_dam = create_dataset(train_data, test_data, '_Dam', com_variables_train, com_variables_test)
train_data_fill1, test_data_fill1 = create_dataset(train_data, test_data, '_Fill1', com_variables_train, com_variables_test)
train_data_fill2, test_data_fill2 = create_dataset(train_data, test_data, '_Fill2', com_variables_train, com_variables_test)
train_data_autoclave, test_data_autoclave = create_dataset(train_data, test_data, '_AutoClave', com_variables_train, com_variables_test)

---

## 3. 모델 학습

### 모델 정의

In [22]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score

# 스레드홀드 설정
THRESHOLD = 0.3


def train_and_evaluate_model(model_name, data):
    if model_name not in models:
        print(f"{model_name}은(는) 지원되지 않는 모델입니다.")
        return
    
    # 데이터셋 분할
    x_train, x_val, y_train, y_val = train_test_split(
        data.drop("target", axis=1),
        data["target"].map({'Normal': 0, 'AbNormal': 1}),  # y_val도 숫자로 변환
        test_size=0.2,
        shuffle=True,
        random_state=RANDOM_STATE,
    )

    # 모델 선택
    model = models[model_name]

    # 모델 학습
    model.fit(x_train, y_train)

    # 예측
    y_val_pred_proba = model.predict_proba(x_val)[:, 1]  # 양성 클래스 확률
    y_val_pred = (y_val_pred_proba >= THRESHOLD).astype(int)  # 스레드홀드에 따른 예측

    # 평가지표 계산
    f1 = f1_score(y_val, y_val_pred, average="binary")
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    conf_matrix = confusion_matrix(y_val, y_val_pred)

    # 결과 출력
    print(f'{model_name} 결과:')
    print(f'F1 Score: {f1}')
    print('---')
    print('Confusion Matrix:')
    print(conf_matrix)
    print('---')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print('\n')

아래 모델 목록에서 공정별로 해당하는거로 가져가서 학습

In [10]:
# 모델 설정 및 하이퍼파라미터
#models = {
#     'ExtraTreesClassifier': ExtraTreesClassifier(
#         n_estimators=1645,
#         max_depth=32,
#         min_samples_split=8,
#         min_samples_leaf=1,
#         criterion='entropy',
#         bootstrap=False,
#         random_state=RANDOM_STATE
#     ),
    
#     'RandomForestClassifier': RandomForestClassifier(
#         n_estimators = 1959, 
#         max_depth = 48, 
#         min_samples_split = 5, 
#         min_samples_leaf= 1, 
#         criterion = 'entropy', 
#         class_weight = 'balanced',
#         random_state=RANDOM_STATE
 #    )
    
#     'CatBoostClassifier': CatBoostClassifier(
#         iterations=757,
#         learning_rate=0.050198665725373286,
#         depth=9,
#         l2_leaf_reg=1.4245312044811413,
#         random_strength=9.361327508234833,
#         bagging_temperature=4.80588064825688,
#         border_count=99,
#         scale_pos_weight=1.1403833705864026,
#         random_seed=RANDOM_STATE,
#         eval_metric='F1',
#         logging_level='Silent',
#         boosting_type='Plain'
#     ),
    
#     'LGBMClassifier': LGBMClassifier(
#         n_estimators=979,
#         num_leaves=1565,
#         max_depth=34,
#         learning_rate=0.04888906225539191,
#         min_child_samples=36,
#         boosting_type='dart',
#         random_state=RANDOM_STATE,
#         verbose=-1
#     ),
    
#     'XGBClassifier': XGBClassifier(
#         n_estimators=2213,
#         learning_rate=0.11735028445102921,
#         max_depth=5,
#         alpha=0.008053553080773119,
#         gamma=0.0006061064433044652,
#         reg_alpha=0.5380410453451789,
#         reg_lambda=0.8278020434040504,
#         colsample_bytree=0.8672296781322193,
#         subsample=0.3694461716611997,
#         random_state=RANDOM_STATE
#     ),
    
#     'DecisionTreeClassifier': DecisionTreeClassifier(
#         max_depth=122,
#         min_samples_split=30,
#         min_samples_leaf=21,
#         max_features=None,
#         splitter='best',
#         criterion='gini',
#         random_state=RANDOM_STATE
#     )
#}

# # 사용 예시
#train_and_evaluate_model('RandomForestClassifier', data)
#train_and_evaluate_model('XGBClassifier', train_data_fill1)

RandomForestClassifier 결과:
F1 Score: 0.1927525057825752
---
Confusion Matrix:
[[6930  732]
 [ 315  125]]
---
Accuracy: 0.870772648728709
Precision: 0.14585764294049008
Recall: 0.2840909090909091




Dam 모델

In [40]:
# 모델 설정 및 하이퍼파라미터
models = {
    'LGBMClassifier': LGBMClassifier(
        n_estimators=2748,
        num_leaves=657,
        max_depth=256,
        learning_rate=0.001043279508273329,
        min_child_samples=58,
        boosting_type='dart',
        random_state=RANDOM_STATE,
        verbose=-1
    )
}

# 학습
train_and_evaluate_model('LGBMClassifier', train_data_dam)

LGBMClassifier 결과:
F1 Score: 0.25119617224880386
---
Confusion Matrix:
[[7371  291]
 [ 335  105]]
---
Accuracy: 0.9227351271291039
Precision: 0.26515151515151514
Recall: 0.23863636363636365




In [23]:
# 모델 설정 및 하이퍼파라미터
models = {
    'CatBoostClassifier': CatBoostClassifier(
        iterations= 755, 
        depth= 7, 
        learning_rate= 0.19866742206648758, 
        l2_leaf_reg= 0.24483628170369134, 
        border_count= 95,
        random_state=RANDOM_STATE,
        # verbose=-1
    )
}

# 학습
train_and_evaluate_model('CatBoostClassifier', train_data_dam)

0:	learn: 0.4821751	total: 13.1ms	remaining: 9.91s
1:	learn: 0.3549734	total: 25.9ms	remaining: 9.75s
2:	learn: 0.2939218	total: 38.6ms	remaining: 9.68s
3:	learn: 0.2548974	total: 51.4ms	remaining: 9.65s
4:	learn: 0.2309913	total: 63.8ms	remaining: 9.57s
5:	learn: 0.2190122	total: 77.2ms	remaining: 9.64s
6:	learn: 0.2109658	total: 90.6ms	remaining: 9.68s
7:	learn: 0.2053156	total: 104ms	remaining: 9.69s
8:	learn: 0.2012307	total: 114ms	remaining: 9.44s
9:	learn: 0.1997075	total: 123ms	remaining: 9.15s
10:	learn: 0.1982408	total: 133ms	remaining: 8.97s
11:	learn: 0.1964299	total: 143ms	remaining: 8.86s
12:	learn: 0.1957812	total: 156ms	remaining: 8.91s
13:	learn: 0.1950974	total: 168ms	remaining: 8.9s
14:	learn: 0.1942744	total: 181ms	remaining: 8.91s
15:	learn: 0.1936921	total: 192ms	remaining: 8.87s
16:	learn: 0.1927233	total: 205ms	remaining: 8.9s
17:	learn: 0.1924132	total: 219ms	remaining: 8.95s
18:	learn: 0.1921630	total: 232ms	remaining: 8.97s
19:	learn: 0.1919067	total: 246ms	re

AutoClave 모델

In [41]:
# 모델 설정 및 하이퍼파라미터
models = {
    'LGBMClassifier': LGBMClassifier(
        n_estimators=1205,
        num_leaves=1880,
        max_depth=53,
        learning_rate=0.0877525670484991,
        min_child_samples=288,
        boosting_type='dart',
        random_state=RANDOM_STATE,
        verbose=-1
    )
}

# 학습
train_and_evaluate_model('LGBMClassifier', train_data_autoclave)

LGBMClassifier 결과:
F1 Score: 0.25271739130434784
---
Confusion Matrix:
[[7459  203]
 [ 347   93]]
---
Accuracy: 0.9321155270303628
Precision: 0.3141891891891892
Recall: 0.21136363636363636




Fill1 모델

In [42]:
# 모델 설정 및 하이퍼파라미터
models = {
    'LGBMClassifier': LGBMClassifier(
        n_estimators=979,
        num_leaves=1565,
        max_depth=34,
        learning_rate=0.04888906225539191,
        min_child_samples=36,
        boosting_type='dart',
        random_state=RANDOM_STATE,
        verbose=-1
    )
}

# 학습
train_and_evaluate_model('LGBMClassifier', train_data_fill1)

LGBMClassifier 결과:
F1 Score: 0.25299600532623173
---
Confusion Matrix:
[[7446  216]
 [ 345   95]]
---
Accuracy: 0.9307578375709701
Precision: 0.3054662379421222
Recall: 0.2159090909090909




Fill2 모델

In [43]:
# 모델 설정 및 하이퍼파라미터
models = {
    'LGBMClassifier': LGBMClassifier(
        n_estimators=1308,
        num_leaves=1813,
        max_depth=105,
        learning_rate=0.0026995515108598628,
        min_child_samples=37,
        boosting_type='dart',
        random_state=RANDOM_STATE,
        verbose=-1
    )
}

# 학습
train_and_evaluate_model('LGBMClassifier', train_data_fill2)

LGBMClassifier 결과:
F1 Score: 0.2562189054726368
---
Confusion Matrix:
[[7401  261]
 [ 337  103]]
---
Accuracy: 0.9261910639348309
Precision: 0.28296703296703296
Recall: 0.2340909090909091




---

(작성중)

---

## 모델 학습

In [110]:
from sklearn.model_selection import train_test_split

def split_data(data, target, test_size=0.2):
    return train_test_split(data, test_size=test_size, stratify=target, random_state=RANDOM_STATE)

df_train_dam, df_val_dam = split_data(train_data_dam, train_data["target"])
df_train_fill1, df_val_fill1 = split_data(train_data_fill1, train_data["target"])
df_train_fill2, df_val_fill2 = split_data(train_data_fill2, train_data["target"])
df_train_autoclave, df_val_autoclave = split_data(train_data_autoclave, train_data["target"])

공정별로 모델 학습 진행

분할한 데이터 -> 원래의 데이터(train data)로 학습한 새 모델 생성

In [113]:
# SettingWithCopyWarning 경고 무시
pd.options.mode.chained_assignment = None  # default='warn'

In [114]:
# 모델 학습
preprocess_and_train(train_data_dam, model_Dam)
preprocess_and_train(train_data_fill1, model_Fill1)
preprocess_and_train(train_data_fill2, model_Fill2)
preprocess_and_train(train_data_autoclave, model_AutoClave)

In [115]:
# 예측에 필요한 데이터 분리
x_test_dam = test_data_dam.drop(["target", "Set ID"], axis=1)
x_test_fill1 = test_data_fill1.drop(["target", "Set ID"], axis=1)
x_test_fill2 = test_data_fill2.drop(["target", "Set ID"], axis=1)
x_test_autoclave = test_data_autoclave.drop(["target", "Set ID"], axis=1)

# 각 공정의 예측 확률 계산
probs = [
    model_Dam.predict_proba(x_test_dam)[:, 1]
    , model_Fill1.predict_proba(x_test_fill1)[:, 1]
    , model_Fill2.predict_proba(x_test_fill2)[:, 1]
    , model_AutoClave.predict_proba(x_test_autoclave)[:, 1]
]

In [125]:
# 소프트 보팅: 각 모델의 확률 평균 계산
soft_voting_probs = np.mean(probs, axis=0)

# 최종 예측: 평균 확률에 대해 스레드 홀드 0.3 적용
final_predictions = (soft_voting_probs >= 0.23).astype(int)

# 최종 예측 결과 출력
print(sum(final_predictions))

1058


In [122]:
# 소프트 보팅: 각 모델의 확률 평균 계산
soft_voting_probs = np.mean(probs, axis=0)

# 최종 예측: 평균 확률에 대해 스레드 홀드 0.3 적용
final_predictions = (soft_voting_probs >= 0.3).astype(int)

# 최종 예측 결과 출력
print(sum(final_predictions))

572


In [135]:
# 4개의 예측 중 확률값이 0.3 이상이면 1(AbNormal)로 예측
threshold = 0.3
final_predictions = (np.any([prob > threshold for prob in probs], axis=0)).astype(int)

print(sum(final_predictions))

1783


## 4. 제출하기


### 제출 파일 작성


In [129]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = final_predictions

# df_sub['target'] 값을 문자열 레이블로 변환
df_sub['target'] = df_sub['target'].apply(lambda x: 'AbNormal' if x == 1 else 'Normal')

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

In [130]:
df_sub['target'].value_counts()

Normal      16303
AbNormal     1058
Name: target, dtype: int64

In [131]:
df_sub.head(10)

Unnamed: 0,Set ID,target
0,0001be084fbc4aaa9d921f39e595961b,Normal
1,0005bbd180064abd99e63f9ed3e1ac80,Normal
2,000948934c4140d883d670adcb609584,Normal
3,000a6bfd02874c6296dc7b2e9c5678a7,Normal
4,0018e78ce91343678716e2ea27a51c95,Normal
5,001fda4596f545d0a3b0ce85fbea77d2,Normal
6,0020734a7b29472298358ad58645a0c9,Normal
7,00234c5914cd4c4a888d13f8b3773135,Normal
8,00297b6c93e44d49ac534758a23dc74e,Normal
9,002d904240d84b188d410d16383a9c3a,Normal


**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**
