# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기

### 필수 라이브러리

In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

### 엑셀 파일을 읽는 함수

읽어오는 속도가 느린 엑셀 파일을 위해 csv 파일로 변환하여 저장해 두고 사용합니다.

In [2]:
def read_excel_file(file_path: str, header: int = None) -> pd.DataFrame:
    csv_file = file_path.replace(".xlsx", ".csv")

    if not os.path.exists(csv_file):
        print("Converting excel to csv...")
        if header:
            df = pd.read_excel(file_path, header=header)
        else:
            df = pd.read_excel(file_path)

        df.to_csv(csv_file, index=False)
        print(f"  {file_path} -> {csv_file}")
        return df
    else:
        print(f"  Reading {csv_file}")
        return pd.read_csv(csv_file, low_memory=False)

### 엑셀 파일들 읽어오기

In [3]:
ROOT_DIR = "data"
RANDOM_STATE = 110

X_Dam = read_excel_file(os.path.join(ROOT_DIR, "Dam dispensing.xlsx"), header=1)

X_AutoClave = read_excel_file(
    os.path.join(ROOT_DIR, "Auto clave.xlsx"), header=1
)

X_Fill1 = read_excel_file(
    os.path.join(ROOT_DIR, "Fill1 dispensing.xlsx"), header=1
)

X_Fill2 = read_excel_file(
    os.path.join(ROOT_DIR, "Fill2 dispensing.xlsx"), header=1
)

y = pd.read_csv(os.path.join(ROOT_DIR, "train_y.csv"))

  Reading data\Dam dispensing.csv
  Reading data\Auto clave.csv
  Reading data\Fill1 dispensing.csv
  Reading data\Fill2 dispensing.csv


### 데이터 병합

x 데이터 병합

In [4]:
# Rename columns
X_Dam.columns = [i + " - Dam" for i in X_Dam.columns]
X_AutoClave.columns = [i + " - AutoClave" for i in X_AutoClave.columns]
X_Fill1.columns = [i + " - Fill1" for i in X_Fill1.columns]
X_Fill2.columns = [i + " - Fill2" for i in X_Fill2.columns]
X_Dam = X_Dam.rename(columns={"Set ID - Dam": "Set ID"})
X_AutoClave = X_AutoClave.rename(columns={"Set ID - AutoClave": "Set ID"})
X_Fill1 = X_Fill1.rename(columns={"Set ID - Fill1": "Set ID"})
X_Fill2 = X_Fill2.rename(columns={"Set ID - Fill2": "Set ID"})

# Merge X
X = pd.merge(X_Dam, X_AutoClave, on="Set ID")
X = pd.merge(X, X_Fill1, on="Set ID")
X = pd.merge(X, X_Fill2, on="Set ID")
X = X.drop(X[X.duplicated(subset="Set ID")].index).reset_index(drop=True)
X

Unnamed: 0,Wip Line - Dam,Process Desc. - Dam,Equipment - Dam,Model.Suffix - Dam,Workorder - Dam,LOT ID - Dam,Set ID,Box ID - Dam,Collect Date - Dam,Insp. Seq No. - Dam,...,Collect Result.37 - Fill2,Unit Time.37 - Fill2,Judge Value.37 - Fill2,Collect Result.38 - Fill2,Unit Time.38 - Fill2,Judge Value.38 - Fill2,Collect Result.39 - Fill2,Unit Time.39 - Fill2,Judge Value.39 - Fill2,Unnamed: 131 - Fill2
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000002,OP753345013050000002,,2023-05-04 08:57:23,1,...,1,,,1,,,0,,,
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000003,OP753345013050000003,,2023-05-04 09:11:35,1,...,2,,,1,,,0,,,
2,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000004,OP753345013050000004,,2023-05-04 09:13:19,1,...,3,,,1,,,0,,,
3,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000005,OP753345013050000005,,2023-05-04 09:15:24,1,...,4,,,1,,,0,,,
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000006,OP753345013050000006,,2023-05-04 09:17:27,1,...,5,,,1,,,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57862,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334505,4F1XB738-1,OP753345054040002685,OP753345054040002685,,2024-04-28 18:30:42,1,...,11,,,435,,,1,,,0.0
57863,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XB738-1,OP753345054040002686,OP753345054040002686,,2024-04-28 18:31:43,1,...,5,,,436,,,1,,,0.0
57864,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334505,4F1XB738-1,OP753345054040002687,OP753345054040002687,,2024-04-28 18:32:49,1,...,12,,,437,,,1,,,0.0
57865,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XB738-1,OP753345054040002688,OP753345054040002688,,2024-04-28 18:33:51,1,...,6,,,438,,,1,,,0.0


x 데이터와 y 데이터 병합

In [5]:
# Merge X and y
df_merged = pd.merge(X, y, "inner", on="Set ID")

# Drop columns with more than half of the values missing
drop_cols = []
for column in df_merged.columns:
    if (df_merged[column].notnull().sum() // 2) < df_merged[
        column
    ].isnull().sum():
        drop_cols.append(column)
df_merged = df_merged.drop(drop_cols, axis=1)

# Drop Lot ID
df_merged = df_merged.drop("LOT ID - Dam", axis=1)
df_merged

Unnamed: 0,Wip Line - Dam,Process Desc. - Dam,Equipment - Dam,Model.Suffix - Dam,Workorder - Dam,Set ID,Collect Date - Dam,Insp. Seq No. - Dam,Insp Judge Code - Dam,Collect Result - Dam,...,Collect Result.31 - Fill2,Collect Result.32 - Fill2,Collect Result.33 - Fill2,Collect Result.34 - Fill2,Collect Result.35 - Fill2,Collect Result.36 - Fill2,Collect Result.37 - Fill2,Collect Result.38 - Fill2,Collect Result.39 - Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000003,2023-05-04 09:11:35,1,OK,240.0,...,91.8,270.0,50,85,18.200,5.0,2,1,0,AbNormal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000004,2023-05-04 09:13:19,1,OK,240.0,...,91.8,270.0,50,85,18.400,6.0,3,1,0,AbNormal
2,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000007,2023-05-04 09:19:31,1,OK,240.0,...,91.8,270.0,50,85,18.600,1.0,6,1,0,AbNormal
3,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000011,2023-05-04 11:19:55,1,OK,240.0,...,91.8,270.0,50,85,18.200,4.0,18,1,0,AbNormal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1X5847-2,OP753345013050000024,2023-05-05 09:35:48,1,OK,240.0,...,91.8,270.0,50,85,18.100,6.0,12,1,0,AbNormal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XB738-1,OP753345054040002680,2024-04-28 18:25:20,1,OK,240.0,...,50.0,91.8,270,50,114.612,18.8,2,430,1,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334505,4F1XB738-1,OP753345054040002681,2024-04-28 18:26:27,1,OK,1000.0,...,50.0,91.8,270,50,114.612,19.0,9,431,1,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XB738-1,OP753345054040002682,2024-04-28 18:27:27,1,OK,240.0,...,50.0,91.8,270,50,114.612,19.3,3,432,1,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XB738-1,OP753345054040002684,2024-04-28 18:29:36,1,OK,240.0,...,50.0,91.8,270,50,114.612,19.6,4,434,1,Normal


In [6]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40506 entries, 0 to 40505
Columns: 189 entries, Wip Line - Dam to target
dtypes: float64(72), int64(77), object(40)
memory usage: 58.7+ MB


In [8]:
# 결측값 'unknown' 값으로 대체
df_merged['Collect Result.17 - Dam'].fillna('unknown', inplace=True)
df_merged['Collect Result.7 - Fill1'].fillna('unknown', inplace=True)
df_merged['Collect Result.17 - Fill2'].fillna('unknown', inplace=True)

In [9]:
print(df_merged['Collect Result.17 - Dam'].isnull().sum())
print(df_merged['Collect Result.7 - Fill1'].isnull().sum())
print(df_merged['Collect Result.17 - Fill2'].isnull().sum())

0
0
0


### 데이터 분할

In [10]:
df_merged = df_merged.sort_values(by=["Collect Date - Dam"])
df_train, df_val = train_test_split(
    df_merged,
    test_size=0.3,
    stratify=df_merged["target"],
    random_state=RANDOM_STATE,
)


def print_stats(df: pd.DataFrame):
    num_normal = len(df[df["target"] == "Normal"])
    num_abnormal = len(df[df["target"] == "AbNormal"])

    print(
        f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}"
        + f" ratio: {num_abnormal/num_normal}"
    )


# Print statistics
print(f"  \tAbnormal\tNormal")
print_stats(df_train)
print_stats(df_val)

  	Abnormal	Normal
  Total: Normal: 26709, AbNormal: 1645 ratio: 0.06158972630948369
  Total: Normal: 11447, AbNormal: 705 ratio: 0.06158818904516467


## 3. 모델 학습

In [11]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])
    weighted_F1 = f1_score(y_test, y_pred, average='weighted')

    metrics = pd.DataFrame({
        '정확도': [accuracy],
        '정밀도': [precision],
        '재현율': [recall],
        'F1 Score': [F1],
        'Weighted F1': [weighted_F1]
    })

    confusion_df = pd.DataFrame(confusion, index=['True', 'False'], columns=['True', 'False'])

    print("\n오차행렬:")
    display(confusion_df)
    print("평가 지표:")
    display(metrics)

In [12]:
# 모델 라이브러리
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# 보팅
from sklearn.ensemble import VotingClassifier

### 모델 정의 

In [13]:
# RandomForest
rf_model = RandomForestClassifier(
    n_estimators=1935
    , max_depth=32
    , min_samples_split=2
    , min_samples_leaf=1   
    , bootstrap=True
    , criterion='entropy'
    , class_weight='balanced'
    , random_state=0
)

# LightGBM_dart
lgbm_dart_model = LGBMClassifier(
    n_estimators=1029
    , num_leaves=167
    , max_depth=30
    , learning_rate=0.05767571715999541
    , min_child_samples=25
    , verbose=-1
    , boosting='dart'  # dart 사용
    , random_state=0
)

# XGBoost 
xgb_model = XGBClassifier(
    n_estimators=414
    , learning_rate=0.20046808426888615
    , max_depth=11
    , alpha=0.004365542651458743
    , gamma=0.00025712949731685885
    , reg_alpha=0.17168922089033928
    , reg_lambda=0.03881395024846057
    , colsample_bytree=0.32031741412326675
    , subsample=0.6269215430592496
    , objective='binary:logistic'  # 이진 분류
    , tree_method="exact"        
    , random_state=0
)

# ExtraTrees
et_model = ExtraTreesClassifier(
    n_estimators=100
    , max_depth=48
    , min_samples_split=3
	, min_samples_leaf=1
	, criterion='gini'
	, random_state=0
)

In [14]:
### 보팅 분류기 생성 ###
model = VotingClassifier(
    estimators=[
        ('lgb_dart',lgbm_dart_model),
        ('xgb', xgb_model),
        ('rf', rf_model),
        ('et', et_model)
    ],
    voting='soft'  # 'hard'는 다수결 투표, 'soft'는 확률 평균
)

In [15]:
df_merged = df_merged.sort_values(by=["Collect Date - Dam"])
df_train, df_val = train_test_split(
    df_merged,
    test_size=0.3,
    stratify=df_merged["target"],
    random_state=RANDOM_STATE,
)


def print_stats(df: pd.DataFrame):
    num_normal = len(df[df["target"] == "Normal"])
    num_abnormal = len(df[df["target"] == "AbNormal"])

    print(
        f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}"
        + f" ratio: {num_abnormal/num_normal}"
    )


# Print statistics
print(f"  \tAbnormal\tNormal")
print_stats(df_train)
print_stats(df_val)

  	Abnormal	Normal
  Total: Normal: 26709, AbNormal: 1645 ratio: 0.06158972630948369
  Total: Normal: 11447, AbNormal: 705 ratio: 0.06158818904516467


### 모델 학습

In [16]:
features = []

for col in df_train.columns:
    try:
        df_train[col] = df_train[col].astype(int)
        features.append(col)
    except:
        continue

if "Set ID" in features:
    features.remove("Set ID")

train_x = df_train[features]
train_y = df_train["target"]

model.fit(train_x, train_y)

VotingClassifier(estimators=[('lgb_dart',
                              LGBMClassifier(boosting='dart',
                                             learning_rate=0.05767571715999541,
                                             max_depth=30, min_child_samples=25,
                                             n_estimators=1029, num_leaves=167,
                                             random_state=0, verbose=-1)),
                             ('xgb',
                              XGBClassifier(alpha=0.004365542651458743,
                                            base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_by...
                                            monotone_constraints=None,
                                            n_estimators=414, n_jobs=N

In [26]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. 검증 데이터에 대한 예측 수행
val_x = df_val[features]  # 검증 데이터의 특성(features)
val_y = df_val["target"]  # 검증 데이터의 실제 레이블

# 예측 수행
val_pred = model.predict(val_x)

# 2. 성능 지표 계산
accuracy = accuracy_score(val_y, val_pred)
conf_matrix = confusion_matrix(val_y, val_pred)
class_report = classification_report(val_y, val_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")


Accuracy: 0.9435483870967742
Confusion Matrix:
[[   27   678]
 [    8 11439]]
Classification Report:
              precision    recall  f1-score   support

    AbNormal       0.77      0.04      0.07       705
      Normal       0.94      1.00      0.97     11447

    accuracy                           0.94     12152
   macro avg       0.86      0.52      0.52     12152
weighted avg       0.93      0.94      0.92     12152



In [36]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# 1. 검증 데이터에 대한 예측 수행
val_x = df_val[features]  # 검증 데이터의 특성(features)
val_y = df_val["target"]  # 검증 데이터의 실제 레이블

# 레이블 인코딩
label_encoder = LabelEncoder()
val_y_encoded = label_encoder.fit_transform(val_y)

# 예측 확률 수행
val_pred_proba = model.predict_proba(val_x)[:, 1]  # 양성 클래스(1)의 확률

# 스레스홀드 0.3 적용
threshold = 0.85
val_pred = (val_pred_proba >= threshold).astype(int)

# 2. 성능 지표 계산
accuracy = accuracy_score(val_y_encoded, val_pred)
conf_matrix = confusion_matrix(val_y_encoded, val_pred)
class_report = classification_report(val_y_encoded, val_pred, target_names=label_encoder.classes_)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

Accuracy: 0.8784562211981567
Confusion Matrix:
[[  141   564]
 [  913 10534]]
Classification Report:
              precision    recall  f1-score   support

    AbNormal       0.13      0.20      0.16       705
      Normal       0.95      0.92      0.93     11447

    accuracy                           0.88     12152
   macro avg       0.54      0.56      0.55     12152
weighted avg       0.90      0.88      0.89     12152



## 4. 제출하기

### 테스트 데이터 예측

테스트 데이터 불러오기

In [20]:
df_test_y = pd.read_csv(os.path.join("submission.csv"))

In [21]:
df_test = pd.merge(X, df_test_y, "inner", on="Set ID")
df_test_x = df_test[features]

for col in df_test_x.columns:
    try:
        df_test_x.loc[:, col] = df_test_x[col].astype(int)
    except:
        continue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [39]:
# 테스트 데이터에 대한 예측 확률 수행
test_pred_proba = model.predict_proba(df_test_x)[:, 1]  # 양성 클래스(1)의 확률

# 스레스홀드 0.85 적용
threshold = 0.85
test_pred = (test_pred_proba >= threshold).astype(int)

test_pred

array([0, 0, 0, ..., 1, 1, 1])

In [52]:
test_pred.sum()/len(test_pred)

0.9241979148666551

### 제출 파일 작성

In [41]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

In [53]:
import pandas as pd
df_csv = pd.read_csv('submission.csv')
df_csv['target'] = df_csv['target'].replace({
    0: 'AbNormal',
    1: 'Normal'
})

df_csv.to_csv('submission.csv', index=False)

In [55]:
df_csv = pd.read_csv('submission.csv')
df_csv.head(20)

Unnamed: 0,Set ID,target
0,OP753345013050000002,AbNormal
1,OP753345013050000005,AbNormal
2,OP753345013050000006,AbNormal
3,OP753345013050000008,AbNormal
4,OP753345013050000009,AbNormal
5,OP753345013050000015,AbNormal
6,OP753345013050000012,AbNormal
7,OP753345013050000031,Normal
8,OP753345013050000033,AbNormal
9,OP753345013050000036,Normal


**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**