# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm

### 데이터 읽어오기


In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load data
data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
# Drop NaN values
data = data.dropna(axis=1)
# Encode target
data.target = data.target.map({'Normal': 0, 'AbNormal': 1})
# Drop string values
data = data.select_dtypes(include=[np.number])
data

Unnamed: 0,Insp. Seq No._Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE STANDBY POSITION X Collect Result_Dam,CURE STANDBY POSITION Z Collect Result_Dam,CURE STANDBY POSITION Θ Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Z Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,1,240.0,2.5,-90,100,1150,33.5,0,1030,33.5,...,50.0,91.8,270,50,114.612,19.9,7,127,1,0
1,1,240.0,2.5,-90,70,1150,33.5,0,1030,33.5,...,91.8,270.0,50,85,19.600,7.0,185,1,0,0
2,1,1000.0,12.5,90,85,1150,33.5,0,280,33.5,...,50.0,91.8,270,50,114.612,19.8,10,73,1,0
3,1,1000.0,12.5,90,70,1150,33.5,0,280,33.5,...,91.8,270.0,50,85,19.900,12.0,268,1,0,0
4,1,240.0,2.5,-90,70,1150,33.5,0,1030,33.5,...,91.8,270.0,50,85,19.700,8.0,121,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,1,240.0,2.5,-90,70,1150,33.5,0,1030,33.5,...,91.8,270.0,50,85,19.200,1.0,318,1,0,0
40502,1,1000.0,12.5,90,100,1150,33.5,0,280,33.5,...,50.0,91.8,270,50,114.612,20.5,14,197,1,0
40503,1,240.0,2.5,-90,100,1150,33.5,0,1030,33.5,...,50.0,91.8,270,50,85.000,19.7,1,27,1,0
40504,1,1000.0,12.5,90,70,1150,33.5,0,280,33.5,...,91.8,270.0,50,85,20.100,13.0,117,1,0,0


### 언더 샘플링


데이타 불균형을 해결하기 위해 언더 샘플링을 진행합니다.


In [3]:
normal_ratio = 1.0  # 1.0 means 1:1 ratio

df_normal = data[data["target"] == 0]
df_abnormal = data[data["target"] == 1]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
0    2350
1    2350
Name: count, dtype: int64

### 데이터 분할


In [4]:
df_train, df_val = train_test_split(
    df_concat,
    test_size=0.3,
    stratify=df_concat["target"],
    random_state=RANDOM_STATE,
)

train_min = df_train.min(axis=0)
train_max = df_train.max(axis=0)
test_min = df_val.min(axis=0)
test_max = df_val.max(axis=0)


df_train = (df_train - train_min) / (train_max - train_min)
df_val = (df_val - test_min) / (test_max - test_min)

df_train = df_train.dropna(axis=1)
df_val = df_val.dropna(axis=1)

def print_stats(df: pd.DataFrame):
    num_normal = len(df[df["target"] == "Normal"])
    num_abnormal = len(df[df["target"] == "AbNormal"])

    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {num_abnormal/num_normal}")

# X, y, train, test
X_train = df_train.drop('target', axis = 1)
y_train = df_train['target']
X_test = df_val.drop('target', axis = 1)
y_test = df_val['target']

df_train

Unnamed: 0,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
2067,1.0,1.0,1.0,0.000000,0.0,1.0,0.0,1.000000,0.985294,0.991525,...,0.0,0.0,1.0,0.0,0.694129,0.811159,0.026694,0.128978,0.058824,0.0
3733,1.0,1.0,1.0,0.428571,0.0,1.0,1.0,0.452991,0.705882,0.432203,...,0.0,0.0,1.0,0.0,1.000000,0.858369,0.022587,0.105528,0.058824,1.0
3245,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,1.000000,1.000000,0.991525,...,0.0,0.0,1.0,0.0,0.694129,0.793991,0.006160,0.031826,0.058824,1.0
3620,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,0.000000,0.161765,0.000000,...,1.0,1.0,0.0,1.0,0.017560,0.085837,0.151951,0.001675,0.000000,1.0
3352,1.0,1.0,1.0,0.000000,0.0,1.0,0.0,0.008547,0.161765,0.008475,...,1.0,1.0,0.0,1.0,0.026856,0.515021,0.248460,0.001675,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
671,1.0,1.0,1.0,0.000000,0.0,1.0,1.0,0.307692,0.411765,0.296610,...,0.0,0.0,1.0,0.0,0.694129,0.828326,0.018480,0.328308,0.058824,0.0
623,0.0,0.0,0.0,0.000000,1.0,0.0,1.0,0.341880,0.632353,0.338983,...,0.0,0.0,1.0,0.0,0.694129,0.841202,0.012320,0.358459,0.058824,0.0
1690,1.0,1.0,1.0,0.000000,0.0,1.0,1.0,0.307692,0.661765,0.305085,...,0.0,0.0,1.0,0.0,0.694129,0.789700,0.032854,0.093802,0.058824,0.0
4644,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,0.000000,0.014706,0.008475,...,1.0,1.0,0.0,1.0,0.008263,0.000000,0.131417,0.001675,0.000000,1.0


## 3. 모델 학습


### 모델 정의


In [5]:
model = MLPClassifier(hidden_layer_sizes=(32, 16, 8, 4, 2), max_iter=1000, random_state=23)

model.fit(X_train, y_train)

# Step 8: Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Step 9: Evaluate the model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f'Training Accuracy: {train_accuracy: .4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Training Accuracy:  0.5000
Test Accuracy: 0.5000


### 모델 학습


## 4. 제출하기


### 테스트 데이터 예측


테스트 데이터 불러오기


In [6]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

# Drop NaN values
test_data = test_data.dropna(axis=1)
# Drop string values
test_data = test_data.select_dtypes(include=[np.number])

test_fin_min = test_data.min(axis=0)
test_fin_max = test_data.max(axis=0)

test_data = (test_data - test_fin_min) / (test_fin_max - test_fin_min)

test_data = test_data.dropna(axis=1)

In [7]:
test_pred = model.predict(test_data)

test_pred

array([1., 1., 1., ..., 1., 1., 1.])

### 제출 파일 작성


In [8]:
from sklearn.preprocessing import LabelEncoder

# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")

df_sub["target"] = test_pred
df_sub.target = df_sub.target.map({0: 'Normal', 1: 'AbNormal'})

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**
