# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [10]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

### 데이터 읽어오기


In [2]:
# Seed reused by the randomized steps of this notebook, and the data folder.
RANDOM_STATE = 110
ROOT_DIR = "data"

# Read the training set and show it as the cell output.
train_csv_path = os.path.join(ROOT_DIR, "train.csv")
train_data = pd.read_csv(train_csv_path)
train_data

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,,,...,318,,,1,,,0,,,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,,,...,14,,,197,,,1,,,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,,,...,1,,,27,,,1,,,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,,,...,117,,,1,,,0,,,Normal


### 데이터 결측치 제거

In [3]:
# Summarize the raw training frame (column dtypes, memory usage).
print("[기본 데이터 정보 확인]")
train_data.info()
print("\n")

# Drop every column whose values are NaN in all rows.
print("[모든 행이 NaN인 열 제거]")
train_cleaned = train_data.dropna(how="all", axis="columns")
print(train_cleaned)
print("\n")

# Summarize the frame again once the empty columns are gone.
print("[train_cleaned 정보 확인]")
train_cleaned.info()

[기본 데이터 정보 확인]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 464 entries, Wip Line_Dam to target
dtypes: float64(350), int64(77), object(37)
memory usage: 143.4+ MB


[모든 행이 NaN인 열 제거]
      Wip Line_Dam Process Desc._Dam     Equipment_Dam Model.Suffix_Dam  \
0          IVI-OB6     Dam Dispenser  Dam dispenser #1      AJX75334505   
1          IVI-OB6     Dam Dispenser  Dam dispenser #1      AJX75334505   
2          IVI-OB6     Dam Dispenser  Dam dispenser #2      AJX75334501   
3          IVI-OB6     Dam Dispenser  Dam dispenser #2      AJX75334501   
4          IVI-OB6     Dam Dispenser  Dam dispenser #1      AJX75334501   
...            ...               ...               ...              ...   
40501      IVI-OB6     Dam Dispenser  Dam dispenser #1      AJX75334501   
40502      IVI-OB6     Dam Dispenser  Dam dispenser #2      AJX75334501   
40503      IVI-OB6     Dam Dispenser  Dam dispenser #1      AJX75334501   
40504      IVI-OB6     Dam

### 중복 데이터 제거

In [4]:
# 1. Duplicate-row check
# Count rows that repeat an earlier row across all columns
# (pass subset=[...] to duplicated() to compare specific columns only).
duplicate_mask = train_cleaned.duplicated()
dup = duplicate_mask.sum()
print(f"중복 행 개수: {dup}")

# Author's note: no duplicated rows were found.
# Author's note: Wip Line_Dam, Process Desc._Dam and Equipment_Dam are all identical.

# Part of Seonghyeon's code was removed here.

중복 행 개수: 0


### 동일한 값을 가지는 열 삭제 및 잘못 유입된 값 제거

In [None]:
# Find the columns that hold one single value in every row
# (NaN is counted as a value because dropna=False).
unique_counts = train_cleaned.nunique(dropna=False)
constant_columns = unique_counts.index[unique_counts == 1].tolist()

print("모든 행이 동일한 값을 가지는 열:", constant_columns)

# Those columns carry no information, so remove them.
train_cleaned = train_cleaned.drop(columns=constant_columns)

In [None]:
# Helper that turns 'OK' entries of a column into NaN
def replace_ok_with_nan(column):
    """Return ``column`` with every value equal to 'OK' replaced by NaN.

    Args:
        column: pandas object supporting ``.replace`` (used with a Series below).

    Returns:
        A new object of the same kind; the input is not modified.
    """
    cleaned = column.replace(to_replace='OK', value=np.nan)
    return cleaned

# Columns to clean (presumably the ones where a stray 'OK' string was
# recorded among the collected values -- verify against the data).
columns_to_clean = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
]

# Replace the 'OK' entries with NaN, one column at a time.
for column_name in columns_to_clean:
    cleaned_column = replace_ok_with_nan(train_cleaned[column_name])
    train_cleaned.loc[:, column_name] = cleaned_column

# Write the cleaned frame next to the raw data (no index column).
save_path = os.path.join(ROOT_DIR, "cleaned_train_data.csv")
train_cleaned.to_csv(save_path, index=False)

### 이상치 제거

In [None]:
# Outlier detection (z-score)
def detect_outliers_zscore(data):
    """Flag the values lying more than 3 standard deviations from the mean.

    NaN entries are left out of the mean / standard-deviation computation
    (``nan_policy='omit'``). With SciPy's default ``'propagate'`` policy a
    single NaN turns every z-score into NaN, so no value would ever be
    flagged in a column that has missing data.

    Args:
        data: array-like of numeric values (list, ndarray, pandas Series).

    Returns:
        Boolean array of the same shape as ``data``; True marks an outlier.
        NaN positions are reported as False.
    """
    z_scores = np.abs(stats.zscore(data, nan_policy='omit'))
    # Comparisons against NaN evaluate to False, so missing values are never flagged.
    return z_scores > 3

# For every numeric column, draw the overall histogram and overlay the
# values flagged as z-score outliers in red.
numeric_columns = train_cleaned.select_dtypes(include=[np.number]).columns
for column in numeric_columns:
    values = train_cleaned[column]

    plt.figure(figsize=(10, 6))

    # Boolean mask of the outliers in this column
    outliers_zscore = detect_outliers_zscore(values)
    outlier_values = train_cleaned.loc[outliers_zscore, column]

    # Histogram of all values
    sns.histplot(values, kde=True, color='lightblue', label='Data', bins=30)

    # Histogram of the flagged values only
    sns.histplot(outlier_values, kde=True, color='red', label='Z-score Outliers', bins=30)

    plt.title(f'Z-score Outliers Detection in {column}')
    plt.legend()
    plt.show()

### 언더 샘플링


데이터 불균형을 해결하기 위해 언더 샘플링을 진행합니다.


In [5]:
normal_ratio = 1.0  # 1.0 means 1:1 ratio

# Sample from train_cleaned rather than the raw train_data; otherwise the
# column clean-up performed on train_cleaned never reaches the model.
df_normal = train_cleaned[train_cleaned["target"] == "Normal"]
df_abnormal = train_cleaned[train_cleaned["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

# Sampling without replacement raises ValueError when more rows are requested
# than exist, so cap the request at the number of Normal rows available.
num_sampled = min(int(num_abnormal * normal_ratio), num_normal)
df_normal = df_normal.sample(n=num_sampled, replace=False, random_state=RANDOM_STATE)

# Stack the down-sampled Normal rows with all AbNormal rows and show the class counts.
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
AbNormal    2350
Normal      2350
Name: count, dtype: int64

### 데이터 분할


In [6]:
# Hold out 30% of the balanced data for validation; stratifying on the
# label keeps the class proportions the same in both parts.
split_labels = df_concat["target"]
df_train, df_val = train_test_split(
    df_concat,
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=split_labels,
)


def print_stats(df: pd.DataFrame) -> None:
    """Print the Normal / AbNormal row counts of ``df`` and their ratio.

    Args:
        df: frame with a "target" column holding "Normal" / "AbNormal" labels.

    The ratio is AbNormal divided by Normal. When ``df`` has no Normal rows
    the ratio is printed as ``nan`` instead of raising ZeroDivisionError.
    """
    target_counts = df["target"].value_counts()
    num_normal = int(target_counts.get("Normal", 0))
    num_abnormal = int(target_counts.get("AbNormal", 0))

    # Guard the division so an empty or single-class frame can still be reported.
    ratio = num_abnormal / num_normal if num_normal else float("nan")

    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {ratio}")


# Report the class balance of the training and validation splits.
print("  \tAbnormal\tNormal")
for split_frame in (df_train, df_val):
    print_stats(split_frame)

  	Abnormal	Normal
  Total: Normal: 1645, AbNormal: 1645 ratio: 1.0
  Total: Normal: 705, AbNormal: 705 ratio: 1.0


## 3. 모델 학습


### 모델 정의


In [7]:
# Random forest with default hyper-parameters and a fixed seed.
model = RandomForestClassifier(
    random_state=RANDOM_STATE,
)

### 모델 학습


In [8]:
# df_train comes out of train_test_split; take an independent copy so the
# casts below do not write into a slice of another frame (SettingWithCopyWarning).
df_train = df_train.copy()

# Keep as features only the columns that can be cast to int.
features = []

for col in df_train.columns:
    # The label must never become a feature, even if it were int-castable.
    if col == "target":
        continue
    try:
        converted = df_train[col].astype(int)
    except (ValueError, TypeError, OverflowError):
        # Not int-convertible (e.g. contains NaN or non-numeric text): skip it.
        continue
    df_train[col] = converted
    features.append(col)

train_x = df_train[features]
train_y = df_train["target"]


### StratifiedKFold를 사용한 교차 검증

In [9]:
# cross_validate scores several metrics from one fit per fold; calling
# cross_val_score once per metric refits the model for every metric.
from sklearn.model_selection import cross_validate

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

cv_results = cross_validate(
    model,
    train_x,
    train_y,
    cv=skf,
    scoring=["accuracy", "f1_macro", "precision_macro"],
)

# Per-fold scores are stored under "test_<scorer name>".
accuracy_scores = cv_results["test_accuracy"]
f1_scores = cv_results["test_f1_macro"]
precision_scores = cv_results["test_precision_macro"]

print(f"정확도: {np.mean(accuracy_scores):.4f}")
print(f"정밀도: {np.mean(precision_scores):.4f}")
print(f"F1스코어: {np.mean(f1_scores):.4f}")


NameError: name 'StratifiedKFold' is not defined

### 모델 학습

In [None]:
# Fit the classifier on the selected training features and labels.
model.fit(X=train_x, y=train_y)

## 4. 제출하기


### 테스트 데이터 예측


테스트 데이터 불러오기


In [None]:
# Read the test set from the same data folder as the training set.
test_csv_path = os.path.join(ROOT_DIR, "test.csv")
test_data = pd.read_csv(test_csv_path)

In [None]:
# Select the feature columns chosen during training; copy so the casts
# below do not write into a slice of test_data.
df_test_x = test_data[features].copy()

# Apply the same int cast that was applied to the training features.
failed_columns = []
for col in df_test_x.columns:
    try:
        df_test_x[col] = df_test_x[col].astype(int)
    except (ValueError, TypeError, OverflowError):
        # Keep the column as-is (best effort) but remember it for the report below.
        failed_columns.append(col)

# Surface columns that could not be cast instead of ignoring them silently.
if failed_columns:
    print(f"Columns left uncast (not int-convertible in test data): {failed_columns}")

In [None]:
# Predict a label for every test row and show the result as the cell output.
test_pred = model.predict(X=df_test_x)
test_pred

### 제출 파일 작성


In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**
