# 세팅

In [None]:
# Mount Google Drive at /content/drive (uses the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import sys
import seaborn as sns # 데이터 시각화 라이브러리
import matplotlib.pyplot as plt # 그래프 라이브러리
import missingno as msno

# 경고 무시 설정
import warnings
warnings.filterwarnings('ignore')
# 그래프 인라인 표시 명령어
%matplotlib inline

In [None]:
# Adjust these settings to match the environment the notebook runs in.
ROOT_DIR = "/content/drive/MyDrive/HackJun/data"
RANDOM_STATE = 110  # fixed seed

In [None]:
# Load the training table from ROOT_DIR/train.csv.
df_train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"), low_memory=False)

## 1:1 Under Sampling

In [None]:
# Number of Normal rows to keep per AbNormal row (1.0 -> 1:1).
normal_ratio = 1.0

# Split the training rows by their "target" label.
df_normal = df_train.loc[df_train["target"].eq("Normal")]
df_abnormal = df_train.loc[df_train["target"].eq("AbNormal")]

# Row counts per class, reported before any sampling.
num_normal, num_abnormal = df_normal.shape[0], df_abnormal.shape[0]
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

In [None]:
# "Normal" 클래스의 샘플을 "AbNormal" 클래스의 샘플 수에 맞게 랜덤으로 추출
# Draw Normal rows without replacement so their count matches the AbNormal
# count scaled by normal_ratio; the fixed seed keeps the draw reproducible.
df_normal = df_normal.sample(
    n=int(normal_ratio * num_abnormal),
    random_state=RANDOM_STATE,
    replace=False,
)

# Stack both classes into one frame with a fresh 0..n-1 index.
df_concat = pd.concat([df_normal, df_abnormal], axis=0, ignore_index=True)

# Per-class row counts of the combined frame.
df_concat.value_counts("target")

## 전처리
- ML 특성상 Complexity 고려해야 하므로, 피쳐수 대폭 축약

In [None]:
######################################## 복합 버전 #################################
# 삭제 및 변환 근거 EDA.py 참고
def _replace_with_group_means(df, specs, groups, keywords):
    """Add one row-mean column per spec, then drop the raw source columns.

    Args:
        df: Frame to extend; the new columns are added to it in place.
        specs: Iterable of ``(new_column, group, spec_keywords)``. Each new
            column is the row-wise mean of every column whose name contains
            ``group`` and at least one of ``spec_keywords``.
        groups: Name fragments used to select the raw columns to delete.
        keywords: Keyword fragments used to select the raw columns to delete.

    Returns:
        A frame without the columns whose name contains one of ``groups``
        and one of ``keywords``; names containing ``Mean`` are kept.
    """
    # NOTE(review): matching is by plain substring, so a keyword that also
    # occurs inside another word of a column name counts as a match. Confirm
    # against the real column names that each group picks what is intended.
    for new_column, group, spec_keywords in specs:
        members = [
            col for col in df.columns
            if group in col and any(key in col for key in spec_keywords)
        ]
        df[new_column] = df[members].mean(axis=1)

    consumed = [
        col for col in df.columns
        if 'Mean' not in col
        and any(group in col for group in groups)
        and any(key in col for key in keywords)
    ]
    return df.drop(columns=consumed)


def preprocess_data(df_origin):
    """Reduce the raw process table to a compact feature frame.

    The input is copied and never modified. Steps, in order:

    1. Drop every column that contains a missing value or holds a single
       distinct value.
    2. Drop the ``Workorder_*`` columns and every column whose name contains
       ``PalletID``, ``Receip`` or ``WorkMode``.
    3. Collapse the four ``Model.Suffix_*`` columns into one ``model_suffix``
       flag.
    4. One-hot encode the ``Equipment_*`` columns as 0/1 integers.
    5. Encode ``Chamber Temp. Judge Value_AutoClave`` (OK -> 0, NG -> 1).
    6. Replace the per-stage position (X/Y/Z), speed, time and volume
       columns with their row-wise means.
    7. Encode the Dam cure start angle (-90 -> 0, 90 -> 1) and drop the raw
       start/end angle columns.
    8. If ``target`` survived step 1, move it to the last position and map
       Normal -> 0, AbNormal -> 1.

    Args:
        df_origin: Raw frame as loaded from the CSV files. After step 1 it
            must still contain the ``Model.Suffix_*`` and ``Equipment_*``
            columns, ``Chamber Temp. Judge Value_AutoClave`` and
            ``CURE START POSITION Θ Collect Result_Dam``; otherwise pandas
            raises ``KeyError``.

    Returns:
        A new DataFrame holding the engineered features.

    Note:
        Step 1 depends on the rows passed in, so two different frames (for
        example a subsample and the full table) can end up with different
        column sets.
    """
    df = df_origin.copy()

    # A column with any missing value is dropped, which already covers the
    # "more than half missing" case; constant columns carry no information.
    unusable = [
        col for col in df.columns
        if df[col].isnull().any() or df[col].nunique() == 1
    ]
    df = df.drop(columns=unusable)

    # Work-order columns. They may already be gone after the filter above,
    # so a missing column is not treated as an error.
    df = df.drop(
        columns=['Workorder_Dam', 'Workorder_AutoClave', 'Workorder_Fill1', 'Workorder_Fill2'],
        errors='ignore',
    )

    # Columns whose name contains PalletID, Receip or WorkMode.
    id_like = [
        col for col in df.columns
        if any(key in col for key in ('PalletID', 'Receip', 'WorkMode'))
    ]
    df = df.drop(columns=id_like)

    # model_suffix is 0 when all four stages carry the same suffix and that
    # suffix is one of `group_zero`; every other row gets 1.
    suffix_cols = ['Model.Suffix_Dam', 'Model.Suffix_Fill1', 'Model.Suffix_Fill2', 'Model.Suffix_AutoClave']
    group_zero = ['AJX75334501', 'AJX75334505', 'AJX75334507', 'AJX75334508']
    reference = df[suffix_cols[0]]
    stages_agree = df[suffix_cols].eq(reference, axis=0).all(axis=1)
    df['model_suffix'] = (~(stages_agree & reference.isin(group_zero))).astype(int)
    df = df.drop(columns=suffix_cols)

    # One-hot encode the equipment columns.
    df = pd.get_dummies(df, columns=['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2'])

    # Store boolean columns (e.g. the dummies above) as 0/1 integers with a
    # single dtype cast instead of an element-wise pass over the whole frame.
    bool_cols = df.select_dtypes(include='bool').columns
    df = df.astype({col: int for col in bool_cols})

    # OK / NG judgement -> 0 / 1.
    judge_col = 'Chamber Temp. Judge Value_AutoClave'
    df[judge_col] = df[judge_col].replace({'OK': 0, 'NG': 1})

    # Position columns: one mean per stage and axis.
    stages = ('Dam', 'Fill1', 'Fill2')
    axes = ('X', 'Y', 'Z')
    df = _replace_with_group_means(
        df,
        [(f'{stage}_Position_{axis}_Mean', stage, (axis,)) for stage in stages for axis in axes],
        stages,
        axes,
    )

    # Speed columns: one mean per stage.
    speed_keys = ('SPEED', 'Speed')
    df = _replace_with_group_means(
        df,
        [(f'{stage}_SPEED_Mean', stage, speed_keys) for stage in stages],
        stages,
        speed_keys,
    )

    # Time columns: one mean per stage, including AutoClave.
    time_stages = ('Dam', 'AutoClave', 'Fill1', 'Fill2')
    time_keys = ('TIME', 'Time', 'time')
    df = _replace_with_group_means(
        df,
        [(f'{stage}_TIME_Mean', stage, time_keys) for stage in time_stages],
        time_stages,
        time_keys,
    )

    # Volume columns: one mean for Dam and one for Fill1.
    volume_stages = ('Dam', 'Fill1')
    volume_keys = ('Volume',)
    df = _replace_with_group_means(
        df,
        [(f'{stage}_Dispense_Volume_Mean', stage, volume_keys) for stage in volume_stages],
        volume_stages,
        volume_keys,
    )

    # Dam cure angle: keep only the encoded start angle (-90 -> 0, 90 -> 1).
    theta_start = 'CURE START POSITION Θ Collect Result_Dam'
    theta_end = 'CURE END POSITION Θ Collect Result_Dam'
    df['CURE_POSITION_Θ_Collect_Result_Dam'] = df[theta_start].replace({-90: 0, 90: 1})
    # The END column may already have been removed by the first filter.
    df = df.drop(columns=[theta_start, theta_end], errors='ignore')

    # Move the label to the last column and encode it as 0 / 1.
    if 'target' in df.columns:
        df['target'] = df.pop('target').map({'Normal': 0, 'AbNormal': 1})

    return df

In [None]:
# Preprocess the undersampled frame.
df_pre = preprocess_data(df_concat)

In [None]:
# Preprocess the full training frame.
# NOTE(review): preprocess_data drops columns based on the rows it receives,
# so df_all may not have the same columns as df_pre; verify before mixing them.
df_all = preprocess_data(df_train)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# One instance of each scaler, reused by the cells below.
scaler_stand = StandardScaler()
scaler_minmax = MinMaxScaler()

In [None]:
# Feature matrix and label vector of the preprocessed, undersampled frame.
X_df = df_pre.drop(['target'], axis=1)
X = X_df.values
Y_df = df_pre['target']
Y = Y_df.values

# NOTE(review): later cells read `df_all_stand`, `df_all_minmax` and `Y_all`,
# none of which is assigned anywhere in this notebook as shown. Confirm where
# they are supposed to come from.

## Scaled feature frames WITHOUT the target column ##
# Columns with exactly two distinct values are treated as binary flags and are
# left untouched by the standard scaler and by the log transform.
binary_columns = [col for col in df_pre.columns if col != 'target' and df_pre[col].nunique() == 2]

# Non-binary columns in frame order. Selection and assignment below share this
# one list: Index.difference() returns its labels sorted, so pairing it with
# an ndarray built in frame order would write values into the wrong columns.
scaled_columns = [col for col in X_df.columns if col not in binary_columns]

# Standardization (non-binary columns only)
df_stand = X_df.copy()
df_stand[scaled_columns] = scaler_stand.fit_transform(df_stand[scaled_columns])

# Min-Max scaling (all feature columns)
df_minmax = pd.DataFrame(scaler_minmax.fit_transform(X_df), columns=X_df.columns)

# log1p applied on top of the Min-Max result (non-binary columns only)
df_log = df_minmax.copy()
df_log[scaled_columns] = np.log1p(df_minmax[scaled_columns])

## The same frames WITH the target column appended ##
df_stand_t = df_stand.assign(target=df_pre['target'])
df_minmax_t = df_minmax.assign(target=df_pre['target'])
df_log_t = df_log.assign(target=df_pre['target'])

In [None]:
# 성능 평가를 위한 함수 정의
def evaluate_model(y_true, y_pred, model_name):
    """Print classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to compare against ``y_true``.
        model_name: Name shown in the report header.

    Returns:
        None. F1, precision, recall, accuracy and the confusion matrix are
        written to standard output.
    """
    scorers = (
        ("F1 Score:", f1_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("Accuracy:", accuracy_score),
    )
    # Compute every figure before printing anything, so a failing metric
    # does not leave a half-written report.
    results = [(label, scorer(y_true, y_pred)) for label, scorer in scorers]
    conf_matrix = confusion_matrix(y_true, y_pred)

    print(f"=== {model_name} Performance ===")
    for label, value in results:
        print(label, value)
    print("Confusion Matrix:\n", conf_matrix)
    print("\n")


In [None]:
# 필요한 라이브러리 임포트
from sklearn.model_selection import KFold  # 교차 검증을 위한 K-Fold 클래스
from sklearn.model_selection import cross_validate  # 교차 검증을 위한 함수
from sklearn.model_selection import cross_val_score  # 교차 검증 점수를 계산하기 위한 함수
from sklearn.model_selection import StratifiedKFold  # 층화된 K-Fold 교차 검증을 위한 클래스
from sklearn.neighbors import KNeighborsClassifier  # K-최근접 이웃 분류기
from sklearn.tree import DecisionTreeClassifier  # 의사결정 트리 분류기
from sklearn.ensemble import RandomForestClassifier  # 랜덤 포레스트 분류기
from sklearn.ensemble import GradientBoostingClassifier  # 그라디언트 부스팅 분류기
from sklearn.experimental import enable_hist_gradient_boosting  # 히스토그램 그라디언트 부스팅 분류기 사용을 위한 모듈
from sklearn.ensemble import HistGradientBoostingClassifier  # 히스토그램 그라디언트 부스팅 분류기
from sklearn.naive_bayes import GaussianNB  # 가우시안 나이브 베이즈 분류기
from sklearn.svm import SVC  # 서포트 벡터 머신 분류기

# Logistic

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit logistic regression on the standardized, undersampled features.
model = LogisticRegression(max_iter=10000, random_state=110)
model.fit(df_stand.to_numpy(), Y)

# Predict labels for df_all_stand.
y_pred = model.predict(df_all_stand.to_numpy())

In [None]:
# Performance evaluation
evaluate_model(Y_all, y_pred, "Logistic Regression")

=== Logistic Regression Performance ===
F1 Score: 0.04036243822075783
Precision: 0.6282051282051282
Recall: 0.020851063829787235
Accuracy: 0.9424776576309682
Confusion Matrix:
 [[38127    29]
 [ 2301    49]]




# IF

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Build the Isolation Forest; contamination=0.2 sets the expected outlier
# share to 20%.
iso_forest = IsolationForest(
    n_estimators=1000,
    max_samples=1024,
    contamination=0.2,
    random_state=110,
)

In [None]:
# Fit on the min-max scaled features only; no labels are passed.
iso_forest.fit(df_minmax.values)

In [None]:
# Outlier detection: the forest returns 1 for inliers and -1 for outliers;
# remap to the label encoding used elsewhere (0: normal, 1: abnormal).
raw_flags = iso_forest.predict(df_all_minmax.values)
y_pred = np.where(raw_flags == 1, 0, 1)

In [None]:
# Performance evaluation
evaluate_model(Y_all, y_pred, "IF")

=== IF Performance ===
F1 Score: 0.11243660261447246
Precision: 0.06755944716284659
Recall: 0.33489361702127657
Accuracy: 0.6932553201994767
Confusion Matrix:
 [[27294 10862]
 [ 1563   787]]




# KNN

In [None]:
# KNN: K-nearest-neighbours classifier with default settings.
model = KNeighborsClassifier()

# Stratified K-fold cross-validation on every available CPU core
# (n_jobs=-1), keeping the train-fold scores next to the test-fold scores.
score = cross_validate(
    model,
    df_all_stand.values,
    Y_all,
    cv=StratifiedKFold(),
    n_jobs=-1,
    return_train_score=True,
)

# Mean train score, then mean test score.
print(score['train_score'].mean(), score['test_score'].mean())

0.9466005021734075 0.9419838455812328


In [None]:
# Fit on df_all_stand and predict on the same rows, so the metrics printed
# below are in-sample figures.
model.fit(df_all_stand.values, Y_all)
# Run prediction
y_pred = model.predict(df_all_stand.values)
evaluate_model(Y_all, y_pred, "KNN")

=== KNN Performance ===
F1 Score: 0.21157390342793952
Precision: 0.790633608815427
Recall: 0.12212765957446808
Accuracy: 0.9471930084431937
Confusion Matrix:
 [[38080    76]
 [ 2063   287]]




# Random Forest

In [None]:
# Random Forest with default settings, scored by stratified K-fold
# cross-validation (train-fold scores are kept as well).
model = RandomForestClassifier()
score = cross_validate(
    model,
    df_all_stand.values,
    Y_all,
    cv=StratifiedKFold(),
    n_jobs=-1,
    return_train_score=True,
)
# Mean train score, then mean test score.
print(score['train_score'].mean(), score['test_score'].mean())

0.9996173412884216 0.9425270097334165


In [None]:
# Fit on df_all_stand and predict on the same rows, so the metrics printed
# below are in-sample figures.
model.fit(df_all_stand.values, Y_all)
# Run prediction
y_pred = model.predict(df_all_stand.values)
evaluate_model(Y_all, y_pred, "Random Forest")

=== Random Forest Performance ===
F1 Score: 0.9959445037353256
Precision: 0.9991434689507495
Recall: 0.9927659574468085
Accuracy: 0.9995309336888363
Confusion Matrix:
 [[38154     2]
 [   17  2333]]




In [None]:
# Importances of the model fitted in the previous cell.
feature_importances = model.feature_importances_

# Label each importance with the columns of the frame the model was fitted on
# (df_all_stand) rather than with the columns of a different frame.
feature_importances_df = pd.DataFrame({
    'Feature': df_all_stand.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Show the 30 most important features.
feature_importances_df.head(30)

# Gradient Boosting

In [None]:
# Gradient boosting with default settings, scored by stratified K-fold
# cross-validation (train-fold scores are kept as well).
model = GradientBoostingClassifier()
score = cross_validate(
    model,
    df_all_stand.values,
    Y_all,
    cv=StratifiedKFold(),
    n_jobs=-1,
    return_train_score=True,
)
# Mean train score, then mean test score.
print(score['train_score'].mean(), score['test_score'].mean())

0.9453167397618035 0.944502013596488


In [None]:
# Fit on df_all_stand and predict on the same rows, so the metrics printed
# below are in-sample figures.
model.fit(df_all_stand.values, Y_all)
# Run prediction
y_pred = model.predict(df_all_stand.values)
evaluate_model(Y_all, y_pred, "GB")

=== GB Performance ===
F1 Score: 0.11151223425591657
Precision: 0.972027972027972
Recall: 0.05914893617021277
Accuracy: 0.9453167431985385
Confusion Matrix:
 [[38152     4]
 [ 2211   139]]




# XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost classifier, cross-validated on the undersampled standardized
# features with stratified K-fold (train-fold scores are kept as well).
model = XGBClassifier(
    max_depth=10,
    learning_rate=0.01,
    n_estimators=500,
)
score = cross_validate(
    model,
    df_stand.values,
    Y,
    cv=StratifiedKFold(),
    n_jobs=-1,
    return_train_score=True,
)
# Mean train score, then mean test score.
print(score['train_score'].mean(), score['test_score'].mean())

0.8295478723404255 0.760531914893617


In [None]:
# Fit the XGBoost model on the undersampled standardized features.
model.fit(df_stand.values, Y)
# Predict on df_all_stand and report under the model's own name; the report
# header previously read "GB", the label of the gradient-boosting section.
y_pred = model.predict(df_all_stand.values)
evaluate_model(Y_all, y_pred, "XGBoost")

=== GB Performance ===
F1 Score: 0.11948608137044966
Precision: 0.12025862068965518
Recall: 0.11872340425531915
Accuracy: 0.8984841751839233
Confusion Matrix:
 [[36115  2041]
 [ 2071   279]]




# HistGradientBoosting

In [None]:
# Histogram-based gradient boosting with default settings, scored by
# stratified K-fold cross-validation (train-fold scores are kept as well).
model = HistGradientBoostingClassifier()
score = cross_validate(
    model,
    df_all_stand.values,
    Y_all,
    cv=StratifiedKFold(),
    n_jobs=-1,
    return_train_score=True,
)
# Mean train score, then mean test score.
print(score['train_score'].mean(), score['test_score'].mean())

0.9443168875831075 0.9438848241274813


In [None]:
# Fit on df_all_stand and predict on the same rows, so the metrics printed
# below are in-sample figures.
model.fit(df_all_stand.values, Y_all)
# Run prediction
y_pred = model.predict(df_all_stand.values)
evaluate_model(Y_all, y_pred, "HGB")

=== HGB Performance ===
F1 Score: 0.07513270722743977
Precision: 0.9292929292929293
Recall: 0.03914893617021276
Accuracy: 0.9440823581691601
Confusion Matrix:
 [[38149     7]
 [ 2258    92]]




# Naive Bayes

In [None]:
# Gaussian Naive Bayes with default settings, scored by stratified K-fold
# cross-validation (train-fold scores are kept as well).
model = GaussianNB()
score = cross_validate(
    model,
    df_all_stand.values,
    Y_all,
    cv=StratifiedKFold(),
    n_jobs=-1,
    return_train_score=True,
)
# Mean train score, then mean test score.
print(score['train_score'].mean(), score['test_score'].mean())

0.8328889497832497 0.8320495859009821


In [None]:
# Fit on df_all_stand and predict on the same rows, so the metrics printed
# below are in-sample figures.
model.fit(df_all_stand.values, Y_all)
# Report under the model's own name; the header previously read "HGB", the
# label of the preceding section.
y_pred = model.predict(df_all_stand.values)
evaluate_model(Y_all, y_pred, "Naive Bayes")

=== HGB Performance ===
F1 Score: 0.11766247379454926
Precision: 0.08500567966679289
Recall: 0.19106382978723405
Accuracy: 0.833753024243322
Confusion Matrix:
 [[33323  4833]
 [ 1901   449]]




# SVM

In [None]:
# Support Vector Machine (SVC) with default settings, scored by stratified
# K-fold cross-validation (train-fold scores are kept as well).
model = SVC()
score = cross_validate(
    model,
    df_all_stand.values,
    Y_all,
    cv=StratifiedKFold(),
    n_jobs=-1,
    return_train_score=True,
)
# Mean train score, then mean test score.
print(score['train_score'].mean(), score['test_score'].mean())

0.9432306320331184 0.9429467140520515


In [None]:
# Fit on df_all_stand and predict on the same rows, so the metrics printed
# below are in-sample figures.
model.fit(df_all_stand.values, Y_all)
# Run prediction
y_pred = model.predict(df_all_stand.values)
evaluate_model(Y_all, y_pred, "SVM")

=== SVM Performance ===
F1 Score: 0.04329725228975853
Precision: 1.0
Recall: 0.022127659574468085
Accuracy: 0.9432676640497704
Confusion Matrix:
 [[38156     0]
 [ 2298    52]]




# Test

In [None]:
# Load the test set and run it through the same preprocessing as the
# training data.
test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"), low_memory=False)
test_pre = preprocess_data(test)

# Keep exactly the training feature columns, in training order.
# preprocess_data drops columns based on the rows it sees, so the test frame
# can come out with a different column set (and it carries 'Set ID').
# NOTE(review): assumes `model` was fitted on the same columns as df_stand;
# confirm against how df_all_stand is built.
feature_columns = list(df_stand.columns)
missing_columns = [col for col in feature_columns if col not in test_pre.columns]
if missing_columns:
    raise ValueError(
        f"test.csv lacks training features after preprocessing: {missing_columns}"
    )
test_X = test_pre[feature_columns]

# Reuse the scaler fitted on the training data instead of re-fitting it on
# the test set, and scale only the non-binary columns, exactly as was done
# for the training features.
numeric_columns = [col for col in feature_columns if col not in binary_columns]
X_stand = test_X.copy()
X_stand[numeric_columns] = scaler_stand.transform(test_X[numeric_columns])

y_pred_test = model.predict(X_stand.values)

# Fill the submission template with the predicted labels.
submission = pd.read_csv("/content/drive/MyDrive/HackJun/submission.csv", low_memory=False)
submission["target"] = y_pred_test

# 0 / 1 -> original label strings
submission['target'] = submission['target'].replace({0: 'Normal', 1: 'AbNormal'})

# Save the submission file
submission.to_csv("submission.csv", index=False)

In [None]:
# Display the submission frame.
submission