# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [557]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

### 데이터 읽어오기


In [558]:
RANDOM_STATE = 110

train_data = pd.read_csv("../../data/train_data.csv")
test_data = pd.read_csv("../../data/test_data.csv")

---

In [559]:
train_data.info()
print('---')
# test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 119 entries, Model.Suffix to Dispenser_num
dtypes: float64(58), int64(51), object(10)
memory usage: 36.8+ MB
---


In [560]:
# train_data.info()
print('---')
test_data.info()

---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Columns: 120 entries, Set ID to Dispenser_num
dtypes: float64(94), int64(16), object(10)
memory usage: 15.9+ MB


---

## Modeling

In [561]:
# 각 변수별로 결측값이 존재하는지 확인하는 코드
missing_values = train_data.isnull().sum()

# 결측값이 존재하는 변수와 그 개수 출력
missing_values = missing_values[missing_values > 0]
print(missing_values)

# 결측값이 존재하는 변수명을 리스트에 담기
missing_columns = missing_values.index.tolist()
# print("결측값이 존재하는 변수명:", missing_columns)

HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam      29213
WorkMode Collect Result_Dam                                24059
GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave       29213
GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave          29213
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1    29213
WorkMode Collect Result_Fill1                              24059
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2    29213
WorkMode Collect Result_Fill2                              24059
dtype: int64


In [562]:
# 결측값이 존재하는 변수들의 value_counts 계산
for column in missing_columns:
    print(f"Column: {column}")
    print(train_data[column].value_counts())
    print("\n")

Column: HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam
OK    11293
Name: HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam, dtype: int64


Column: WorkMode Collect Result_Dam
7.0    16447
Name: WorkMode Collect Result_Dam, dtype: int64


Column: GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave
OK    11293
Name: GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave, dtype: int64


Column: GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave
OK    11293
Name: GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave, dtype: int64


Column: HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1
OK    11293
Name: HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1, dtype: int64


Column: WorkMode Collect Result_Fill1
7.0    16447
Name: WorkMode Collect Result_Fill1, dtype: int64


Column: HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2
OK    11293
Name: HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2, dtype: int64


Column: WorkMode Collect Result_Fill2
0.0    16447

In [563]:
# "OK" 값을 가진 변수를 찾고 제거
columns_to_drop = [column for column in missing_columns if train_data[column].apply(lambda x: np.any(x == "OK")).any()]

# 드롭할 변수명 출력 (한 줄에 2개씩)
print("드롭할 변수명:")
for i in range(0, len(columns_to_drop), 2):
    print(columns_to_drop[i:i+2])

# 변수 드롭
train_data = train_data.drop(columns=columns_to_drop)
test_data = test_data.drop(columns=columns_to_drop)

드롭할 변수명:
['HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam', 'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave']
['GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1']
['HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2']


In [564]:
# 각 변수별로 결측값이 존재하는지 확인하는 코드
missing_values = train_data.isnull().sum()

# 결측값이 존재하는 변수와 그 개수 출력
missing_values = missing_values[missing_values > 0]
print(missing_values)

# 결측값이 존재하는 변수명을 리스트에 담기
missing_columns = missing_values.index.tolist()
# print("결측값이 존재하는 변수명:", missing_columns)

WorkMode Collect Result_Dam      24059
WorkMode Collect Result_Fill1    24059
WorkMode Collect Result_Fill2    24059
dtype: int64


In [565]:
# 그룹화할 변수들
groupby_columns = [
    "WorkMode Collect Result_Dam"
    , "WorkMode Collect Result_Fill2"
    ]

# 그룹화하여 target 변수 값의 비율을 계산
grouped = train_data.groupby(groupby_columns)["target"].value_counts(normalize=True).unstack().fillna(0)

# 결과 출력
print(grouped)

target                                                     AbNormal    Normal
WorkMode Collect Result_Dam WorkMode Collect Result_Fill2                    
7.0                         0.0                            0.073326  0.926674


In [566]:
# WorkMode Collect Result_Dam의 이름을 WorkMode Collect Result로 변경
train_data = train_data.rename(columns={'WorkMode Collect Result_Dam': 'WorkMode Collect Result'})
test_data = test_data.rename(columns={'WorkMode Collect Result_Dam': 'WorkMode Collect Result'})

# WorkMode Collect Result_Fill1, WorkMode Collect Result_Fill2 열 드롭
train_data = train_data.drop(columns=['WorkMode Collect Result_Fill1', 'WorkMode Collect Result_Fill2'])
test_data = test_data.drop(columns=['WorkMode Collect Result_Fill1', 'WorkMode Collect Result_Fill2'])

In [567]:
# WorkMode Collect Result 열의 값이 7인 행을 1로 변경
train_data['WorkMode Collect Result'] = train_data['WorkMode Collect Result'].replace(7, 1)
test_data['WorkMode Collect Result'] = test_data['WorkMode Collect Result'].replace(7, 1)

# WorkMode Collect Result 열의 결측값을 0으로 채움
train_data['WorkMode Collect Result'] = train_data['WorkMode Collect Result'].fillna(0)
test_data['WorkMode Collect Result'] = test_data['WorkMode Collect Result'].fillna(0)

In [568]:
test_data['WorkMode Collect Result'].value_counts()

0.0    10349
1.0     7012
Name: WorkMode Collect Result, dtype: int64

In [569]:
# 각 변수별로 결측값이 존재하는지 확인하는 코드
missing_values = train_data.isnull().sum()

# 결측값이 존재하는 변수와 그 개수 출력
missing_values = missing_values[missing_values > 0]
print(missing_values)

# 결측값이 존재하는 변수명을 리스트에 담기
missing_columns = missing_values.index.tolist()
print("결측값이 존재하는 변수명:", missing_columns)

Series([], dtype: int64)
결측값이 존재하는 변수명: []


In [570]:
# 각 변수별로 결측값이 존재하는지 확인하는 코드
missing_values = test_data.isnull().sum()

# 결측값이 존재하는 변수와 그 개수 출력
missing_values = missing_values[missing_values > 0]
print(missing_values)

# 결측값이 존재하는 변수명을 리스트에 담기
missing_columns = missing_values.index.tolist()
print("결측값이 존재하는 변수명:", missing_columns)

target    17361
dtype: int64
결측값이 존재하는 변수명: ['target']


### 문자형(object) 변수 -> 수치형 변환


In [571]:
train_data['target'].value_counts()

Normal      38156
AbNormal     2350
Name: target, dtype: int64

In [572]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 112 entries, Model.Suffix to Dispenser_num
dtypes: float64(56), int64(51), object(5)
memory usage: 34.6+ MB


In [573]:
print(test_data['target'].dtype)

float64


### 타겟 인코딩

In [574]:
# 'target' 열의 변수 타입을 object로 변경
# -> test 데이터는 float64 타입으로 되어있음 
test_data['target'] = test_data['target'].astype('object')

# object 타입의 변수 출력
train_object_columns = train_data.select_dtypes(include=['object']).columns
test_object_columns = test_data.select_dtypes(include=['object']).columns

print(train_object_columns, f" train_object_columns 갯수 : {len(train_object_columns)}")
print(test_object_columns, f" test_object_columns 갯수 : {len(test_object_columns)}")

# 각 object 변수의 고유 값 개수 출력
print("\nTrain Data:")
for col in train_object_columns:
    unique_count = train_data[col].nunique()
    print(f"{col} unique 값 갯수: {unique_count}")

print("\nTest Data:")
for col in test_object_columns:
    unique_count = test_data[col].nunique()
    print(f"{col} unique 값 갯수: {unique_count}")

Index(['Model.Suffix', 'Workorder', 'Chamber Temp. Judge Value_AutoClave',
       'target', 'Dispenser_num'],
      dtype='object')  train_object_columns 갯수 : 5
Index(['Set ID', 'Model.Suffix', 'Workorder',
       'Chamber Temp. Judge Value_AutoClave', 'target', 'Dispenser_num'],
      dtype='object')  test_object_columns 갯수 : 6

Train Data:
Model.Suffix unique 값 갯수: 7
Workorder unique 값 갯수: 663
Chamber Temp. Judge Value_AutoClave unique 값 갯수: 2
target unique 값 갯수: 2
Dispenser_num unique 값 갯수: 3

Test Data:
Set ID unique 값 갯수: 17361
Model.Suffix unique 값 갯수: 7
Workorder unique 값 갯수: 662
Chamber Temp. Judge Value_AutoClave unique 값 갯수: 2
target unique 값 갯수: 0
Dispenser_num unique 값 갯수: 3


In [575]:
# 필요한 라이브러리 임포트
import pandas as pd
import category_encoders as ce

# 타겟 변수와 범주형 변수 지정
## Target Encoding의 smoothing 파라미터는 default로 auto로 설정되어 있음
target = 'target'  # 타겟 변수 이름으로 변경
categorical_columns = [
    'Model.Suffix',
    'Workorder',
    'Chamber Temp. Judge Value_AutoClave',
    'Dispenser_num'

]  # 범주형 변수 이름으로 변경

# 타겟 값을 숫자로 변환
target_mapping = {'Normal': 0, 'AbNormal': 1}
train_data[target] = train_data[target].map(target_mapping)
test_data[target] = test_data[target].map(target_mapping)

# 열이 존재하는지 확인
missing_columns = [col for col in categorical_columns if col not in train_data.columns]
if missing_columns:
    raise ValueError(f"train_data에 다음 열이 존재하지 않습니다: {missing_columns}")

# 타겟 인코더 생성 및 학습
encoder = ce.TargetEncoder(cols=categorical_columns)
train_data = encoder.fit_transform(train_data, train_data[target])

# Set ID 열을 별도로 저장
set_id = test_data['Set ID']

# 테스트 데이터 인코딩 (Set ID 열 제외)
test_data = test_data.drop(columns=['Set ID'])
test_data = encoder.transform(test_data)

# Set ID 열을 맨 앞에 추가
test_data.insert(0, 'Set ID', set_id)

# categorical_columns에 해당하는 열의 데이터 값만 확인
print(train_data[categorical_columns].head(3))
print(test_data[categorical_columns].head(3))

# 역 매핑 딕셔너리 생성
reverse_target_mapping = {v: k for k, v in target_mapping.items()}

# 타겟 값을 원래대로 변환
train_data[target] = train_data[target].map(reverse_target_mapping)
test_data[target] = test_data[target].map(reverse_target_mapping)

print("--- train_data ---")

# 변환된 타겟 값 확인
print(train_data[[target]].value_counts())

   Model.Suffix  Workorder  Chamber Temp. Judge Value_AutoClave  Dispenser_num
0      0.049336   0.158385                             0.058361       0.058614
1      0.049336   0.015314                             0.058361       0.058614
2      0.056712   0.009534                             0.058361       0.054977
   Model.Suffix  Workorder  Chamber Temp. Judge Value_AutoClave  Dispenser_num
0      0.056712   0.091912                             0.058361       0.054977
1      0.056712   0.024247                             0.058361       0.054977
2      0.056712   0.091463                             0.058361       0.058614
--- train_data ---
target  
Normal      38156
AbNormal     2350
dtype: int64


In [576]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 112 entries, Model.Suffix to Dispenser_num
dtypes: float64(60), int64(51), object(1)
memory usage: 34.6+ MB


In [577]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Columns: 113 entries, Set ID to Dispenser_num
dtypes: float64(95), int64(16), object(2)
memory usage: 15.0+ MB


### 데이터 분할

In [578]:
from sklearn.model_selection import train_test_split
import pandas as pd

# '_Dam', '_AutoClave', '_Fill1', '_Fill2'를 포함하는 열 이름 필터링 함수
def filter_columns(df, is_test=False):
    dam_columns = df.filter(like='_Dam').columns.tolist()
    autoclave_columns = df.filter(like='_AutoClave').columns.tolist()
    fill1_columns = df.filter(like='_Fill1').columns.tolist()
    fill2_columns = df.filter(like='_Fill2').columns.tolist()

    # 추가적으로 불러올 열 이름
    additional_cols = ['Model.Suffix', 'Workorder', 'target', 'Dispenser_num']
    if is_test:
        additional_cols.append('Set ID')

    # 각 그룹별로 새로운 데이터프레임 생성
    df_dam = df[dam_columns + additional_cols]
    df_autoclave = df[autoclave_columns + additional_cols]
    df_fill1 = df[fill1_columns + additional_cols]
    df_fill2 = df[fill2_columns + additional_cols]

    return df_dam, df_autoclave, df_fill1, df_fill2

# train_data에 적용
df_train_dam, df_train_autoclave, df_train_fill1, df_train_fill2 = filter_columns(train_data)

# test_data에 적용
df_test_dam, df_test_autoclave, df_test_fill1, df_test_fill2 = filter_columns(test_data, is_test=True)

# 데이터 분할 및 통계 출력 함수
def split_and_print_stats(df, name):
    df_train, df_val = train_test_split(
        df,
        test_size=0.2,
        stratify=df["target"],
        random_state=RANDOM_STATE,
    )

    def print_stats(df: pd.DataFrame):
        num_normal = len(df[df["target"] == "Normal"])
        num_abnormal = len(df[df["target"] == "AbNormal"])

        print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {num_abnormal/num_normal}")

    # 통계 출력
    print(f"{name} Columns:")
    print(f"  \tAbnormal\tNormal")
    print_stats(df_train)
    print_stats(df_val)

    return df_train, df_val

# 각 데이터프레임에 대해 데이터 분할 및 통계 출력
df_train_dam_split, df_val_dam_split = split_and_print_stats(df_train_dam, "Dam")
df_train_autoclave_split, df_val_autoclave_split = split_and_print_stats(df_train_autoclave, "AutoClave")
df_train_fill1_split, df_val_fill1_split = split_and_print_stats(df_train_fill1, "Fill1")
df_train_fill2_split, df_val_fill2_split = split_and_print_stats(df_train_fill2, "Fill2")

Dam Columns:
  	Abnormal	Normal
  Total: Normal: 30524, AbNormal: 1880 ratio: 0.06159087930808544
  Total: Normal: 7632, AbNormal: 470 ratio: 0.061582809224318656
AutoClave Columns:
  	Abnormal	Normal
  Total: Normal: 30524, AbNormal: 1880 ratio: 0.06159087930808544
  Total: Normal: 7632, AbNormal: 470 ratio: 0.061582809224318656
Fill1 Columns:
  	Abnormal	Normal
  Total: Normal: 30524, AbNormal: 1880 ratio: 0.06159087930808544
  Total: Normal: 7632, AbNormal: 470 ratio: 0.061582809224318656
Fill2 Columns:
  	Abnormal	Normal
  Total: Normal: 30524, AbNormal: 1880 ratio: 0.06159087930808544
  Total: Normal: 7632, AbNormal: 470 ratio: 0.061582809224318656


## 3. 모델 학습

In [579]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])
    weighted_F1 = f1_score(y_test, y_pred, average='weighted')

    metrics = pd.DataFrame({
        '정확도': [accuracy],
        '정밀도': [precision],
        '재현율': [recall],
        'F1 Score': [F1],
        'Weighted F1': [weighted_F1]
    })

    confusion_df = pd.DataFrame(confusion, index=['True', 'False'], columns=['True', 'False'])

    print("\n오차행렬:")
    display(confusion_df)
    print("평가 지표:")
    display(metrics)

### 모델 정의

In [580]:
model_Dam = RandomForestClassifier(random_state=RANDOM_STATE)
model_AutoClave = RandomForestClassifier(random_state=RANDOM_STATE)
model_Fill1 = RandomForestClassifier(random_state=RANDOM_STATE)
model_Fill2 = RandomForestClassifier(random_state=RANDOM_STATE)

### 모델 학습


In [581]:
df_train_dam_split.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404 entries, 24811 to 12593
Data columns (total 61 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   CURE END POSITION X Collect Result_Dam                    32404 non-null  int64  
 1   CURE END POSITION Z Collect Result_Dam                    32404 non-null  float64
 2   CURE END POSITION Θ Collect Result_Dam                    32404 non-null  int64  
 3   CURE SPEED Collect Result_Dam                             32404 non-null  int64  
 4   CURE START POSITION X Collect Result_Dam                  32404 non-null  int64  
 5   CURE START POSITION Θ Collect Result_Dam                  32404 non-null  int64  
 6   DISCHARGED SPEED OF RESIN Collect Result_Dam              32404 non-null  int64  
 7   DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam       32404 non-null  float64
 8   DISCHARGED T

In [582]:
df_train_dam.columns.tolist()

['CURE END POSITION X Collect Result_Dam',
 'CURE END POSITION Z Collect Result_Dam',
 'CURE END POSITION Θ Collect Result_Dam',
 'CURE SPEED Collect Result_Dam',
 'CURE START POSITION X Collect Result_Dam',
 'CURE START POSITION Θ Collect Result_Dam',
 'DISCHARGED SPEED OF RESIN Collect Result_Dam',
 'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam',
 'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam',
 'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
 'Dispense Volume(Stage1) Collect Result_Dam',
 'Dispense Volume(Stage2) Collect Result_Dam',
 'Dispense Volume(Stage3) Collect Result_Dam',
 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
 'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
 'HEAD NORMAL COORDINAT

In [583]:
df_train_dam_split, df_val_dam_split = split_and_print_stats(df_train_dam, "Dam")
df_train_autoclave_split, df_val_autoclave_split = split_and_print_stats(df_train_autoclave, "AutoClave")
df_train_fill1_split, df_val_fill1_split = split_and_print_stats(df_train_fill1, "Fill1")
df_train_fill2_split, df_val_fill2_split = split_and_print_stats(df_train_fill2, "Fill2")

Dam Columns:
  	Abnormal	Normal
  Total: Normal: 30524, AbNormal: 1880 ratio: 0.06159087930808544
  Total: Normal: 7632, AbNormal: 470 ratio: 0.061582809224318656
AutoClave Columns:
  	Abnormal	Normal
  Total: Normal: 30524, AbNormal: 1880 ratio: 0.06159087930808544
  Total: Normal: 7632, AbNormal: 470 ratio: 0.061582809224318656
Fill1 Columns:
  	Abnormal	Normal
  Total: Normal: 30524, AbNormal: 1880 ratio: 0.06159087930808544
  Total: Normal: 7632, AbNormal: 470 ratio: 0.061582809224318656
Fill2 Columns:
  	Abnormal	Normal
  Total: Normal: 30524, AbNormal: 1880 ratio: 0.06159087930808544
  Total: Normal: 7632, AbNormal: 470 ratio: 0.061582809224318656


In [584]:
# df_train 데이터로 학습
features = []

for col in df_train_dam_split.columns:
    try:
        df_train_dam_split[col] = df_train_dam_split[col].astype(int)
        features.append(col)
    except:
        continue

train_x = df_train_dam_split[features]
train_y = df_train_dam_split["target"]

model_Dam.fit(train_x, train_y)

RandomForestClassifier(random_state=110)

In [585]:
# df_train 데이터로 학습
features = []

for col in df_train_autoclave_split.columns:
    try:
        df_train_autoclave_split[col] = df_train_autoclave_split[col].astype(int)
        features.append(col)
    except:
        continue

train_x = df_train_autoclave_split[features]
train_y = df_train_autoclave_split["target"]

model_AutoClave.fit(train_x, train_y)

RandomForestClassifier(random_state=110)

In [586]:
# df_train 데이터로 학습
features = []

for col in df_train_fill1_split.columns:
    try:
        df_train_fill1_split[col] = df_train_fill1_split[col].astype(int)
        features.append(col)
    except:
        continue

train_x = df_train_fill1_split[features]
train_y = df_train_fill1_split["target"]

model_Fill1.fit(train_x, train_y)

RandomForestClassifier(random_state=110)

In [588]:
# df_train 데이터로 학습
features = []

for col in df_train_fill2_split.columns:
    try:
        df_train_fill2_split[col] = df_train_fill2_split[col].astype(int)
        features.append(col)
    except:
        continue

train_x = df_train_fill2_split[features]
train_y = df_train_fill2_split["target"]

model_Fill2.fit(train_x, train_y)

RandomForestClassifier(random_state=110)

In [602]:
# df_val에서 y_val과 x_val 추출
y_val = df_val_dam_split['target'] 
x_val = df_val_dam_split.drop(columns=['target']) 

# y_val 레이블을 정수로 변환
y_val_int = [1 if label == 'AbNormal' else 0 for label in y_val]

# 확률 예측
soft_voting_probs = model_Dam.predict_proba(x_val)[:, 1]

# 스레시홀드 적용(일반 모델의 경우 default로 0.5)
soft_voting_preds = [1 if prob > 0.7 else 0 for prob in soft_voting_probs]

# 평가
get_clf_eval(y_val_int, soft_voting_preds)


오차행렬:


Unnamed: 0,True,False
True,427,43
False,7310,322


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.092446,0.055189,0.908511,0.104058,0.081895


In [600]:
# df_val에서 y_val과 x_val 추출
y_val = df_val_autoclave_split['target'] 
x_val = df_val_autoclave_split.drop(columns=['target']) 

# y_val 레이블을 정수로 변환
y_val_int = [1 if label == 'AbNormal' else 0 for label in y_val]

# 확률 예측
soft_voting_probs = model_AutoClave.predict_proba(x_val)[:, 1]

# 스레시홀드 적용(일반 모델의 경우 default로 0.5)
soft_voting_preds = [1 if prob > 0.9 else 0 for prob in soft_voting_probs]

# 평가
get_clf_eval(y_val_int, soft_voting_preds)


오차행렬:


Unnamed: 0,True,False
True,413,57
False,7165,467


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.108615,0.0545,0.878723,0.102634,0.113828


In [601]:
# df_val에서 y_val과 x_val 추출
y_val = df_val_fill1_split['target'] 
x_val = df_val_fill1_split.drop(columns=['target']) 

# y_val 레이블을 정수로 변환
y_val_int = [1 if label == 'AbNormal' else 0 for label in y_val]

# 확률 예측
soft_voting_probs = model_Fill1.predict_proba(x_val)[:, 1]

# 스레시홀드 적용(일반 모델의 경우 default로 0.5)
soft_voting_preds = [1 if prob > 0.7 else 0 for prob in soft_voting_probs]

# 평가
get_clf_eval(y_val_int, soft_voting_preds)


오차행렬:


Unnamed: 0,True,False
True,405,65
False,6770,862


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.156381,0.056446,0.861702,0.105952,0.195887


In [607]:
# df_val에서 y_val과 x_val 추출
y_val = df_val_fill2_split['target'] 
x_val = df_val_fill2_split.drop(columns=['target']) 

# y_val 레이블을 정수로 변환
y_val_int = [1 if label == 'AbNormal' else 0 for label in y_val]

# 확률 예측
soft_voting_probs = model_Fill2.predict_proba(x_val)[:, 1]

# 스레시홀드 적용(일반 모델의 경우 default로 0.5)
soft_voting_preds = [1 if prob > 0.7 else 0 for prob in soft_voting_probs]

# 평가
get_clf_eval(y_val_int, soft_voting_preds)


오차행렬:


Unnamed: 0,True,False
True,422,48
False,7095,537


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.118366,0.056139,0.897872,0.105672,0.129252


분할한 데이터 -> 원래의 데이터(train data)로 학습한 새 모델 생성

In [608]:
# train_data 데이터로 학습
features = []

for col in df_train_dam_split.columns:
    try:
        df_train_dam_split[col] = df_train_dam_split[col].astype(int)
        features.append(col)
    except:
        continue

train_x = df_train_dam_split[features]
train_y = df_train_dam_split["target"]

model_Dam.fit(train_x, train_y)

RandomForestClassifier(random_state=110)

In [609]:
# train_data 데이터로 학습
features = []

for col in df_train_autoclave_split.columns:
    try:
        df_train_autoclave_split[col] = df_train_autoclave_split[col].astype(int)
        features.append(col)
    except:
        continue

train_x = df_train_autoclave_split[features]
train_y = df_train_autoclave_split["target"]

model_Dam.fit(train_x, train_y)

RandomForestClassifier(random_state=110)

In [610]:
# train_data 데이터로 학습
features = []

for col in df_train_fill1_split.columns:
    try:
        df_train_fill1_split[col] = df_train_fill1_split[col].astype(int)
        features.append(col)
    except:
        continue

train_x = df_train_fill1_split[features]
train_y = df_train_fill1_split["target"]

model_Dam.fit(train_x, train_y)

RandomForestClassifier(random_state=110)

In [611]:
# train_data 데이터로 학습
features = []

for col in df_train_fill2_split.columns:
    try:
        df_train_fill2_split[col] = df_train_fill2_split[col].astype(int)
        features.append(col)
    except:
        continue

train_x = df_train_fill2_split[features]
train_y = df_train_fill2_split["target"]

model_Dam.fit(train_x, train_y)

RandomForestClassifier(random_state=110)

In [632]:
# 예측에 필요한 데이터 분리
x_test = df_test_dam.drop(["target", "Set ID"], axis=1)

# 피처 이름 일치 확인 및 수정
expected_features = model_Dam.feature_names_in_
x_test = x_test.reindex(columns=expected_features, fill_value=0)

# 변경된 스레시홀드 적용하여 테스트 데이터에 대한 예측
soft_voting_preds_dam = model_Dam.predict_proba(x_test)[:, 1]
soft_voting_preds_dam = [1 if prob > 0.7 else 0 for prob in soft_voting_preds_dam]

# 테스트 데이터에서 True로 예측된 개수 출력
print(sum(soft_voting_preds_dam))

17361


In [623]:
# 예측에 필요한 데이터 분리
x_test = df_test_autoclave.drop(["target", "Set ID"], axis=1)

# 피처 이름 일치 확인 및 수정
expected_features = model_AutoClave.feature_names_in_
x_test = x_test.reindex(columns=expected_features, fill_value=0)

# 변경된 스레시홀드 적용하여 테스트 데이터에 대한 예측
soft_voting_preds_autoclave = model_AutoClave.predict_proba(x_test)[:, 1]
soft_voting_preds_autoclave = [1 if prob > 0.95 else 0 for prob in soft_voting_preds_autoclave]

# 테스트 데이터에서 True로 예측된 개수 출력
print(sum(soft_voting_preds_autoclave))

8645


In [614]:
# 예측에 필요한 데이터 분리
x_test = df_test_fill1.drop(["target", "Set ID"], axis=1)

# 피처 이름 일치 확인 및 수정
expected_features = model_Fill1.feature_names_in_
x_test = x_test.reindex(columns=expected_features, fill_value=0)

# 변경된 스레시홀드 적용하여 테스트 데이터에 대한 예측
soft_voting_preds_Fill1 = model_Fill1.predict_proba(x_test)[:, 1]
soft_voting_preds_Fill1 = [1 if prob > 0.7 else 0 for prob in soft_voting_preds_Fill1]

# 테스트 데이터에서 True로 예측된 개수 출력
print(sum(soft_voting_preds_Fill1))

15377


In [615]:
# 예측에 필요한 데이터 분리
x_test = df_test_fill2.drop(["target", "Set ID"], axis=1)

# 피처 이름 일치 확인 및 수정
expected_features = model_Fill2.feature_names_in_
x_test = x_test.reindex(columns=expected_features, fill_value=0)

# 변경된 스레시홀드 적용하여 테스트 데이터에 대한 예측
soft_voting_preds_Fill2 = model_Fill2.predict_proba(x_test)[:, 1]
soft_voting_preds_Fill2 = [1 if prob > 0.7 else 0 for prob in soft_voting_preds_Fill2]

# 테스트 데이터에서 True로 예측된 개수 출력
print(sum(soft_voting_preds_Fill2))

16130


## 4. 제출하기


### 제출 파일 작성


In [616]:
import numpy as np

pred = [1 if any(x) else 0 for x in zip(soft_voting_preds_dam, 
                                        soft_voting_preds_autoclave, 
                                        soft_voting_preds_Fill1, 
                                        soft_voting_preds_Fill2)]
df_sub = pd.read_csv("submission.csv")

# pred 값을 df_sub["target"]에 할당
df_sub["target"] = pred

# df_sub['target'] 값을 문자열 레이블로 변환
df_sub['target'] = df_sub['target'].apply(lambda x: 'AbNormal' if x == 1 else 'Normal')

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

In [617]:
df_sub['target'].value_counts()

AbNormal    17361
Name: target, dtype: int64

In [618]:
df_sub.head(10)

Unnamed: 0,Set ID,target
0,0001be084fbc4aaa9d921f39e595961b,AbNormal
1,0005bbd180064abd99e63f9ed3e1ac80,AbNormal
2,000948934c4140d883d670adcb609584,AbNormal
3,000a6bfd02874c6296dc7b2e9c5678a7,AbNormal
4,0018e78ce91343678716e2ea27a51c95,AbNormal
5,001fda4596f545d0a3b0ce85fbea77d2,AbNormal
6,0020734a7b29472298358ad58645a0c9,AbNormal
7,00234c5914cd4c4a888d13f8b3773135,AbNormal
8,00297b6c93e44d49ac534758a23dc74e,AbNormal
9,002d904240d84b188d410d16383a9c3a,AbNormal


**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**
