In [1]:
import os
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
RANDOM_STATE = 110

train_data = pd.read_csv("C:/Users/KimDongyoung/Desktop/git_LGaimers5/Lg_aimers5/data/train_data.csv")
test_data = pd.read_csv("C:/Users/KimDongyoung/Desktop/git_LGaimers5/Lg_aimers5/data/test_data.csv")

In [4]:
train_data['Equipment_same_num'].value_counts()

Equipment_same_num
1    40472
0       34
Name: count, dtype: int64

In [5]:
test_data

Unnamed: 0,Set ID,Model.Suffix,Workorder,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2,HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target,Equipment_same_num
0,0001be084fbc4aaa9d921f39e595961b,AJX75334501,3J1XF767-1,1000.0,12.5,90,70,280,90,10,...,243.5,243.5,85.0,19.8,13.0,195.0,1.0,0.0,,1
1,0005bbd180064abd99e63f9ed3e1ac80,AJX75334501,4B1XD472-2,1000.0,12.5,90,70,280,90,16,...,243.5,243.5,85.0,19.8,14.0,256.0,1.0,,,1
2,000948934c4140d883d670adcb609584,AJX75334501,3H1XE355-1,240.0,2.5,-90,70,1030,-90,10,...,243.7,243.7,85.0,19.7,1.0,98.0,1.0,0.0,,1
3,000a6bfd02874c6296dc7b2e9c5678a7,AJX75334501,3L1XA128-1,1000.0,12.5,90,70,280,90,10,...,243.5,243.5,85.0,20.0,14.0,0.0,1.0,,,1
4,0018e78ce91343678716e2ea27a51c95,AJX75334501,4A1XA639-1,240.0,2.5,-90,70,1030,-90,16,...,243.7,243.7,85.0,19.8,1.0,215.0,1.0,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,ffea508b59934d689b540f95eb3fa730,AJX75334501,3K1XB597-1,1000.0,12.5,90,70,280,90,10,...,243.5,243.5,85.0,19.5,14.0,131.0,1.0,,,1
17357,ffed8923c8a448a98afc641b770be153,AJX75334501,4A1XB974-1,1000.0,12.5,90,70,280,90,16,...,243.5,243.5,85.0,19.8,12.0,279.0,1.0,,,1
17358,fff1e73734da40adbe805359b3efb462,AJX75334501,3L1XA998-1,240.0,2.5,-90,70,1030,-90,16,...,243.7,243.7,85.0,20.5,4.0,66.0,1.0,,,1
17359,fff8e38bdd09470baf95f71e92075dec,AJX75334501,3F1XC376-1,240.0,2.5,-90,70,1030,-90,10,...,243.7,243.7,85.0,18.9,1.0,117.0,1.0,0.0,,1


In [6]:
# 변수명 변경
train_data = train_data.rename(columns={'1st Pressure 1st Pressure Unit Time_AutoClave': '1st Pressure Unit Time_AutoClave'})
test_data = test_data.rename(columns={'1st Pressure 1st Pressure Unit Time_AutoClave': '1st Pressure Unit Time_AutoClave'})

In [7]:
train_data['2nd_Pressure_Greater_Than_0.6'] = train_data['2nd Pressure Collect Result_AutoClave'].apply(lambda x: 1 if x >= 0.6 else 0)
test_data['2nd_Pressure_Greater_Than_0.6'] = test_data['2nd Pressure Collect Result_AutoClave'].apply(lambda x: 1 if x >= 0.6 else 0)

In [8]:
train_data['3rd_Pressure_Greater_Than_0.7'] = train_data['3rd Pressure Collect Result_AutoClave'].apply(lambda x: 1 if x >= 0.7 else 0)
test_data['3rd_Pressure_Greater_Than_0.7'] = test_data['3rd Pressure Collect Result_AutoClave'].apply(lambda x: 1 if x >= 0.7 else 0)

In [9]:

# 각 변수별로 결측값이 존재하는지 확인하는 코드
missing_values = train_data.isnull().sum()

# 결측값이 존재하는 변수와 그 개수 출력
missing_values = missing_values[missing_values > 0]
print(missing_values)

# 결측값이 존재하는 변수명을 리스트에 담기
missing_columns = missing_values.index.tolist()
# print("결측값이 존재하는 변수명:", missing_columns)

HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam      29213
WorkMode Collect Result_Dam                                24059
GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave       29213
GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave          29213
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1    29213
WorkMode Collect Result_Fill1                              24059
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2    29213
WorkMode Collect Result_Fill2                              24059
dtype: int64


In [10]:

# 결측값이 존재하는 변수들의 value_counts 계산
for column in missing_columns:
    print(f"Column: {column}")
    print(train_data[column].value_counts())
    print("\n")

Column: HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam
OK    11293
Name: count, dtype: int64


Column: WorkMode Collect Result_Dam
WorkMode Collect Result_Dam
7.0    16447
Name: count, dtype: int64


Column: GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave
GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave
OK    11293
Name: count, dtype: int64


Column: GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave
GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave
OK    11293
Name: count, dtype: int64


Column: HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1
OK    11293
Name: count, dtype: int64


Column: WorkMode Collect Result_Fill1
WorkMode Collect Result_Fill1
7.0    16447
Name: count, dtype: int64


Column: HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2
OK    11293
Name: count, dtype: int64


Column: 

In [11]:

# "OK" 값을 가진 변수를 찾고 제거
columns_to_drop = [column for column in missing_columns if train_data[column].apply(lambda x: np.any(x == "OK")).any()]

# 드롭할 변수명 출력 (한 줄에 2개씩)
print("드롭할 변수명:")
for i in range(0, len(columns_to_drop), 2):
    print(columns_to_drop[i:i+2])

# 변수 드롭
train_data = train_data.drop(columns=columns_to_drop)
test_data = test_data.drop(columns=columns_to_drop)

드롭할 변수명:
['HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam', 'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave']
['GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1']
['HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2']


In [12]:

# 각 변수별로 결측값이 존재하는지 확인하는 코드
missing_values = train_data.isnull().sum()

# 결측값이 존재하는 변수와 그 개수 출력
missing_values = missing_values[missing_values > 0]
print(missing_values)

# 결측값이 존재하는 변수명을 리스트에 담기
missing_columns = missing_values.index.tolist()
# print("결측값이 존재하는 변수명:", missing_columns)

WorkMode Collect Result_Dam      24059
WorkMode Collect Result_Fill1    24059
WorkMode Collect Result_Fill2    24059
dtype: int64


In [13]:
# 그룹화할 변수들
groupby_columns = [
    "WorkMode Collect Result_Dam"
    , "WorkMode Collect Result_Fill1"
    , "WorkMode Collect Result_Fill2"
    ]

# 그룹화하여 target 변수 값의 비율을 계산
grouped = train_data.groupby(groupby_columns)["target"].value_counts(normalize=True).unstack().fillna(0)

# 결과 출력
print(grouped)

target                                                                                   AbNormal  \
WorkMode Collect Result_Dam WorkMode Collect Result_Fill1 WorkMode Collect Result_Fill2             
7.0                         7.0                           0.0                            0.073326   

target                                                                                     Normal  
WorkMode Collect Result_Dam WorkMode Collect Result_Fill1 WorkMode Collect Result_Fill2            
7.0                         7.0                           0.0                            0.926674  


In [14]:
# WorkMode Collect Result_Dam의 이름을 WorkMode Collect Result로 변경
train_data = train_data.rename(columns={'WorkMode Collect Result_Dam': 'WorkMode Collect Result'})
test_data = test_data.rename(columns={'WorkMode Collect Result_Dam': 'WorkMode Collect Result'})

# WorkMode Collect Result_Fill1, WorkMode Collect Result_Fill2 열 드롭
train_data = train_data.drop(columns=['WorkMode Collect Result_Fill1', 'WorkMode Collect Result_Fill2'])
test_data = test_data.drop(columns=['WorkMode Collect Result_Fill1', 'WorkMode Collect Result_Fill2'])

In [15]:
# WorkMode Collect Result 열의 값이 7인 행을 1로 변경
train_data['WorkMode Collect Result'] = train_data['WorkMode Collect Result'].replace(7, 1)
test_data['WorkMode Collect Result'] = test_data['WorkMode Collect Result'].replace(7, 1)

# WorkMode Collect Result 열의 결측값을 0으로 채움
train_data['WorkMode Collect Result'] = train_data['WorkMode Collect Result'].fillna(0)
test_data['WorkMode Collect Result'] = test_data['WorkMode Collect Result'].fillna(0)

In [16]:
test_data['WorkMode Collect Result'].value_counts()

WorkMode Collect Result
0.0    10349
1.0     7012
Name: count, dtype: int64

In [17]:
# 각 변수별로 결측값이 존재하는지 확인하는 코드
missing_values = train_data.isnull().sum()

# 결측값이 존재하는 변수와 그 개수 출력
missing_values = missing_values[missing_values > 0]
print(missing_values)

# 결측값이 존재하는 변수명을 리스트에 담기
missing_columns = missing_values.index.tolist()
print("결측값이 존재하는 변수명:", missing_columns)


Series([], dtype: int64)
결측값이 존재하는 변수명: []


In [18]:

# 각 변수별로 결측값이 존재하는지 확인하는 코드
missing_values = test_data.isnull().sum()

# 결측값이 존재하는 변수와 그 개수 출력
missing_values = missing_values[missing_values > 0]
print(missing_values)

# 결측값이 존재하는 변수명을 리스트에 담기
missing_columns = missing_values.index.tolist()
print("결측값이 존재하는 변수명:", missing_columns)

target    17361
dtype: int64
결측값이 존재하는 변수명: ['target']


In [19]:
# 'target' 열의 변수 타입을 object로 변경
# -> test 데이터는 float64 타입으로 되어있음 
test_data['target'] = test_data['target'].astype('object')

# object 타입의 변수 출력
train_object_columns = train_data.select_dtypes(include=['object']).columns
test_object_columns = test_data.select_dtypes(include=['object']).columns

print(train_object_columns, f" train_object_columns 갯수 : {len(train_object_columns)}")
print(test_object_columns, f" test_object_columns 갯수 : {len(test_object_columns)}")

# 각 object 변수의 고유 값 개수 출력
print("\nTrain Data:")
for col in train_object_columns:
    unique_count = train_data[col].nunique()
    print(f"{col} unique 값 갯수: {unique_count}")

print("\nTest Data:")
for col in test_object_columns:
    unique_count = test_data[col].nunique()
    print(f"{col} unique 값 갯수: {unique_count}")

Index(['Model.Suffix', 'Workorder', 'Chamber Temp. Judge Value_AutoClave',
       'target'],
      dtype='object')  train_object_columns 갯수 : 4
Index(['Set ID', 'Model.Suffix', 'Workorder',
       'Chamber Temp. Judge Value_AutoClave', 'target'],
      dtype='object')  test_object_columns 갯수 : 5

Train Data:
Model.Suffix unique 값 갯수: 7
Workorder unique 값 갯수: 663
Chamber Temp. Judge Value_AutoClave unique 값 갯수: 2
target unique 값 갯수: 2

Test Data:
Set ID unique 값 갯수: 17361
Model.Suffix unique 값 갯수: 7
Workorder unique 값 갯수: 662
Chamber Temp. Judge Value_AutoClave unique 값 갯수: 2
target unique 값 갯수: 0


In [21]:
# 필요한 라이브러리 임포트
import pandas as pd
import category_encoders as ce

# 타겟 변수와 범주형 변수 지정
## Target Encoding의 smoothing 파라미터는 default로 auto로 설정되어 있음
target = 'target'  # 타겟 변수 이름으로 변경
categorical_columns = [
    'Model.Suffix',
    'Workorder',
    'Chamber Temp. Judge Value_AutoClave'
]  # 범주형 변수 이름으로 변경

# 타겟 값을 숫자로 변환
target_mapping = {'Normal': 0, 'AbNormal': 1}
train_data[target] = train_data[target].map(target_mapping)
test_data[target] = test_data[target].map(target_mapping)

# 열이 존재하는지 확인
missing_columns = [col for col in categorical_columns if col not in train_data.columns]
if missing_columns:
    raise ValueError(f"train_data에 다음 열이 존재하지 않습니다: {missing_columns}")

# 타겟 인코더 생성 및 학습
encoder = ce.TargetEncoder(cols=categorical_columns)
train_data = encoder.fit_transform(train_data, train_data[target])

# Set ID 열을 별도로 저장
set_id = test_data['Set ID']

# 테스트 데이터 인코딩 (Set ID 열 제외)
test_data = test_data.drop(columns=['Set ID'])
test_data = encoder.transform(test_data)

# Set ID 열을 맨 앞에 추가
test_data.insert(0, 'Set ID', set_id)

# categorical_columns에 해당하는 열의 데이터 값만 확인
print(train_data[categorical_columns].head(3))
print(test_data[categorical_columns].head(3))

# 역 매핑 딕셔너리 생성
reverse_target_mapping = {v: k for k, v in target_mapping.items()}

# 타겟 값을 원래대로 변환
train_data[target] = train_data[target].map(reverse_target_mapping)
test_data[target] = test_data[target].map(reverse_target_mapping)

print("--- train_data ---")

# 변환된 타겟 값 확인
print(train_data[[target]].value_counts())

   Model.Suffix  Workorder  Chamber Temp. Judge Value_AutoClave
0      0.049336   0.158385                             0.058361
1      0.049336   0.015314                             0.058361
2      0.056712   0.009534                             0.058361
   Model.Suffix  Workorder  Chamber Temp. Judge Value_AutoClave
0      0.056712   0.091912                             0.058361
1      0.056712   0.024247                             0.058361
2      0.056712   0.091463                             0.058361
--- train_data ---
target  
Normal      38156
AbNormal     2350
Name: count, dtype: int64


In [23]:
# 필요한 라이브러리 임포트
from imblearn.over_sampling import SMOTE

# SMOTE를 사용하여 오버 샘플링 수행
smote = SMOTE(random_state=RANDOM_STATE)
X_train = train_data.drop(columns=['target'])
y_train = train_data['target']
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# 오버 샘플링된 데이터셋을 DataFrame으로 변환
train_data = pd.concat([pd.DataFrame(X_resampled, columns=X_train.columns), pd.Series(y_resampled, name='target')], axis=1)

In [24]:
train_data['target'].value_counts()

target
Normal      38156
AbNormal    38156
Name: count, dtype: int64

In [25]:
model = RandomForestClassifier(random_state=RANDOM_STATE)

In [26]:
features = []

for col in train_data.columns:
    try:
        train_data[col] = train_data[col].astype(int)
        features.append(col)
    except:
        continue

train_x = train_data[features]
train_y = train_data["target"]

model.fit(train_x, train_y)

In [27]:
# test_data에 존재하는 열만 선택
valid_features = [col for col in features if col in test_data.columns]

df_test_x = test_data.loc[:, valid_features]

for col in df_test_x.columns:
    try:
        df_test_x.loc[:, col] = df_test_x[col].astype(int)
    except:
        continue

In [28]:

# 학습 시 사용된 피처 이름을 저장합니다.
train_features = model.feature_names_in_

# 예측 시 동일한 피처를 사용하도록 데이터프레임을 조정합니다.
df_test_x = df_test_x.reindex(columns=train_features, fill_value=0)

# 예측을 수행합니다.
test_pred = model.predict(df_test_x)
test_pred

array(['Normal', 'Normal', 'Normal', ..., 'AbNormal', 'Normal', 'Normal'],
      dtype=object)

In [29]:

# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'submission.csv'