# preprocessing

In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt 
%matplotlib inline
import matplotlib.gridspec as gridspec
from mpl_toolkits.mplot3d import Axes3D

In [2]:
# Load the trimmed train/test CSV files.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('trim_train_data.csv', 'trim_test_data.csv')
)

In [3]:
# Keep a handle on the label column: dropna(how='all') below would remove it
# from a frame whose 'target' values are all NaN.
target_train, target_test = train_data['target'], test_data['target']

# Drop every column that contains nothing but NaN.
train_data, test_data = (
    frame.dropna(axis=1, how='all') for frame in (train_data, test_data)
)

# Put the label column back on both frames.
train_data['target'] = target_train
test_data['target'] = target_test

### 좌표 관련 변수

In [4]:
# Resin dispensing coordinates: distance/direction feature relative to the standby position.
def calculate_vector(row, stage, process):
    """Combine distance and direction between two head positions into one number.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding the
            'HEAD NORMAL COORDINATE {X,Y,Z} AXIS(Stage{stage}) Collect Result_{process}'
            and 'HEAD Standby Position {X,Y,Z} Collect Result_{process}' values.
        stage: Stage number used in the coordinate column names.
        process: Process suffix used in the column names (e.g. 'Dam').

    Returns:
        distance * (ux + uy + uz), where (ux, uy, uz) is the unit vector from the
        standby position to the stage coordinate; 0 when both points coincide.
    """
    axes = ('X', 'Y', 'Z')
    stage_point = [
        row[f'HEAD NORMAL COORDINATE {axis} AXIS(Stage{stage}) Collect Result_{process}']
        for axis in axes
    ]
    standby_point = [
        row[f'HEAD Standby Position {axis} Collect Result_{process}']
        for axis in axes
    ]

    # Component-wise offset and its Euclidean length.
    dx, dy, dz = (p - q for p, q in zip(stage_point, standby_point))
    distance = np.sqrt(dx**2 + dy**2 + dz**2)

    # The unit vector is undefined for a zero-length offset; use zeros to avoid dividing by 0.
    if distance == 0:
        ux = uy = uz = 0
    else:
        ux, uy, uz = dx / distance, dy / distance, dz / distance

    return distance * (ux + uy + uz)

In [5]:
def calculate_and_store_vectors(data, process):
    """Add the head-vector feature for stages 1-3 of one process to `data` in place.

    Args:
        data: DataFrame holding the coordinate columns read by calculate_vector.
        process: Process suffix used in the column names (e.g. 'Dam').

    Creates the columns 'head_normal_vector_stage{1,2,3}_{process}'.
    """
    for stage in (1, 2, 3):
        feature_name = f'head_normal_vector_stage{stage}_{process}'
        data[feature_name] = data.apply(
            lambda row: calculate_vector(row, stage, process),
            axis=1,
        )

# Derive the head-vector features for every process: train first, then test.
for frame in (train_data, test_data):
    for process_name in ('Dam', 'Fill1', 'Fill2'):
        calculate_and_store_vectors(frame, process_name)

In [6]:
# Drop the raw resin-dispensing X/Y/Z coordinate columns (now summarised by the vector features).
columns_to_drop = []
for process_name in ('Dam', 'Fill1', 'Fill2'):
    for axis in ('X', 'Y', 'Z'):
        for stage in (1, 2, 3):
            columns_to_drop.append(
                f'HEAD NORMAL COORDINATE {axis} AXIS(Stage{stage}) Collect Result_{process_name}'
            )
            # Only the X axis of Stage1 has an accompanying "Judge Value" column in this list.
            if axis == 'X' and stage == 1:
                columns_to_drop.append(
                    f'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_{process_name}'
                )

# Remove the columns from both frames.
for frame in (train_data, test_data):
    frame.drop(columns=columns_to_drop, inplace=True)

In [7]:
# Merge the UV-cure coordinates into single comma-separated string columns.
def create_coordinate_columns(data):
    """Add combined cure start/end position columns to `data` in place.

    Each new column is the comma-joined string form of its source columns:
      - 'cure_end_position_XZΘ_Dam'    <- Dam cure end X, Z, Θ
      - 'cure_start_position_XΘ_Dam'   <- Dam cure start X, Θ
      - 'cure_end_position_XZ_Fill2'   <- Fill2 cure end X, Z
      - 'cure_start_position_XZ_Fill2' <- Fill2 cure start X, Z

    Args:
        data: DataFrame containing the source 'CURE ... Collect Result_*' columns.
    """
    combined_from = {
        'cure_end_position_XZΘ_Dam': (
            'CURE END POSITION X Collect Result_Dam',
            'CURE END POSITION Z Collect Result_Dam',
            'CURE END POSITION Θ Collect Result_Dam',
        ),
        'cure_start_position_XΘ_Dam': (
            'CURE START POSITION X Collect Result_Dam',
            'CURE START POSITION Θ Collect Result_Dam',
        ),
        'cure_end_position_XZ_Fill2': (
            'CURE END POSITION X Collect Result_Fill2',
            'CURE END POSITION Z Collect Result_Fill2',
        ),
        'cure_start_position_XZ_Fill2': (
            'CURE START POSITION X Collect Result_Fill2',
            'CURE START POSITION Z Collect Result_Fill2',
        ),
    }

    for new_column, source_columns in combined_from.items():
        text_parts = [data[source].astype(str) for source in source_columns]
        joined = text_parts[0]
        for part in text_parts[1:]:
            joined = joined + ',' + part
        data[new_column] = joined

# Build the combined cure-coordinate columns on both train_data and test_data.
for frame in (train_data, test_data):
    create_coordinate_columns(frame)

In [8]:
# Drop the raw UV-cure coordinate columns that were merged into combined columns.
columns_to_drop = [
    f'CURE {position} Collect Result_{process_name}'
    for process_name, positions in (
        ('Dam', ('END POSITION X', 'END POSITION Z', 'END POSITION Θ',
                 'START POSITION X', 'START POSITION Θ')),
        ('Fill2', ('END POSITION X', 'END POSITION Z',
                   'START POSITION X', 'START POSITION Z')),
    )
    for position in positions
]

# Remove the columns from both frames.
for frame in (train_data, test_data):
    frame.drop(columns=columns_to_drop, inplace=True)

In [9]:
# Drop the Z coordinate of the Dam nozzle zero position from both frames.
for frame in (train_data, test_data):
    frame.drop(columns='Head Zero Position Z Collect Result_Dam', inplace=True)

### 기본 변수

In [10]:
# Remove every column whose name contains 'Wip Line' (selection is based on train_data).
wip_line_columns = train_data.columns[
    train_data.columns.str.contains('Wip Line', regex=False)
]

for frame in (train_data, test_data):
    frame.drop(columns=wip_line_columns, inplace=True)

In [11]:
# Remove every column whose name contains 'Process Desc' (selection is based on train_data).
Process_Desc_col = train_data.columns[
    train_data.columns.str.contains('Process Desc', regex=False)
]

for frame in (train_data, test_data):
    frame.drop(columns=Process_Desc_col, inplace=True)

In [12]:
# Select the columns whose name contains 'Equipment' (substring match, not only a prefix).
new_train = train_data.filter(like='Equipment')
new_test = test_data.filter(like='Equipment')

Equipment_col, Equipment_col2 = new_train.columns, new_test.columns

# Derived feature: Equipment_same_num
def determine_equipment_same_num(row):
    """Return 1 when Dam, Fill1 and Fill2 use the same dispenser number, else 0.

    A row matches only if the AutoClave equipment is 'Auto Clave Out' and the
    three dispensers are all '#1' or all '#2'.

    Args:
        row: Mapping-like record with the keys 'Equipment_Dam',
            'Equipment_AutoClave', 'Equipment_Fill1' and 'Equipment_Fill2'.
    """
    observed = (
        row['Equipment_Dam'],
        row['Equipment_AutoClave'],
        row['Equipment_Fill1'],
        row['Equipment_Fill2'],
    )
    matching_lines = (
        ('Dam dispenser #1', 'Auto Clave Out', 'Fill1 dispenser #1', 'Fill2 dispenser #1'),
        ('Dam dispenser #2', 'Auto Clave Out', 'Fill1 dispenser #2', 'Fill2 dispenser #2'),
    )
    return 1 if observed in matching_lines else 0

# Attach the derived flag, then discard the four raw equipment columns.
equipment_columns = ['Equipment_Dam', 'Equipment_AutoClave', 'Equipment_Fill1', 'Equipment_Fill2']

train_data = train_data.assign(
    Equipment_same_num=new_train.apply(determine_equipment_same_num, axis=1)
).drop(columns=equipment_columns)

test_data = test_data.assign(
    Equipment_same_num=new_test.apply(determine_equipment_same_num, axis=1)
).drop(columns=equipment_columns)

In [13]:
# Keep the Dam copy of Model.Suffix under the plain name 'Model.Suffix' and
# drop the AutoClave / Fill1 / Fill2 copies.
redundant_suffix_columns = ['Model.Suffix_AutoClave', 'Model.Suffix_Fill1', 'Model.Suffix_Fill2']

train_data, test_data = (
    frame.rename(columns={'Model.Suffix_Dam': 'Model.Suffix'})
         .drop(columns=redundant_suffix_columns)
    for frame in (train_data, test_data)
)

In [14]:
# Keep the Dam copy of Workorder under the plain name 'Workorder' and
# drop the AutoClave / Fill1 / Fill2 copies.
redundant_workorder_columns = ['Workorder_AutoClave', 'Workorder_Fill1', 'Workorder_Fill2']

train_data, test_data = (
    frame.rename(columns={'Workorder_Dam': 'Workorder'})
         .drop(columns=redundant_workorder_columns)
    for frame in (train_data, test_data)
)

In [15]:
# Remove every column whose name contains 'Insp. Seq No' (selection is based on train_data).
Insp_Seq_No_col = train_data.columns[
    train_data.columns.str.contains('Insp. Seq No', regex=False)
]

for frame in (train_data, test_data):
    frame.drop(columns=Insp_Seq_No_col, inplace=True)

In [16]:
# Remove every column whose name contains 'Insp Judge Code' (selection is based on train_data).
Insp_Judge_Code_col = train_data.columns[
    train_data.columns.str.contains('Insp Judge Code', regex=False)
]

for frame in (train_data, test_data):
    frame.drop(columns=Insp_Judge_Code_col, inplace=True)

In [17]:
# Remove columns that hold exactly one distinct value and have no missing values.
def drop_single_value_columns(df):
    """Drop constant, fully populated columns from `df`.

    A column is dropped when it has exactly one distinct value and no nulls.
    The 'target' column is never dropped.

    Args:
        df: Input DataFrame (not modified).

    Returns:
        (df_dropped, cols_to_drop): the reduced DataFrame and the list of
        removed column names, in their original column order.
    """
    constant_columns = []
    for name in df.columns:
        if name == 'target':
            continue
        series = df[name]
        if series.nunique() == 1 and series.isnull().sum() == 0:
            constant_columns.append(name)

    return df.drop(columns=constant_columns), constant_columns

# Decide which constant columns to remove by looking at train_data only, then
# remove those same columns from test_data. Detecting constant columns on each
# frame independently can leave train and test with different feature sets,
# which would break prediction later on.
train_data, train_cols_dropped = drop_single_value_columns(train_data)

# Only columns that actually exist in test_data can be dropped from it.
test_cols_dropped = [col for col in train_cols_dropped if col in test_data.columns]
test_data = test_data.drop(columns=test_cols_dropped)

# Report how many columns were removed from each frame.
# print("삭제된 train_data 열 이름:", train_cols_dropped)
print("삭제된 train_data 열 개수:", len(train_cols_dropped))

# print("삭제된 test_data 열 이름:", test_cols_dropped)
print("삭제된 test_data 열 개수:", len(test_cols_dropped))

삭제된 train_data 열 개수: 42
삭제된 test_data 열 개수: 42


### 제품 관련 변수

In [18]:
# Derived feature: the recipe number when Dam, Fill1 and Fill2 all agree, otherwise 'diff'.
def _shared_receip_no(row):
    """Return the common 'Receip No' of the three processes, or 'diff' if they differ."""
    dam = row['Receip No Collect Result_Dam']
    if dam == row['Receip No Collect Result_Fill1'] == row['Receip No Collect Result_Fill2']:
        return dam
    return 'diff'


for frame in (train_data, test_data):
    frame['Receip_No'] = frame.apply(_shared_receip_no, axis=1)

In [19]:
# Derived feature: combination of Model.Suffix and Receip_No, joined by '_'.
for frame in (train_data, test_data):
    receip_text = frame['Receip_No'].astype(str)
    frame['model_receip'] = frame['Model.Suffix'] + '_' + receip_text

In [20]:
# Derived feature: first four characters of Workorder -> workorder_prefix.
for frame in (train_data, test_data):
    frame['workorder_prefix'] = frame['Workorder'].str.slice(0, 4)

In [21]:
# Derived feature: workorder_prefix combined with Receip_No, but only when
# Receip_No is 'diff', 3.0 or 9.0; otherwise the bare workorder_prefix.
def _workorder_receip_label(row):
    """Return '<prefix>_<receip>' for the selected Receip_No values, else the prefix."""
    prefix, receip = row['workorder_prefix'], row['Receip_No']
    if receip in ['diff', 3.0, 9.0]:
        return f"{prefix}_{receip}"
    return prefix


for frame in (train_data, test_data):
    frame['workorder_receip'] = frame.apply(_workorder_receip_label, axis=1)

In [22]:
# Source columns that have been folded into model_receip / workorder_receip.
receip_source_columns = [
    f'Receip No Collect Result_{process_name}'
    for process_name in ('Dam', 'Fill1', 'Fill2')
]
columns_to_drop = (
    ['Model.Suffix', 'Workorder', 'workorder_prefix', 'Receip_No']
    + receip_source_columns
)

# Remove the columns from both frames.
for frame in (train_data, test_data):
    frame.drop(columns=columns_to_drop, inplace=True)

### dam circle & line

In [23]:
## circle (train)
# Circle1 distance speed of each Dam stage.
circle1_speed = {
    stage: train_data[f'Stage{stage} Circle1 Distance Speed Collect Result_Dam']
    for stage in (1, 2, 3)
}

# Absolute change between consecutive stages.
stage1_to_2 = (circle1_speed[2] - circle1_speed[1]).abs()
stage2_to_3 = (circle1_speed[3] - circle1_speed[2]).abs()

train_data['Stage1_Stage2_Absolute_Difference'] = stage1_to_2
train_data['Stage2_Stage3_Absolute_Difference'] = stage2_to_3

# Sum of the two absolute changes.
train_data['total_circle_distance_speed_Dam'] = stage1_to_2 + stage2_to_3

In [24]:
## circle (test)
# Circle1 distance speed of each Dam stage.
circle1_speed = {
    stage: test_data[f'Stage{stage} Circle1 Distance Speed Collect Result_Dam']
    for stage in (1, 2, 3)
}

# Absolute change between consecutive stages.
stage1_to_2 = (circle1_speed[2] - circle1_speed[1]).abs()
stage2_to_3 = (circle1_speed[3] - circle1_speed[2]).abs()

test_data['Stage1_Stage2_Absolute_Difference'] = stage1_to_2
test_data['Stage2_Stage3_Absolute_Difference'] = stage2_to_3

# Sum of the two absolute changes.
test_data['total_circle_distance_speed_Dam'] = stage1_to_2 + stage2_to_3

In [25]:
## line (train)
def _uniform_value_or_diff(row, columns):
    """Return the shared value of `columns` in `row`, or 'diff' if any neighbour pair differs."""
    values = [row[column] for column in columns]
    if all(left == right for left, right in zip(values, values[1:])):
        return values[0]
    return 'diff'


# Per stage: the common Line1-Line4 distance speed, or 'diff' when the lines disagree.
for stage in (1, 2, 3):
    line_columns = [
        f'Stage{stage} Line{line} Distance Speed Collect Result_Dam'
        for line in (1, 2, 3, 4)
    ]
    train_data[f'stage{stage}_line_distance_speed_Dam'] = train_data.apply(
        _uniform_value_or_diff, axis=1, args=(line_columns,)
    )

# Across stages: the common per-stage value, or 'diff' when the stages disagree.
stage_columns = [f'stage{stage}_line_distance_speed_Dam' for stage in (1, 2, 3)]
train_data['total_line_distance_speed_Dam'] = train_data.apply(
    _uniform_value_or_diff, axis=1, args=(stage_columns,)
)

In [26]:
## line (test)
def _uniform_value_or_diff(row, columns):
    """Return the shared value of `columns` in `row`, or 'diff' if any neighbour pair differs."""
    values = [row[column] for column in columns]
    if all(left == right for left, right in zip(values, values[1:])):
        return values[0]
    return 'diff'


# Per stage: the common Line1-Line4 distance speed, or 'diff' when the lines disagree.
for stage in (1, 2, 3):
    line_columns = [
        f'Stage{stage} Line{line} Distance Speed Collect Result_Dam'
        for line in (1, 2, 3, 4)
    ]
    test_data[f'stage{stage}_line_distance_speed_Dam'] = test_data.apply(
        _uniform_value_or_diff, axis=1, args=(line_columns,)
    )

# Across stages: the common per-stage value, or 'diff' when the stages disagree.
stage_columns = [f'stage{stage}_line_distance_speed_Dam' for stage in (1, 2, 3)]
test_data['total_line_distance_speed_Dam'] = test_data.apply(
    _uniform_value_or_diff, axis=1, args=(stage_columns,)
)

In [27]:
# Columns to drop: the raw Circle/Line distance-speed columns of every Dam stage ...
columns_to_drop = [
    f'Stage{stage} {shape}{number} Distance Speed Collect Result_Dam'
    for stage in (1, 2, 3)
    for shape in ('Circle', 'Line')
    for number in (1, 2, 3, 4)
]
# ... plus the intermediate columns used to build the total_* features.
columns_to_drop += [
    'Stage1_Stage2_Absolute_Difference',
    'Stage2_Stage3_Absolute_Difference',
    'stage1_line_distance_speed_Dam',
    'stage2_line_distance_speed_Dam',
    'stage3_line_distance_speed_Dam',
]

# Remove the columns from both frames.
for frame in (train_data, test_data):
    frame.drop(columns=columns_to_drop, inplace=True)

### dam dispense volume & time

In [28]:
## train
# volume * time derived features for Dam and Fill1: one product per stage,
# then the mean of the three stage products.
for process_name in ('Dam', 'Fill1'):
    stage_products = []
    for stage in (1, 2, 3):
        product_column = f'volume_time_multip_stage{stage}_{process_name}'
        volume = train_data[f'Dispense Volume(Stage{stage}) Collect Result_{process_name}']
        discharge_time = train_data[f'DISCHARGED TIME OF RESIN(Stage{stage}) Collect Result_{process_name}']
        train_data[product_column] = volume * discharge_time
        stage_products.append(train_data[product_column])

    first, second, third = stage_products
    train_data[f'volume_time_multip_avg_{process_name}'] = (first + second + third) / 3

In [29]:
## test
# volume * time derived features for Dam and Fill1: one product per stage,
# then the mean of the three stage products.
for process_name in ('Dam', 'Fill1'):
    stage_products = []
    for stage in (1, 2, 3):
        product_column = f'volume_time_multip_stage{stage}_{process_name}'
        volume = test_data[f'Dispense Volume(Stage{stage}) Collect Result_{process_name}']
        discharge_time = test_data[f'DISCHARGED TIME OF RESIN(Stage{stage}) Collect Result_{process_name}']
        test_data[product_column] = volume * discharge_time
        stage_products.append(test_data[product_column])

    first, second, third = stage_products
    test_data[f'volume_time_multip_avg_{process_name}'] = (first + second + third) / 3

In [30]:
# Columns to delete: raw discharge-time / dispense-volume inputs and the per-stage products.
columns_to_drop = [
    f'{measure}(Stage{stage}) Collect Result_{process_name}'
    for process_name in ('Dam', 'Fill1')
    for measure in ('DISCHARGED TIME OF RESIN', 'Dispense Volume')
    for stage in (1, 2, 3)
] + [
    f'volume_time_multip_stage{stage}_{process_name}'
    for process_name in ('Dam', 'Fill1')
    for stage in (1, 2, 3)
]

# Missing columns are tolerated here (errors='ignore').
for frame in (train_data, test_data):
    frame.drop(columns=columns_to_drop, inplace=True, errors='ignore')

### dam thickness

In [31]:
# New column: row-wise mean of the three Dam thickness measurements.
thickness_columns = [f'THICKNESS {number} Collect Result_Dam' for number in (1, 2, 3)]

for frame in (train_data, test_data):
    frame['average_thickness_Dam'] = frame[thickness_columns].mean(axis=1)

In [32]:
# Columns to delete: the raw thickness measurements.
columns_to_drop = [f'THICKNESS {number} Collect Result_Dam' for number in (1, 2, 3)]

# Remove the listed columns from both frames.
for frame in (train_data, test_data):
    frame.drop(columns=columns_to_drop, inplace=True)

### autoclave pressure & unit time

In [33]:
# New columns (train): pressure * unit time for each AutoClave step, then their mean.
pressure_time_pairs = {
    '1st': ('1st Pressure Collect Result_AutoClave', '1st Pressure 1st Pressure Unit Time_AutoClave'),
    '2nd': ('2nd Pressure Collect Result_AutoClave', '2nd Pressure Unit Time_AutoClave'),
    '3rd': ('3rd Pressure Collect Result_AutoClave', '3rd Pressure Unit Time_AutoClave'),
}
for ordinal, (pressure_column, time_column) in pressure_time_pairs.items():
    train_data[f'{ordinal}_pressure_time_AutoClave'] = (
        train_data[pressure_column] * train_data[time_column]
    )

train_data['avg_pressure_time_AutoClave'] = (
    train_data['1st_pressure_time_AutoClave']
    + train_data['2nd_pressure_time_AutoClave']
    + train_data['3rd_pressure_time_AutoClave']
) / 3

In [34]:
# New columns (test): pressure * unit time for each AutoClave step, then their mean.
pressure_time_pairs = {
    '1st': ('1st Pressure Collect Result_AutoClave', '1st Pressure 1st Pressure Unit Time_AutoClave'),
    '2nd': ('2nd Pressure Collect Result_AutoClave', '2nd Pressure Unit Time_AutoClave'),
    '3rd': ('3rd Pressure Collect Result_AutoClave', '3rd Pressure Unit Time_AutoClave'),
}
for ordinal, (pressure_column, time_column) in pressure_time_pairs.items():
    test_data[f'{ordinal}_pressure_time_AutoClave'] = (
        test_data[pressure_column] * test_data[time_column]
    )

test_data['avg_pressure_time_AutoClave'] = (
    test_data['1st_pressure_time_AutoClave']
    + test_data['2nd_pressure_time_AutoClave']
    + test_data['3rd_pressure_time_AutoClave']
) / 3

In [35]:
# Columns to delete: raw pressure / unit-time inputs and the per-step products.
columns_to_drop = []
for ordinal in ('1st', '2nd', '3rd'):
    # The 1st step's unit-time column repeats "1st Pressure" in its name.
    if ordinal == '1st':
        unit_time_label = '1st Pressure 1st Pressure Unit Time'
    else:
        unit_time_label = f'{ordinal} Pressure Unit Time'
    columns_to_drop += [
        f'{ordinal} Pressure Collect Result_AutoClave',
        f'{unit_time_label}_AutoClave',
    ]
columns_to_drop += [
    f'{ordinal}_pressure_time_AutoClave' for ordinal in ('1st', '2nd', '3rd')
]

# Remove the listed columns from both frames.
for frame in (train_data, test_data):
    frame.drop(columns=columns_to_drop, inplace=True)

### total tact time

In [36]:
# total_tact_time: sum of the three machine tact times and the AutoClave chamber unit time.
tact_time_columns = [
    'Machine Tact time Collect Result_Dam',
    'Machine Tact time Collect Result_Fill1',
    'Machine Tact time Collect Result_Fill2',
    'Chamber Temp. Unit Time_AutoClave',
]

for frame in (train_data, test_data):
    running_total = frame[tact_time_columns[0]]
    for column in tact_time_columns[1:]:
        running_total = running_total + frame[column]
    frame['total_tact_time'] = running_total

In [37]:
# Columns to delete: tact-time inputs plus remaining AutoClave judge/code columns.
machine_tact_columns = [
    f'Machine Tact time Collect Result_{process_name}'
    for process_name in ('Dam', 'Fill1', 'Fill2')
]
autoclave_columns = [
    'Chamber Temp. Unit Time_AutoClave',
    'Chamber Temp. Judge Value_AutoClave',
    'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
    'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave',
]
columns_to_drop = machine_tact_columns + autoclave_columns

# Remove the listed columns from both frames.
for frame in (train_data, test_data):
    frame.drop(columns=columns_to_drop, inplace=True)

### 기타 변수 삭제

In [38]:
# Columns to delete: PalletID / Production Qty / WorkMode for every dispensing process.
columns_to_drop = [
    f'{field} Collect Result_{process_name}'
    for process_name in ('Dam', 'Fill1', 'Fill2')
    for field in ('PalletID', 'Production Qty', 'WorkMode')
]

# Remove the listed columns from both frames.
for frame in (train_data, test_data):
    frame.drop(columns=columns_to_drop, inplace=True)

# modeling

### workorder_receip & model_receip 가중치 부여

In [39]:
# AbNormal ratio per workorder_receip category, computed on the training data.
is_abnormal = train_data['target'] == 'AbNormal'
abnormal_counts = train_data.loc[is_abnormal, 'workorder_receip'].value_counts()
total_counts = train_data['workorder_receip'].value_counts()
abnormal_ratio = abnormal_counts / total_counts

# Map the ratio onto both frames; categories without a ratio get 0.
for frame in (train_data, test_data):
    frame['workorder_receip_weighted'] = (
        frame['workorder_receip'].map(abnormal_ratio).fillna(0)
    )

In [40]:
# AbNormal ratio per model_receip category, computed on the training data.
is_abnormal = train_data['target'] == 'AbNormal'
abnormal_counts = train_data.loc[is_abnormal, 'model_receip'].value_counts()
total_counts = train_data['model_receip'].value_counts()
abnormal_ratio = abnormal_counts / total_counts

# Map the ratio onto both frames; categories without a ratio get 0.
for frame in (train_data, test_data):
    frame['model_receip_weighted'] = (
        frame['model_receip'].map(abnormal_ratio).fillna(0)
    )

In [41]:
# Drop the 'workorder_receip' and 'model_receip' columns now that their weighted versions exist.
encoded_source_columns = ['workorder_receip', 'model_receip']
train_data, test_data = (
    frame.drop(columns=encoded_source_columns)
    for frame in (train_data, test_data)
)

### 컬럼 확인 및 인코딩

In [42]:
# Manual label encoding (the encoder raised errors on these columns).
# Each column maps its known category values to integer codes.
manual_label_maps = {
    'cure_end_position_XZΘ_Dam': {'240.0,2.5,-90': 0, '1000.0,12.5,90': 1},
    'cure_start_position_XΘ_Dam': {'1030,-90': 0, '280,90': 1},
    'cure_end_position_XZ_Fill2': {'240,33': 0, '240,32': 1, '1020,33': 2, '240,22': 3},
    'cure_start_position_XZ_Fill2': {'1020,33': 0, '1020,32': 1, '1020,22': 2, '1020,23': 3, '240,33': 4},
    'total_line_distance_speed_Dam': {'diff': 0, 9000.0: 1, 6500.0: 2, 4000.0: 3},
}

for column, code_by_value in manual_label_maps.items():
    train_data[column] = train_data[column].replace(code_by_value)
    test_data[column] = test_data[column].replace(code_by_value)

In [44]:
# Print the column summary for every column, without truncation.
train_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 35 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   CURE SPEED Collect Result_Dam                   40506 non-null  int64  
 1   DISCHARGED SPEED OF RESIN Collect Result_Dam    40506 non-null  int64  
 2   Head Clean Position Z Collect Result_Dam        40506 non-null  float64
 3   Head Purge Position Z Collect Result_Dam        40506 non-null  float64
 4   Head Zero Position Y Collect Result_Dam         40506 non-null  float64
 5   Chamber Temp. Collect Result_AutoClave          40506 non-null  int64  
 6   DISCHARGED SPEED OF RESIN Collect Result_Fill1  40506 non-null  float64
 7   Head Purge Position Z Collect Result_Fill1      40506 non-null  float64
 8   CURE SPEED Collect Result_Fill2                 40506 non-null  int64  
 9   CURE STANDBY POSITION Z Collect Result_

### random forest

In [45]:
# Separate the label column from the training features.
x_train, y_train = train_data.drop(columns='target'), train_data['target']

In [46]:
# Class weights: errors on the 'AbNormal' class are weighted 5x relative to 'Normal'.
class_weights = {'Normal': 1, 'AbNormal': 5}

# Build and fit the random forest.
# n_jobs=-1 trains the 900 trees on all available cores; with random_state fixed
# the fitted model is the same as a single-threaded run, only faster.
rf = RandomForestClassifier(
    n_estimators=900,
    class_weight=class_weights,
    random_state=42,
    n_jobs=-1,
)
rf.fit(x_train, y_train)

In [47]:
# Separate the 'Set ID' and 'target' columns from the test features.
id_and_label_columns = ['Set ID', 'target']
set_id, target = (test_data[name] for name in id_and_label_columns)

x_test = test_data.drop(columns=id_and_label_columns)

In [48]:
# Predict labels for the test features with the fitted random forest,
# then display the prediction array as the cell output.
test_pred = rf.predict(x_test)
test_pred

array(['Normal', 'Normal', 'Normal', ..., 'Normal', 'Normal', 'Normal'],
      dtype=object)

In [49]:
# Read the submission template and fill its 'target' column with the predictions.
df_sub = pd.read_csv("submission.csv").assign(target=test_pred)

# Save the submission file (overwrites the template that was just read).
df_sub.to_csv("submission.csv", index=False)

In [51]:
'''
send files ['code.ipynb', 'submission.csv'] for grade...
done!

Score: None
Duration: 1.538 seconds
=== Message ===
작성하신 답안 제출이 완료되었습니다.
Public Score : 0.14593698175787728
'''

"\nsend files ['code.ipynb', 'submission.csv'] for grade...\ndone!\n\nScore: None\nDuration: 1.538 seconds\n=== Message ===\n작성하신 답안 제출이 완료되었습니다.\nPublic Score : 0.14593698175787728\n"

In [52]:
# Save an additional copy of the submission under a separate file name.
df_sub.to_csv("0813_submission.csv", index=False)