## 공정 4개 나눠서 모델링

전처리: 0814_preprocessing_sep.ipynb

In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [2]:
# csv 불러오기 (0814_preprocessing_sep.ipynb에서 전처리된 데이터)
train_data = pd.read_csv('sep_model_train_data.csv')
test_data = pd.read_csv('sep_model_test_data.csv')

In [3]:
### 총시간 대비 비율 변수
def calculate_total_time_and_ratios(data):
    data['total_time'] = (
        data['Machine Tact time Collect Result_Dam'] +
        data['Machine Tact time Collect Result_Fill1'] +
        data['Machine Tact time Collect Result_Fill2'] +
        data['Chamber Temp. Unit Time_AutoClave']
    )
    data['time_ratio_Dam'] = (data['Machine Tact time Collect Result_Dam'] / data['total_time']).round(3)
    data['time_ratio_Fill1'] = (data['Machine Tact time Collect Result_Fill1'] / data['total_time']).round(3)
    data['time_ratio_Fill2'] = (data['Machine Tact time Collect Result_Fill2'] / data['total_time']).round(3)
    data['time_ratio_AutoClave'] = (data['Chamber Temp. Unit Time_AutoClave'] / data['total_time']).round(3)
    return data

# train_data와 test_data에 함수 적용
train_data = calculate_total_time_and_ratios(train_data)
test_data = calculate_total_time_and_ratios(test_data)

In [4]:
# 변수 제거
train_data.drop(columns=[
    'total_time'
    , 'Machine Tact time Collect Result_Dam'
    , 'Machine Tact time Collect Result_Fill1'
    , 'Machine Tact time Collect Result_Fill2'
    , 'Chamber Temp. Unit Time_AutoClave'], inplace=True)

test_data.drop(columns=[
    'total_time'
    , 'Machine Tact time Collect Result_Dam'
    , 'Machine Tact time Collect Result_Fill1'
    , 'Machine Tact time Collect Result_Fill2'
    , 'Chamber Temp. Unit Time_AutoClave'], inplace=True)

### 공통 변수

In [5]:
variables = [
    "Equipment_same_num",
    "PalletID",
    "Production_Qty",
    "model_receip_encoded",
    "workorder_receip_encoded"
]

# 변수들로만 이루어진 DataFrame 생성
filtered_data = train_data[variables]

In [6]:
# 상관계수 행렬 계산
correlation_matrix = filtered_data.corr()

# 자기자신을 제외하고 특정 값 이상인 조합 찾기
strong_correlations = correlation_matrix[(correlation_matrix >= 0.8) & (correlation_matrix != 1)]

# 리스트로 변환
strong_correlations_pairs = strong_correlations.stack().reset_index()
strong_correlations_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# 결과 출력
strong_correlations_pairs = strong_correlations_pairs[strong_correlations_pairs['Correlation'] >= 0.7]
print(strong_correlations_pairs)

       Variable 1      Variable 2  Correlation
0        PalletID  Production_Qty     0.944591
1  Production_Qty        PalletID     0.944591


In [7]:
# 열 삭제
train_data.drop("PalletID", axis=1, inplace=True)
test_data.drop("PalletID", axis=1, inplace=True)

In [8]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 40 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Set ID                                          17361 non-null  object 
 1   CURE SPEED Collect Result_Dam                   17361 non-null  int64  
 2   DISCHARGED SPEED OF RESIN Collect Result_Dam    17361 non-null  int64  
 3   Head Clean Position Z Collect Result_Dam        17361 non-null  float64
 4   Head Purge Position Z Collect Result_Dam        17361 non-null  float64
 5   Head Zero Position Y Collect Result_Dam         17361 non-null  float64
 6   Chamber Temp. Collect Result_AutoClave          17361 non-null  int64  
 7   DISCHARGED SPEED OF RESIN Collect Result_Fill1  17361 non-null  float64
 8   Head Purge Position Z Collect Result_Fill1      17361 non-null  float64
 9   CURE SPEED Collect Result_Fill2        

-> 상관관계가 높은 이유: Production_Qty가 0이고 PalletID가 1인 경우는 없음 (Production_Qty가 1인 경우 안에 PalletID가 1인 경우가 모두 포함됨). 따라서 PalletID 컬럼을 삭제.

In [9]:
# 공통 변수 리스트
same_variables_train = [
    "target",
    "Equipment_same_num",
    "Production_Qty",
    "model_receip_encoded",
    "workorder_receip_encoded"
]

same_variables_test = [
    "Set ID",
    "target",
    "Equipment_same_num",
    "Production_Qty",
    "model_receip_encoded",
    "workorder_receip_encoded"
]

In [10]:
# 'target'과 'Set ID' 열을 제외한 나머지 변수들을 범주형으로 변환
for var in same_variables_train:
    if var != "target":  # 'target' 제외
        train_data[var] = train_data[var].astype(str).astype('category')

for var in same_variables_test:
    if var != "Set ID" and var != "target":  # 'Set ID'와 'target' 제외
        test_data[var] = test_data[var].astype(str).astype('category')

### dam

In [11]:
# 열 이름 필터링
Process_Desc_col = train_data.filter(like='_Dam').columns

# 필터링된 열 이름 출력
print("<Dam 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<Dam 공정 관련 변수>
CURE SPEED Collect Result_Dam
DISCHARGED SPEED OF RESIN Collect Result_Dam
Head Clean Position Z Collect Result_Dam
Head Purge Position Z Collect Result_Dam
Head Zero Position Y Collect Result_Dam
head_normal_vector_stage1_Dam
head_normal_vector_stage2_Dam
head_normal_vector_stage3_Dam
cure_end_position_XZΘ_Dam
cure_start_position_XΘ_Dam
total_circle_distance_speed_Dam
total_line_distance_speed_Dam
volume_time_multip_avg_Dam
average_thickness_Dam
time_ratio_Dam


In [12]:
variables = [
    "CURE SPEED Collect Result_Dam",
    "DISCHARGED SPEED OF RESIN Collect Result_Dam",
    "Head Clean Position Z Collect Result_Dam",
    "Head Purge Position Z Collect Result_Dam",
    "Head Zero Position Y Collect Result_Dam",
    #"head_normal_vector_stage1_Dam",
    #"head_normal_vector_stage2_Dam",
    #"head_normal_vector_stage3_Dam",
    "cure_end_position_XZΘ_Dam",
    "cure_start_position_XΘ_Dam",
    "total_circle_distance_speed_Dam",
    "total_line_distance_speed_Dam",
    "volume_time_multip_avg_Dam",
    "average_thickness_Dam"
]

# 변수들로만 이루어진 DataFrame 생성
filtered_data = train_data[variables]

In [13]:
# 상관계수 행렬 계산
correlation_matrix = filtered_data.corr()

# 자기자신을 제외하고 특정 값 이상인 조합 찾기
strong_correlations = correlation_matrix[(correlation_matrix >= 0.8) & (correlation_matrix != 1)]

# 리스트로 변환
strong_correlations_pairs = strong_correlations.stack().reset_index()
strong_correlations_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# 결과 출력
strong_correlations_pairs = strong_correlations_pairs[strong_correlations_pairs['Correlation'] >= 0.7]
print(strong_correlations_pairs)

Empty DataFrame
Columns: [Variable 1, Variable 2, Correlation]
Index: []


In [14]:
# 드랍할 열 목록
columns_to_drop = [
    'head_normal_vector_stage1_Dam',
    'head_normal_vector_stage2_Dam',
    'head_normal_vector_stage3_Dam',
]

# 열 삭제
train_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')
test_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

In [15]:
### dam 데이터셋
# 열 이름 필터링 후 공통 변수와 결합
Process_Desc_col = train_data.filter(like='_Dam').columns

# train
final_columns_train = list(Process_Desc_col) + same_variables_train
df_train_dam = train_data[final_columns_train]

# test 
final_columns_test = list(Process_Desc_col) + same_variables_test
df_test_dam = test_data[final_columns_test]

In [16]:
# 변수 리스트
categories = [
    "Head Clean Position Z Collect Result_Dam",
    "Head Purge Position Z Collect Result_Dam",
    "Head Zero Position Y Collect Result_Dam",
    "cure_end_position_XZΘ_Dam",
    "cure_start_position_XΘ_Dam"
]

# 범주형으로 변환
df_train_dam[categories] = df_train_dam[categories].astype(str).astype('category')
df_test_dam[categories] = df_test_dam[categories].astype(str).astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_dam[categories] = df_train_dam[categories].astype(str).astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_dam[categories] = df_test_dam[categories].astype(str).astype('category')


In [17]:
df_train_dam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 17 columns):
 #   Column                                        Non-Null Count  Dtype   
---  ------                                        --------------  -----   
 0   CURE SPEED Collect Result_Dam                 40506 non-null  int64   
 1   DISCHARGED SPEED OF RESIN Collect Result_Dam  40506 non-null  int64   
 2   Head Clean Position Z Collect Result_Dam      40506 non-null  category
 3   Head Purge Position Z Collect Result_Dam      40506 non-null  category
 4   Head Zero Position Y Collect Result_Dam       40506 non-null  category
 5   cure_end_position_XZΘ_Dam                     40506 non-null  category
 6   cure_start_position_XΘ_Dam                    40506 non-null  category
 7   total_circle_distance_speed_Dam               40506 non-null  float64 
 8   total_line_distance_speed_Dam                 40506 non-null  int64   
 9   volume_time_multip_avg_Dam                    4050

In [18]:
df_test_dam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 18 columns):
 #   Column                                        Non-Null Count  Dtype   
---  ------                                        --------------  -----   
 0   CURE SPEED Collect Result_Dam                 17361 non-null  int64   
 1   DISCHARGED SPEED OF RESIN Collect Result_Dam  17361 non-null  int64   
 2   Head Clean Position Z Collect Result_Dam      17361 non-null  category
 3   Head Purge Position Z Collect Result_Dam      17361 non-null  category
 4   Head Zero Position Y Collect Result_Dam       17361 non-null  category
 5   cure_end_position_XZΘ_Dam                     17361 non-null  category
 6   cure_start_position_XΘ_Dam                    17361 non-null  category
 7   total_circle_distance_speed_Dam               17361 non-null  float64 
 8   total_line_distance_speed_Dam                 17361 non-null  int64   
 9   volume_time_multip_avg_Dam                    1736

### fill1

In [19]:
# 열 이름 필터링
Process_Desc_col = train_data.filter(like='_Fill1').columns

# 필터링된 열 이름 출력
print("<Fill1 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<Fill1 공정 관련 변수>
DISCHARGED SPEED OF RESIN Collect Result_Fill1
Head Purge Position Z Collect Result_Fill1
head_normal_vector_stage1_Fill1
head_normal_vector_stage2_Fill1
head_normal_vector_stage3_Fill1
volume_time_multip_avg_Fill1
time_ratio_Fill1


In [20]:
variables = [
    "DISCHARGED SPEED OF RESIN Collect Result_Fill1",
    "Head Purge Position Z Collect Result_Fill1",
    "head_normal_vector_stage1_Fill1",
    #"head_normal_vector_stage2_Fill1",
    #"head_normal_vector_stage3_Fill1",
    "volume_time_multip_avg_Fill1",
    "time_ratio_Fill1"
]

# 변수들로만 이루어진 DataFrame 생성
filtered_data = train_data[variables]

In [21]:
# 상관계수 행렬 계산
correlation_matrix = filtered_data.corr()

# 자기자신을 제외하고 특정 값 이상인 조합 찾기
strong_correlations = correlation_matrix[(correlation_matrix >= 0.8) & (correlation_matrix != 1)]

# 리스트로 변환
strong_correlations_pairs = strong_correlations.stack().reset_index()
strong_correlations_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# 결과 출력
strong_correlations_pairs = strong_correlations_pairs[strong_correlations_pairs['Correlation'] >= 0.7]
print(strong_correlations_pairs)

Empty DataFrame
Columns: [Variable 1, Variable 2, Correlation]
Index: []


In [22]:
# 드랍할 열 목록
columns_to_drop = [
    "head_normal_vector_stage2_Fill1",
    "head_normal_vector_stage3_Fill1"
]

# 열 삭제
train_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')
test_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

In [23]:
### fill1 데이터셋
# 열 이름 필터링 후 공통 변수와 결합
Process_Desc_col = train_data.filter(like='_Fill1').columns

# train
final_columns_train = list(Process_Desc_col) + same_variables_train
df_train_fill1 = train_data[final_columns_train]

# test 
final_columns_test = list(Process_Desc_col) + same_variables_test
df_test_fill1 = test_data[final_columns_test]

In [24]:
# 범주형으로 변환
df_train_fill1["Head Purge Position Z Collect Result_Fill1"] = df_train_fill1["Head Purge Position Z Collect Result_Fill1"].astype(str).astype('category')
df_test_fill1["Head Purge Position Z Collect Result_Fill1"] = df_test_fill1["Head Purge Position Z Collect Result_Fill1"].astype(str).astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_fill1["Head Purge Position Z Collect Result_Fill1"] = df_train_fill1["Head Purge Position Z Collect Result_Fill1"].astype(str).astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_fill1["Head Purge Position Z Collect Result_Fill1"] = df_test_fill1["Head Purge Position Z Collect Result_Fill1"].astype(str).astype('category')


In [25]:
df_train_fill1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 10 columns):
 #   Column                                          Non-Null Count  Dtype   
---  ------                                          --------------  -----   
 0   DISCHARGED SPEED OF RESIN Collect Result_Fill1  40506 non-null  float64 
 1   Head Purge Position Z Collect Result_Fill1      40506 non-null  category
 2   head_normal_vector_stage1_Fill1                 40506 non-null  float64 
 3   volume_time_multip_avg_Fill1                    40506 non-null  float64 
 4   time_ratio_Fill1                                40506 non-null  float64 
 5   target                                          40506 non-null  object  
 6   Equipment_same_num                              40506 non-null  category
 7   Production_Qty                                  40506 non-null  category
 8   model_receip_encoded                            40506 non-null  category
 9   workorder_receip_encoded    

In [26]:
df_test_fill1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 11 columns):
 #   Column                                          Non-Null Count  Dtype   
---  ------                                          --------------  -----   
 0   DISCHARGED SPEED OF RESIN Collect Result_Fill1  17361 non-null  float64 
 1   Head Purge Position Z Collect Result_Fill1      17361 non-null  category
 2   head_normal_vector_stage1_Fill1                 17361 non-null  float64 
 3   volume_time_multip_avg_Fill1                    17361 non-null  float64 
 4   time_ratio_Fill1                                17361 non-null  float64 
 5   Set ID                                          17361 non-null  object  
 6   target                                          0 non-null      float64 
 7   Equipment_same_num                              17361 non-null  category
 8   Production_Qty                                  17361 non-null  category
 9   model_receip_encoded        

### fill2

In [27]:
# 열 이름 필터링
Process_Desc_col = train_data.filter(like='_Fill2').columns

# 필터링된 열 이름 출력
print("<Fill2 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<Fill2 공정 관련 변수>
CURE SPEED Collect Result_Fill2
CURE STANDBY POSITION Z Collect Result_Fill2
Head Purge Position Z Collect Result_Fill2
head_normal_vector_stage1_Fill2
head_normal_vector_stage2_Fill2
head_normal_vector_stage3_Fill2
cure_end_position_XZ_Fill2
cure_start_position_XZ_Fill2
time_ratio_Fill2


In [28]:
variables = [
    "CURE SPEED Collect Result_Fill2",
    "CURE STANDBY POSITION Z Collect Result_Fill2",
    "Head Purge Position Z Collect Result_Fill2",
    "head_normal_vector_stage1_Fill2",
    #"head_normal_vector_stage2_Fill2",
    #"head_normal_vector_stage3_Fill2",
    "cure_end_position_XZ_Fill2",
    "cure_start_position_XZ_Fill2",
    "time_ratio_Fill2"
]

# 변수들로만 이루어진 DataFrame 생성
filtered_data = train_data[variables]

In [29]:
# 상관계수 행렬 계산
correlation_matrix = filtered_data.corr()

# 자기자신을 제외하고 특정 값 이상인 조합 찾기
strong_correlations = correlation_matrix[(correlation_matrix >= 0.8) & (correlation_matrix != 1)]

# 리스트로 변환
strong_correlations_pairs = strong_correlations.stack().reset_index()
strong_correlations_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# 결과 출력
strong_correlations_pairs = strong_correlations_pairs[strong_correlations_pairs['Correlation'] >= 0.7]
print(strong_correlations_pairs)

                                     Variable 1  \
0  CURE STANDBY POSITION Z Collect Result_Fill2   
1                  cure_start_position_XZ_Fill2   

                                     Variable 2  Correlation  
0                  cure_start_position_XZ_Fill2     0.846498  
1  CURE STANDBY POSITION Z Collect Result_Fill2     0.846498  


In [30]:
# 드랍할 열 목록
columns_to_drop = [
    "head_normal_vector_stage2_Fill2",
    "head_normal_vector_stage3_Fill2",
]

# 열 삭제
train_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')
test_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

In [31]:
### fill2 데이터셋
# 열 이름 필터링 후 공통 변수와 결합
Process_Desc_col = train_data.filter(like='_Fill2').columns

# train
final_columns_train = list(Process_Desc_col) + same_variables_train
df_train_fill2 = train_data[final_columns_train]

# test 
final_columns_test = list(Process_Desc_col) + same_variables_test
df_test_fill2 = test_data[final_columns_test]

In [32]:
# 변수 리스트
categories = [
    'CURE STANDBY POSITION Z Collect Result_Fill2',
    'Head Purge Position Z Collect Result_Fill2',
    'cure_end_position_XZ_Fill2',
    'cure_start_position_XZ_Fill2'
]

# 범주형으로 변환
df_train_fill2[categories] = df_train_fill2[categories].astype(str).astype('category')
df_test_fill2[categories] = df_test_fill2[categories].astype(str).astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_fill2[categories] = df_train_fill2[categories].astype(str).astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_fill2[categories] = df_test_fill2[categories].astype(str).astype('category')


In [33]:
df_train_fill2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 12 columns):
 #   Column                                        Non-Null Count  Dtype   
---  ------                                        --------------  -----   
 0   CURE SPEED Collect Result_Fill2               40506 non-null  int64   
 1   CURE STANDBY POSITION Z Collect Result_Fill2  40506 non-null  category
 2   Head Purge Position Z Collect Result_Fill2    40506 non-null  category
 3   head_normal_vector_stage1_Fill2               40506 non-null  float64 
 4   cure_end_position_XZ_Fill2                    40506 non-null  category
 5   cure_start_position_XZ_Fill2                  40506 non-null  category
 6   time_ratio_Fill2                              40506 non-null  float64 
 7   target                                        40506 non-null  object  
 8   Equipment_same_num                            40506 non-null  category
 9   Production_Qty                                4050

In [34]:
df_test_fill2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 13 columns):
 #   Column                                        Non-Null Count  Dtype   
---  ------                                        --------------  -----   
 0   CURE SPEED Collect Result_Fill2               17361 non-null  int64   
 1   CURE STANDBY POSITION Z Collect Result_Fill2  17361 non-null  category
 2   Head Purge Position Z Collect Result_Fill2    17361 non-null  category
 3   head_normal_vector_stage1_Fill2               17361 non-null  float64 
 4   cure_end_position_XZ_Fill2                    17361 non-null  category
 5   cure_start_position_XZ_Fill2                  17361 non-null  category
 6   time_ratio_Fill2                              17361 non-null  float64 
 7   Set ID                                        17361 non-null  object  
 8   target                                        0 non-null      float64 
 9   Equipment_same_num                            1736

### autoclave

In [35]:
# 열 이름 필터링
Process_Desc_col = train_data.filter(like='_AutoClave').columns

# 필터링된 열 이름 출력
print("<AutoClave 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<AutoClave 공정 관련 변수>
Chamber Temp. Collect Result_AutoClave
avg_pressure_time_AutoClave
time_ratio_AutoClave


In [36]:
variables = [
    "Chamber Temp. Collect Result_AutoClave",
    "avg_pressure_time_AutoClave",
    "time_ratio_AutoClave",
]

# 변수들로만 이루어진 DataFrame 생성
filtered_data = train_data[variables]

In [37]:
# 상관계수 행렬 계산
correlation_matrix = filtered_data.corr()

# 자기자신을 제외하고 특정 값 이상인 조합 찾기
strong_correlations = correlation_matrix[(correlation_matrix >= 0.8) & (correlation_matrix != 1)]

# 리스트로 변환
strong_correlations_pairs = strong_correlations.stack().reset_index()
strong_correlations_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# 결과 출력
strong_correlations_pairs = strong_correlations_pairs[strong_correlations_pairs['Correlation'] >= 0.7]
print(strong_correlations_pairs)

Empty DataFrame
Columns: [Variable 1, Variable 2, Correlation]
Index: []


In [38]:
### autoclave 데이터셋
# 열 이름 필터링 후 공통 변수와 결합
Process_Desc_col = train_data.filter(like='_AutoClave').columns

# train
final_columns_train = list(Process_Desc_col) + same_variables_train
df_train_autoclave = train_data[final_columns_train]

# test 
final_columns_test = list(Process_Desc_col) + same_variables_test
df_test_autoclave = test_data[final_columns_test]

In [39]:
df_train_autoclave.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 8 columns):
 #   Column                                  Non-Null Count  Dtype   
---  ------                                  --------------  -----   
 0   Chamber Temp. Collect Result_AutoClave  40506 non-null  int64   
 1   avg_pressure_time_AutoClave             40506 non-null  float64 
 2   time_ratio_AutoClave                    40506 non-null  float64 
 3   target                                  40506 non-null  object  
 4   Equipment_same_num                      40506 non-null  category
 5   Production_Qty                          40506 non-null  category
 6   model_receip_encoded                    40506 non-null  category
 7   workorder_receip_encoded                40506 non-null  category
dtypes: category(4), float64(2), int64(1), object(1)
memory usage: 1.4+ MB


In [40]:
df_test_autoclave.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 9 columns):
 #   Column                                  Non-Null Count  Dtype   
---  ------                                  --------------  -----   
 0   Chamber Temp. Collect Result_AutoClave  17361 non-null  int64   
 1   avg_pressure_time_AutoClave             17361 non-null  float64 
 2   time_ratio_AutoClave                    17361 non-null  float64 
 3   Set ID                                  17361 non-null  object  
 4   target                                  0 non-null      float64 
 5   Equipment_same_num                      17361 non-null  category
 6   Production_Qty                          17361 non-null  category
 7   model_receip_encoded                    17361 non-null  category
 8   workorder_receip_encoded                17361 non-null  category
dtypes: category(4), float64(3), int64(1), object(1)
memory usage: 748.3+ KB


### modeling

In [41]:
#pip install catboost

In [102]:
# category 타입의 변수를 찾아 cat_features 리스트를 반환하는 함수
def get_cat_features(df):
    """
    Parameters:
    df (pd.DataFrame): 입력 DataFrame
    
    Returns:
    list: category 타입의 변수 이름 리스트
    """
    cat_features = df.select_dtypes(include=['category']).columns.tolist()
    return cat_features

In [103]:
from catboost import CatBoostClassifier

In [104]:
### 모델 정의
class_weights = {'Normal': 1, 'AbNormal': 6}  # 클래스의 비율에 따라 가중치 설정

model_Dam = CatBoostClassifier(iterations=1000, learning_rate=0.1, class_weights=class_weights, verbose = False)
model_Fill1 = CatBoostClassifier(iterations=1000, learning_rate=0.1, class_weights=class_weights, verbose = False)
model_Fill2 = CatBoostClassifier(iterations=1000, learning_rate=0.1, class_weights=class_weights, verbose = False)
model_AutoClave = CatBoostClassifier(iterations=1000, learning_rate=0.1, class_weights=class_weights, verbose = False)

In [105]:
# 제출 데이터 읽어오기
df_sub = pd.read_csv("submission.csv")

##### dam

In [106]:
### 모델 학습
# 타깃값 분리
y_train = df_train_dam['target']
x_train = df_train_dam.drop(columns='target')

# 범주형 변수 리스트 생성
cat_features_list = get_cat_features(df_train_dam)

# 랜덤 포레스트 학습
model_Dam.fit(x_train, y_train, cat_features=cat_features_list)

<catboost.core.CatBoostClassifier at 0x166fb9710>

In [107]:
### 모델 예측
# 'Set ID'와 'target' 컬럼 분리
set_id = df_test_dam['Set ID']  
target = df_test_dam['target']   

x_test = df_test_dam.drop(columns=['Set ID', 'target'])

test_pred = model_Dam.predict(x_test)
test_pred

array(['Normal', 'Normal', 'Normal', ..., 'Normal', 'Normal', 'Normal'],
      dtype=object)

In [108]:
### 예측 컬럼 생성
df_sub["target_dam"] = test_pred

In [109]:
df_sub["target_dam"].value_counts()

target_dam
Normal      17048
AbNormal      313
Name: count, dtype: int64

##### fill1

In [110]:
### 모델 학습
# 타깃값 분리
y_train = df_train_fill1['target']
x_train = df_train_fill1.drop(columns='target')

# 범주형 변수 리스트 생성
cat_features_list = get_cat_features(df_train_fill1)

# 랜덤 포레스트 학습
model_Fill1.fit(x_train, y_train, cat_features=cat_features_list)

<catboost.core.CatBoostClassifier at 0x165708890>

In [111]:
### 모델 예측
# 'Set ID'와 'target' 컬럼 분리
set_id = df_test_fill1['Set ID']  
target = df_test_fill1['target']   

x_test = df_test_fill1.drop(columns=['Set ID', 'target'])

test_pred = model_Fill1.predict(x_test)
test_pred

array(['Normal', 'Normal', 'Normal', ..., 'Normal', 'AbNormal', 'Normal'],
      dtype=object)

In [112]:
### 예측 컬럼 생성
df_sub["target_fill1"] = test_pred

In [113]:
df_sub["target_fill1"].value_counts()

target_fill1
Normal      16916
AbNormal      445
Name: count, dtype: int64

##### fill2

In [114]:
### 모델 학습
# 타깃값 분리
y_train = df_train_fill2['target']
x_train = df_train_fill2.drop(columns='target')

# 범주형 변수 리스트 생성
cat_features_list = get_cat_features(df_train_fill2)

# 랜덤 포레스트 학습
model_Fill2.fit(x_train, y_train, cat_features=cat_features_list)

<catboost.core.CatBoostClassifier at 0x1631d8f50>

In [115]:
### 모델 예측
# 'Set ID'와 'target' 컬럼 분리
set_id = df_test_fill2['Set ID']  
target = df_test_fill2['target']   

x_test = df_test_fill2.drop(columns=['Set ID', 'target'])

test_pred = model_Fill2.predict(x_test)
test_pred

array(['Normal', 'Normal', 'Normal', ..., 'Normal', 'Normal', 'Normal'],
      dtype=object)

In [116]:
### 예측 컬럼 생성
df_sub["target_fill2"] = test_pred

In [117]:
df_sub["target_fill2"].value_counts()

target_fill2
Normal      17005
AbNormal      356
Name: count, dtype: int64

##### autoclave

In [118]:
### 모델 학습
# 타깃값 분리
y_train = df_train_autoclave['target']
x_train = df_train_autoclave.drop(columns='target')

# 범주형 변수 리스트 생성
cat_features_list = get_cat_features(df_train_autoclave)

# 랜덤 포레스트 학습
model_AutoClave.fit(x_train, y_train, cat_features=cat_features_list)

<catboost.core.CatBoostClassifier at 0x16365c050>

In [119]:
### 모델 예측
# 'Set ID'와 'target' 컬럼 분리
set_id = df_test_autoclave['Set ID']  
target = df_test_autoclave['target']   

x_test = df_test_autoclave.drop(columns=['Set ID', 'target'])

test_pred = model_AutoClave.predict(x_test)
test_pred

array(['Normal', 'Normal', 'Normal', ..., 'Normal', 'Normal', 'Normal'],
      dtype=object)

In [120]:
### 예측 컬럼 생성
df_sub["target_autoclave"] = test_pred

In [121]:
df_sub["target_autoclave"].value_counts()

target_autoclave
Normal      16863
AbNormal      498
Name: count, dtype: int64

##### result

In [124]:
### abnormal 입력
df_sub['target'] = df_sub[['target_dam', 'target_fill1', 'target_fill2', 'target_autoclave']].apply(lambda row: 'AbNormal' 
                                                                                                    if (row == 'AbNormal').sum() >= 1
                                                                                                    else 'Normal', axis=1)

In [125]:
df_sub['target'].value_counts()

target
Normal      16231
AbNormal     1130
Name: count, dtype: int64

In [126]:
# 삭제할 열 목록
columns_to_drop = ['target_dam', 'target_fill1', 'target_fill2', 'target_autoclave']

# 열 삭제
df_sub.drop(columns=columns_to_drop, inplace=True, errors='ignore')
df_sub.drop(columns=columns_to_drop, inplace=True, errors='ignore')

In [127]:
# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

In [128]:
# 제출 파일 저장
df_sub.to_csv("0814_catboost_sep_submission4.csv", index=False)

In [129]:
# auto_class_weights 파라미터 사용 X
'''
send files ['code.ipynb', 'submission.csv'] for grade...
waiting result...
done!

Score: None
Duration: 2.511 seconds
=== Message ===
작성하신 답안 제출이 완료되었습니다.
Public Score : 0.1354723707664884
'''

"\nsend files ['code.ipynb', 'submission.csv'] for grade...\nwaiting result...\ndone!\n\nScore: None\nDuration: 2.511 seconds\n=== Message ===\n작성하신 답안 제출이 완료되었습니다.\nPublic Score : 0.1354723707664884\n"

In [130]:
# auto_class_weights 파라미터 사용
# 0814_catboost_sep_submission2.csv
'''
send files ['code.ipynb', 'submission.csv'] for grade...
waiting result...
done!

Score: None
Duration: 1.941 seconds
=== Message ===
작성하신 답안 제출이 완료되었습니다.
Public Score : 0.17589576547231273
'''

"\nsend files ['code.ipynb', 'submission.csv'] for grade...\nwaiting result...\ndone!\n\nScore: None\nDuration: 1.941 seconds\n=== Message ===\n작성하신 답안 제출이 완료되었습니다.\nPublic Score : 0.17589576547231273\n"

In [132]:
# class_weight 파라미터 지정 사용 1:8
# 0814_catboost_sep_submission3.csv
'''
send files ['code.ipynb', 'submission.csv'] for grade...
waiting result...
done!

Score: None
Duration: 2.780 seconds
=== Message ===
작성하신 답안 제출이 완료되었습니다.
Public Score : 0.1821608040201005
'''

"\nsend files ['code.ipynb', 'submission.csv'] for grade...\nwaiting result...\ndone!\n\nScore: None\nDuration: 2.780 seconds\n=== Message ===\n작성하신 답안 제출이 완료되었습니다.\nPublic Score : 0.1821608040201005\n"

In [133]:
# class_weight 파라미터 지정 사용 1:6
# 0814_catboost_sep_submission4.csv
'''
send files ['code.ipynb', 'submission.csv'] for grade...
waiting result...
done!

Score: None
Duration: 2.920 seconds
=== Message ===
작성하신 답안 제출이 완료되었습니다.
Public Score : 0.17624521072796934
'''

"\nsend files ['code.ipynb', 'submission.csv'] for grade...\nwaiting result...\ndone!\n\nScore: None\nDuration: 2.920 seconds\n=== Message ===\n작성하신 답안 제출이 완료되었습니다.\nPublic Score : 0.17624521072796934\n"