# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [212]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os
from pprint import pprint

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

### 데이터 읽어오기


In [213]:
RANDOM_STATE = 110

train_data = pd.read_csv("train_data (3).csv")
test_data = pd.read_csv("test_data.csv")

In [214]:
abnormal_df = train_data[train_data['target'] == 'AbNormal']
normal_df = train_data[train_data['target'] == 'Normal']

# 'target'이 'AbNormal'인 데이터 추출 및 오버샘플링
abnormal_df_over = abnormal_df.sample(len(normal_df), replace=True, random_state=42)

# 오버샘플링된 데이터프레임 결합
train_data = pd.concat([abnormal_df_over, normal_df])

# 결과 확인
print("오버샘플링된 데이터프레임:")
print(train_data['target'].value_counts())


오버샘플링된 데이터프레임:
target
AbNormal    38156
Normal      38156
Name: count, dtype: int64


In [215]:
train_data = train_data.reset_index(drop=True)

기본 전처리 할것들

In [216]:
# 'Workorder_AutoClave' 열에서 '-' 다음 숫자 값 추출 및 '000' 제거
train_data['Workorder'] = train_data['Workorder'].str.replace(r'-(\d+)', lambda x: '-' + x.group(1).lstrip('0'), regex=True)
test_data['Workorder'] = test_data['Workorder'].str.replace(r'-(\d+)', lambda x: '-' + x.group(1).lstrip('0'), regex=True)

In [217]:
# Dispenser_num 값에 따라 새로운 변수 생성
train_data['Dispenser_1'] = train_data['Dispenser_num'].apply(lambda x: 1 if x == '#1' else 0)
train_data['Dispenser_2'] = train_data['Dispenser_num'].apply(lambda x: 1 if x == '#2' else 0)

test_data['Dispenser_1'] = test_data['Dispenser_num'].apply(lambda x: 1 if x == '#1' else 0)
test_data['Dispenser_2'] = test_data['Dispenser_num'].apply(lambda x: 1 if x == '#2' else 0)

# 불필요한 변수 제거
train_data.drop(['Dispenser_num'], axis=1, inplace=True)
test_data.drop(['Dispenser_num'], axis=1, inplace=True)

In [218]:
# WorkMode Collect Result_Dam의 이름을 WorkMode Collect Result로 변경
train_data = train_data.rename(columns={'WorkMode Collect Result_Dam': 'WorkMode Collect Result'})
test_data = test_data.rename(columns={'WorkMode Collect Result_Dam': 'WorkMode Collect Result'})

# WorkMode Collect Result_Fill1, WorkMode Collect Result_Fill2 열 드롭
train_data = train_data.drop(columns=['WorkMode Collect Result_Fill1', 'WorkMode Collect Result_Fill2'])
test_data = test_data.drop(columns=['WorkMode Collect Result_Fill1', 'WorkMode Collect Result_Fill2'])

In [219]:
# WorkMode Collect Result 열의 값이 7인 행을 1로 변경
train_data['WorkMode Collect Result'] = train_data['WorkMode Collect Result'].replace(7, 1)
test_data['WorkMode Collect Result'] = test_data['WorkMode Collect Result'].replace(7, 1)

# WorkMode Collect Result 열의 결측값을 0으로 채움
train_data['WorkMode Collect Result'] = train_data['WorkMode Collect Result'].fillna(0)
test_data['WorkMode Collect Result'] = test_data['WorkMode Collect Result'].fillna(0)

In [220]:
# 세 변수의 값이 동일하면 해당 값을 가져가고, 하나라도 일치하지 않으면 0의 값을 가지는 파생 변수 생성 함수
def create_receip_no_collect_result(df):
    df['Receip_No_Collect_Result'] = df.apply(
        lambda row: row['Receip No Collect Result_Dam'] 
                    if (row['Receip No Collect Result_Dam'] == row['Receip No Collect Result_Fill1'] == row['Receip No Collect Result_Fill2']) 
                    else 0, 
        axis=1
    )

# 함수 적용
create_receip_no_collect_result(train_data)
create_receip_no_collect_result(test_data)

In [221]:
# 제거할 변수 목록
columns_to_drop = [
    'Receip No Collect Result_Dam',
    'Receip No Collect Result_Fill1',
    'Receip No Collect Result_Fill2'
]

# 변수 제거
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

In [222]:
# 세 변수의 값이 동일하면 해당 값을 가져가고, 하나라도 일치하지 않으면 0의 값을 가지는 파생 변수 생성 함수
def create_palletid_collect_result(df):
    df['PalletID_Collect_Result'] = df.apply(
        lambda row: row['PalletID Collect Result_Dam'] 
                    if (row['PalletID Collect Result_Dam'] == row['PalletID Collect Result_Fill1'] == row['PalletID Collect Result_Fill2']) 
                    else 0, 
        axis=1
    )

# 함수 적용
create_palletid_collect_result(train_data)
create_palletid_collect_result(test_data)

In [223]:
# 제거할 변수 목록
columns_to_drop = [
    'PalletID Collect Result_Dam',
    'PalletID Collect Result_Fill1',
    'PalletID Collect Result_Fill2'
]

# 변수 제거
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

In [224]:
# 세 변수의 값이 동일하면 해당 값을 가져가고, 하나라도 일치하지 않으면 0의 값을 가지는 파생 변수 생성 함수
def create_palletid_collect_result(df):
    df['Production_Qty_Collect_Result'] = df.apply(
        lambda row: row['Production Qty Collect Result_Dam'] 
                    if (row['Production Qty Collect Result_Dam'] == row['Production Qty Collect Result_Fill1'] == row['Production Qty Collect Result_Fill2']) 
                    else 0, 
        axis=1
    )

# 함수 적용
create_palletid_collect_result(train_data)
create_palletid_collect_result(test_data)

In [225]:
# 제거할 변수 목록
columns_to_drop = [
    'Production Qty Collect Result_Dam',
    'Production Qty Collect Result_Fill1',
    'Production Qty Collect Result_Fill2'
]

# 변수 제거
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

In [226]:
# "Chamber Temp. Judge Value_AutoClave" 변수의 값을 기준으로 파생 변수 생성 함수
def create_judge_value_binary(df):
    df['Chamber_Temp_OKNG_AutoClave'] = df['Chamber Temp. Judge Value_AutoClave'].apply(
        lambda x: 1 if x == 'OK' else 0
    )

# 함수 적용
create_judge_value_binary(train_data)
create_judge_value_binary(test_data)

In [227]:
# 'Judge Value'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='Judge Value').columns

print("\n Judge Value 포함 변수>")
for col in Process_Desc_col:
    print(col)


 Judge Value 포함 변수>
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam
Chamber Temp. Judge Value_AutoClave
GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2


In [228]:
# 5개의 변수 목록
judge_value_columns = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam'
    , 'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave'
    , 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1'
    , 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2'
    , 'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'
]

# 파생 변수 생성 함수
def create_judge_value_feature(df):
    df['Judge_Value_OK'] = df[judge_value_columns].apply(
        lambda row: 1 if any(row == 'OK') else 0, 
        axis=1
    )

# 함수 적용
create_judge_value_feature(train_data)
create_judge_value_feature(test_data)

In [229]:
# 제거할 변수 목록
columns_to_drop = [
    'Chamber Temp. Judge Value_AutoClave'
    , 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam'
    , 'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave'
    , 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1'
    , 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2'
    , 'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'
]

# 변수 제거
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

In [230]:
# 변수명 변경
train_data = train_data.rename(columns={'1st Pressure 1st Pressure Unit Time_AutoClave': '1st Pressure Unit Time_AutoClave'})
test_data = test_data.rename(columns={'1st Pressure 1st Pressure Unit Time_AutoClave': '1st Pressure Unit Time_AutoClave'})

In [231]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76312 entries, 0 to 76311
Columns: 108 entries, Model.Suffix to Judge_Value_OK
dtypes: float64(56), int64(49), object(3)
memory usage: 62.9+ MB


In [232]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Columns: 109 entries, Set ID to Judge_Value_OK
dtypes: float64(86), int64(20), object(3)
memory usage: 14.4+ MB


In [233]:
for col in train_data.columns :
    cnt = train_data[col].value_counts()
    print(f"{cnt}\n")

Model.Suffix
AJX75334501    63014
AJX75334502     6974
AJX75334505     4595
AJX75334503      797
AJX75334507      586
AJX75334506      250
AJX75334508       96
Name: count, dtype: int64

Workorder
3HPXX066-1    707
3G1XC632-1    539
3M1XH425-1    501
3I1XA992-1    479
3G1X8293-1    459
             ... 
4BPM0084-1      2
3FPXX064-3      2
4CPM0166-1      1
3H1XB714-1      1
3HPM0061-1      1
Name: count, Length: 663, dtype: int64

CURE END POSITION X Collect Result_Dam
240     47675
1000    28637
Name: count, dtype: int64

CURE END POSITION Z Collect Result_Dam
2.5     47675
12.5    28637
Name: count, dtype: int64

CURE END POSITION Θ Collect Result_Dam
-90    47675
 90    28637
Name: count, dtype: int64

CURE SPEED Collect Result_Dam
70     65751
85      4138
100     3414
105     2796
95       213
Name: count, dtype: int64

CURE START POSITION X Collect Result_Dam
1030    47675
280     28637
Name: count, dtype: int64

CURE START POSITION Θ Collect Result_Dam
-90    47675
 90    28637


---

반복적으로 쓰는 툴 함수화

In [234]:
def plot_box(df, col_name):
    """
    주어진 데이터프레임과 열 이름에 대해 박스 플롯을 그리는 함수.

    Parameters:
    dataframe (pd.DataFrame): 데이터프레임
    column_name (str): 열 이름
    """
    plt.figure(figsize=(7, 4))
    plt.boxplot(df[col_name], vert=False)
    plt.xlabel(col_name)
    plt.title(f'Box Plot of {col_name}')
    plt.show()

In [235]:
def value_counts_ratio(df, col_name, target_name='target'):
    """
    주어진 데이터프레임의 특정 열에 대해 각 값마다 타겟 변수의 비율과 갯수, 총 갯수를 출력하는 함수.

    Parameters:
    df (pd.DataFrame): 데이터프레임
    col_name (str): 열 이름
    target_name (str): 타겟 변수 이름
    """
    # 각 값마다 타겟 변수의 비율 계산
    value_counts = df.groupby(col_name)[target_name].value_counts(normalize=True).unstack().fillna(0)
    
    # 각 값마다 타겟 변수의 갯수 계산
    counts = df.groupby(col_name)[target_name].value_counts().unstack().fillna(0)
    
    # 각 값마다 총 갯수 계산
    total_counts = df[col_name].value_counts().rename('Total_Count')
    
    # 비율과 갯수를 합침
    result = value_counts.join(counts, lsuffix='_ratio', rsuffix='_count')
    
    # 총 갯수를 합침
    result = result.join(total_counts, on=col_name)
    
    # 출력 형식 조정
    result.index.name = 'variable'
    print(f"\n{col_name}별 {target_name} 비율 및 갯수\n")
    print(result.rename(columns=lambda x: x.split('_')[0]))

In [236]:
def summarize_group(df, group_by_columns):
    # 데이터프레임을 그룹화
    grouped_df = df.groupby(group_by_columns)
    
    # 결과를 저장할 리스트 초기화
    results = []
    
    # 그룹화된 데이터프레임의 내용을 확인하는 코드
    for name, group in grouped_df:
        # 그룹의 갯수 계산
        group_count = group.shape[0]
        
        # 'target' 변수의 'AdNormal' 비율과 갯수 계산
        adnormal_count = group['target'].value_counts().get('AbNormal', 0)
        adnormal_ratio = adnormal_count / group_count
        
        # 결과 리스트에 추가
        results.append([name, adnormal_count, adnormal_ratio, group_count])
    
    # 결과 리스트를 데이터프레임으로 변환
    results_df = pd.DataFrame(results, columns=['group', "'AdNormal' count", 'ratio', 'Total'])
    
    # 그룹화된 변수들의 이름을 제목행으로 출력
    print(f"Grouped by: {', '.join(group_by_columns)}")
    print()
    # 데이터프레임 출력
    print(results_df)

# 예시코드
# summarize_grouped_data(train_data, ['1st Pressure Collect Result_AutoClave', '1st Pressure Unit Time_AutoClave'])

In [237]:
def plot_ratio(df, group_by_column, target_column='target', abnormal_value='AbNormal'):
    # 데이터프레임을 그룹화
    grouped_df = df.groupby(group_by_column)
    
    # 결과를 저장할 리스트 초기화
    results = []
    
    # 그룹화된 데이터프레임의 내용을 확인하는 코드
    for name, group in grouped_df:
        # 그룹의 갯수 계산
        group_count = group.shape[0]
        
        # 'target' 변수의 'AbNormal' 비율과 갯수 계산
        abnormal_count = group[target_column].value_counts().get(abnormal_value, 0)
        abnormal_ratio = abnormal_count / group_count
        
        # 결과 리스트에 추가
        results.append([name, abnormal_count, abnormal_ratio, group_count])
    
    # 결과 리스트를 데이터프레임으로 변환
    results_df = pd.DataFrame(results, columns=['group', f"'{abnormal_value}' count", 'ratio', 'Total'])
    
    # 그래프 크기 설정
    plt.figure(figsize=(10, 5))
    
    # 막대 그래프 생성
    ax = results_df.plot(kind='bar', x='group', y='ratio', legend=False)
    
    # 각 막대 위에 AbNormal 갯수와 총 갯수 표시
    for i, (abnormal_count, total) in enumerate(zip(results_df[f"'{abnormal_value}' count"], results_df['Total'])):
        ax.text(i, results_df['ratio'][i], f'{abnormal_count} ({total})', ha='center', va='bottom', fontsize=8)
    
     # 그래프 제목 및 축 레이블 설정
    ax.set_title(f'{abnormal_value} Ratio by {group_by_column}')
    ax.set_xlabel(group_by_column)
    ax.set_ylabel(f'{abnormal_value} Ratio')
   
    # 그래프 출력
    plt.show()

In [238]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_ratio_boxplot(data, time_ratio_column, target_column='target'):
    # 그래프 스타일 설정
    sns.set(style="whitegrid")

    # 그래프 그리기
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=time_ratio_column, y=target_column, data=data)

    # 그래프 제목 및 레이블 설정
    plt.title(f'{time_ratio_column} vs {target_column}')
    plt.xlabel(time_ratio_column)
    plt.ylabel(target_column)

    # 그래프 출력
    plt.show()

# 함수 호출 예제
#plot_time_ratio_vs_target(train_data, 'time_ratio_Dam')

---

In [239]:
train_data.columns.to_list()

['Model.Suffix',
 'Workorder',
 'CURE END POSITION X Collect Result_Dam',
 'CURE END POSITION Z Collect Result_Dam',
 'CURE END POSITION Θ Collect Result_Dam',
 'CURE SPEED Collect Result_Dam',
 'CURE START POSITION X Collect Result_Dam',
 'CURE START POSITION Θ Collect Result_Dam',
 'DISCHARGED SPEED OF RESIN Collect Result_Dam',
 'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam',
 'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam',
 'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
 'Dispense Volume(Stage1) Collect Result_Dam',
 'Dispense Volume(Stage2) Collect Result_Dam',
 'Dispense Volume(Stage3) Collect Result_Dam',
 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
 'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Resul

### 1. CURE

In [240]:
# 'CURE'와 '_Dam'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='CURE').columns

print("\n CURE 포함 변수>")
for col in Process_Desc_col:
    print(col)


 CURE 포함 변수>
CURE END POSITION X Collect Result_Dam
CURE END POSITION Z Collect Result_Dam
CURE END POSITION Θ Collect Result_Dam
CURE SPEED Collect Result_Dam
CURE START POSITION X Collect Result_Dam
CURE START POSITION Θ Collect Result_Dam
CURE END POSITION X Collect Result_Fill2
CURE END POSITION Z Collect Result_Fill2
CURE SPEED Collect Result_Fill2
CURE STANDBY POSITION Z Collect Result_Fill2
CURE START POSITION X Collect Result_Fill2
CURE START POSITION Z Collect Result_Fill2


In [241]:
summarize_group(train_data, [
'Dispenser_1'
, 'Dispenser_2'
# 'CURE END POSITION X Collect Result_Dam'
, 'CURE END POSITION Θ Collect Result_Dam'
# , 'CURE SPEED Collect Result_Dam'
# , 'CURE START POSITION X Collect Result_Dam'
, 'CURE START POSITION Θ Collect Result_Dam'
# , 'CURE END POSITION X Collect Result_Fill2'
# , 'CURE END POSITION Z Collect Result_Fill2'
# , 'CURE SPEED Collect Result_Fill2'
# , 'CURE STANDBY POSITION Z Collect Result_Fill2'
# , 'CURE START POSITION X Collect Result_Fill2'
# , 'CURE START POSITION Z Collect Result_Fill2'
])

Grouped by: Dispenser_1, Dispenser_2, CURE END POSITION Θ Collect Result_Dam, CURE START POSITION Θ Collect Result_Dam

              group  'AdNormal' count     ratio  Total
0  (0, 0, -90, -90)               306  1.000000    306
1    (0, 0, 90, 90)               235  1.000000    235
2    (0, 1, 90, 90)             13791  0.485564  28402
3  (1, 0, -90, -90)             23824  0.502945  47369


dispenser 종류에 따라 POSITION Θ 값이 따라감  
-> drop

In [242]:
# 제거할 변수 목록
columns_to_drop = [
    'CURE END POSITION Θ Collect Result_Dam'
    , 'CURE START POSITION Θ Collect Result_Dam'
]

# 변수 제거
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

In [243]:
# 'CURE'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='CURE').columns

print("\n CURE 포함 변수>")
for col in Process_Desc_col:
    print(col)


 CURE 포함 변수>
CURE END POSITION X Collect Result_Dam
CURE END POSITION Z Collect Result_Dam
CURE SPEED Collect Result_Dam
CURE START POSITION X Collect Result_Dam
CURE END POSITION X Collect Result_Fill2
CURE END POSITION Z Collect Result_Fill2
CURE SPEED Collect Result_Fill2
CURE STANDBY POSITION Z Collect Result_Fill2
CURE START POSITION X Collect Result_Fill2
CURE START POSITION Z Collect Result_Fill2


In [244]:
summarize_group(train_data, [
'Dispenser_1'
, 'Dispenser_2'
, 'CURE END POSITION X Collect Result_Dam'
, 'CURE END POSITION Z Collect Result_Dam'
# , 'CURE SPEED Collect Result_Dam'
, 'CURE START POSITION X Collect Result_Dam'
# , 'CURE END POSITION X Collect Result_Fill2'
# , 'CURE END POSITION Z Collect Result_Fill2'
# , 'CURE SPEED Collect Result_Fill2'
# , 'CURE STANDBY POSITION Z Collect Result_Fill2'
# , 'CURE START POSITION X Collect Result_Fill2'
# , 'CURE START POSITION Z Collect Result_Fill2'
])

Grouped by: Dispenser_1, Dispenser_2, CURE END POSITION X Collect Result_Dam, CURE END POSITION Z Collect Result_Dam, CURE START POSITION X Collect Result_Dam

                     group  'AdNormal' count     ratio  Total
0   (0, 0, 240, 2.5, 1030)               306  1.000000    306
1  (0, 0, 1000, 12.5, 280)               235  1.000000    235
2  (0, 1, 1000, 12.5, 280)             13791  0.485564  28402
3   (1, 0, 240, 2.5, 1030)             23824  0.502945  47369


좌표값을 통해 좌표간의 거리를 계산

In [245]:
# 시작 위치와 끝 위치 열 이름
start_x_col = 'CURE START POSITION X Collect Result_Dam'
start_z_col = 33.5
end_x_col = 'CURE END POSITION X Collect Result_Dam'
end_z_col = 'CURE END POSITION Z Collect Result_Dam'

# 시작 위치와 끝 위치 사이의 거리 계산
train_data['CURE_DISTANCE_Dam'] = np.sqrt(
    (train_data[end_x_col] - train_data[start_x_col]) ** 2 +
    (train_data[end_z_col] - start_z_col) ** 2
)

test_data['CURE_DISTANCE_Dam'] = np.sqrt(
    (train_data[end_x_col] - train_data[start_x_col]) ** 2 +
    (train_data[end_z_col] - start_z_col) ** 2
)

In [246]:
# 시작 위치와 끝 위치 열 이름
start_x_col = 'CURE START POSITION X Collect Result_Fill2'
start_z_col = 'CURE START POSITION Z Collect Result_Fill2'
end_x_col = 'CURE END POSITION X Collect Result_Fill2'
end_z_col = 'CURE END POSITION Z Collect Result_Fill2'

# 시작 위치와 끝 위치 사이의 거리 계산
train_data['CURE_DISTANCE_Fill2'] = np.sqrt(
    (train_data[end_x_col] - train_data[start_x_col]) ** 2 +
    (train_data[end_z_col] - train_data[start_z_col]) ** 2
)

test_data['CURE_DISTANCE_Fill2'] = np.sqrt(
    (train_data[end_x_col] - train_data[start_x_col]) ** 2 +
    (train_data[end_z_col] - train_data[start_z_col]) ** 2
)

In [247]:
# 제거할 변수 목록
columns_to_drop = [
    'CURE START POSITION X Collect Result_Dam'
    , 'CURE END POSITION X Collect Result_Dam'
    , 'CURE END POSITION Z Collect Result_Dam'

    , 'CURE START POSITION X Collect Result_Fill2'
    , 'CURE START POSITION Z Collect Result_Fill2'
    , 'CURE END POSITION X Collect Result_Fill2'
    , 'CURE END POSITION Z Collect Result_Fill2'
]

# 변수 제거
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

In [248]:
# 'CURE'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='CURE').columns

print("\n CURE 포함 변수>")
for col in Process_Desc_col:
    print(col)


 CURE 포함 변수>
CURE SPEED Collect Result_Dam
CURE SPEED Collect Result_Fill2
CURE STANDBY POSITION Z Collect Result_Fill2
CURE_DISTANCE_Dam
CURE_DISTANCE_Fill2


In [249]:
summarize_group(train_data, [
# 'Dispenser_1'
# , 'Dispenser_2'
# , 'CURE SPEED Collect Result_Dam'
# , 'CURE SPEED Collect Result_Fill2'
 'CURE STANDBY POSITION Z Collect Result_Fill2'
# , 'CURE_DISTANCE_Dam'
# , 'CURE_DISTANCE_Fill2'
])

Grouped by: CURE STANDBY POSITION Z Collect Result_Fill2

   group  'AdNormal' count     ratio  Total
0  (22,)               551  0.558824    986
1  (23,)               328  0.643137    510
2  (32,)              6914  0.606704  11396
3  (33,)             30363  0.478761  63420


'CURE STANDBY POSITION Z Collect Result_Fill2' 변수의 유의미함을 찾을수 x  
다른 변수와 연결된만한것도 찾지 못함 -> drop

In [250]:
# 제거할 변수 목록
columns_to_drop = ['CURE STANDBY POSITION Z Collect Result_Fill2']

# 변수 제거
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

In [251]:
# 'CURE'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='CURE').columns

print("\n CURE 포함 변수>")
for col in Process_Desc_col:
    print(col)


 CURE 포함 변수>
CURE SPEED Collect Result_Dam
CURE SPEED Collect Result_Fill2
CURE_DISTANCE_Dam
CURE_DISTANCE_Fill2


In [252]:
summarize_group(train_data, [
'Dispenser_1'
, 'Dispenser_2'
# , 'CURE SPEED Collect Result_Dam'
# , 'CURE SPEED Collect Result_Fill2'
, 'CURE_DISTANCE_Dam'
, 'CURE_DISTANCE_Fill2'
])

Grouped by: Dispenser_1, Dispenser_2, CURE_DISTANCE_Dam, CURE_DISTANCE_Fill2

                                          group  'AdNormal' count     ratio  \
0              (0, 0, 720.3061848963953, 780.0)               235  1.000000   
1               (0, 0, 790.607993888248, 780.0)               306  1.000000   
2              (0, 1, 720.3061848963953, 780.0)             13440  0.484377   
3  (0, 1, 720.3061848963953, 780.0006410253776)               351  0.536697   
4  (0, 1, 720.3061848963953, 780.0775602464155)                 0  0.000000   
5               (1, 0, 790.607993888248, 780.0)             18413  0.484158   
6   (1, 0, 790.607993888248, 780.0006410253776)              4618  0.575954   
7   (1, 0, 790.607993888248, 780.0640999302557)               793  0.600758   

   Total  
0    235  
1    306  
2  27747  
3    654  
4      1  
5  38031  
6   8018  
7   1320  


거리의 차이에 따라 ratio 값 변화 크지 x

In [253]:
# 거리 / 속도 -> 시간 파생 변수 생성
train_data['CURE_Time_Dam']  = train_data['CURE_DISTANCE_Dam'] / train_data['CURE SPEED Collect Result_Dam']
test_data['CURE_Time_Dam']  = test_data['CURE_DISTANCE_Dam'] / test_data['CURE SPEED Collect Result_Dam']

train_data['CURE_Time_Fill2']  = train_data['CURE_DISTANCE_Fill2'] / train_data['CURE SPEED Collect Result_Fill2']
test_data['CURE_Time_Fill2']  = test_data['CURE_DISTANCE_Fill2'] / test_data['CURE SPEED Collect Result_Fill2']

In [254]:
# 'CURE'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='CURE').columns

print("\n CURE 포함 변수>")
for col in Process_Desc_col:
    print(col)


 CURE 포함 변수>
CURE SPEED Collect Result_Dam
CURE SPEED Collect Result_Fill2
CURE_DISTANCE_Dam
CURE_DISTANCE_Fill2
CURE_Time_Dam
CURE_Time_Fill2


In [255]:
# 제거할 변수 목록
columns_to_drop = [
    'CURE_DISTANCE_Dam'
    , 'CURE SPEED Collect Result_Dam'
    , 'CURE_DISTANCE_Fill2'
    , 'CURE SPEED Collect Result_Fill2'
]

# 변수 제거
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

In [256]:
# 'CURE'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='CURE').columns

print("\n CURE 포함 변수>")
for col in Process_Desc_col:
    print(col)


 CURE 포함 변수>
CURE_Time_Dam
CURE_Time_Fill2


### 2. HEAD NORMAL COORDINATE

In [257]:
# 'HEAD NORMAL COORDINATE'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='HEAD NORMAL COORDINATE').columns

print("\n HEAD NORMAL COORDINATE 포함 변수>")
for col in Process_Desc_col:
    print(col)


 HEAD NORMAL COORDINATE 포함 변수>
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam
HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam
HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam
HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam
HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam
HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam
HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam
HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam
HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1
HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1
HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1
HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1
HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1
HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1
HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1
HEAD NORMAL COORDINATE Z AXIS(Stage2) Coll

In [258]:
# 각 스테이지의 좌표 열 정의
stage1_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
               'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
               'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam']

stage2_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
               'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
               'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam']

stage3_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
               'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
               'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam']

# 거리 계산 함수
def calculate_distances(data):
    data['HEAD NORMAL DISTANCE_STAGE1_STAGE2_Dam'] = np.sqrt(
        (data[stage2_cols[0]] - data[stage1_cols[0]]) ** 2 +
        (data[stage2_cols[1]] - data[stage1_cols[1]]) ** 2 +
        (data[stage2_cols[2]] - data[stage1_cols[2]]) ** 2
    )

    data['HEAD NORMAL DISTANCE_STAGE2_STAGE3_Dam'] = np.sqrt(
        (data[stage3_cols[0]] - data[stage2_cols[0]]) ** 2 +
        (data[stage3_cols[1]] - data[stage2_cols[1]]) ** 2 +
        (data[stage3_cols[2]] - data[stage2_cols[2]]) ** 2
    )

    data['HEAD NORMAL DISTANCE_STAGE1_STAGE3_Dam'] = np.sqrt(
        (data[stage3_cols[0]] - data[stage1_cols[0]]) ** 2 +
        (data[stage3_cols[1]] - data[stage1_cols[1]]) ** 2 +
        (data[stage3_cols[2]] - data[stage1_cols[2]]) ** 2
    )

    return data

# train_data에 적용
train_data = calculate_distances(train_data)

# test_data에 적용
test_data = calculate_distances(test_data)

In [259]:
# 필요한 열 이름
stage1_stage2_col = 'HEAD NORMAL DISTANCE_STAGE1_STAGE2_Dam'
stage2_stage3_col = 'HEAD NORMAL DISTANCE_STAGE2_STAGE3_Dam'
stage1_stage3_col = 'HEAD NORMAL DISTANCE_STAGE1_STAGE3_Dam'

# 삼각형의 넓이와 높이를 계산하는 함수
def calculate_triangle_features(data):
    a = data[stage1_stage2_col]
    b = data[stage2_stage3_col]
    c = data[stage1_stage3_col]

    # 헤론의 공식에 따른 삼각형의 넓이 계산
    s = (a + b + c) / 2
    area = np.sqrt(s * (s - a) * (s - b) * (s - c))

    # 높이 계산 (밑변을 c로 가정)
    height = (2 * area) / c

    # 결과를 새로운 열에 저장
    # data['HEAD NORMAL DISTANCE_TRIANGLE_area_Dam'] = area
    data['HEAD NORMAL DISTANCE_TRIANGLE_height_Dam'] = height

    return data

# train_data에 적용
train_data = calculate_triangle_features(train_data)

# test_data에 적용
test_data = calculate_triangle_features(test_data)

In [260]:
# 제거할 변수 목록
columns_to_drop = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'
    , 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'
    , 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam'

    , 'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'
    , 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'
    , 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'

    , 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'
    , 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam'
    , 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam'
]

# 변수 제거
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

In [261]:
# 각 스테이지의 좌표 열 정의
stage1_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
               'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
               'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1']

stage2_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
               'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
               'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1']

stage3_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
               'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
               'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1']

# 거리 계산 함수
def calculate_distances(data):
    data['HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill1'] = np.sqrt(
        (data[stage2_cols[0]] - data[stage1_cols[0]]) ** 2 +
        (data[stage2_cols[1]] - data[stage1_cols[1]]) ** 2 +
        (data[stage2_cols[2]] - data[stage1_cols[2]]) ** 2
    )

    data['HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill1'] = np.sqrt(
        (data[stage3_cols[0]] - data[stage2_cols[0]]) ** 2 +
        (data[stage3_cols[1]] - data[stage2_cols[1]]) ** 2 +
        (data[stage3_cols[2]] - data[stage2_cols[2]]) ** 2
    )

    data['HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill1'] = np.sqrt(
        (data[stage3_cols[0]] - data[stage1_cols[0]]) ** 2 +
        (data[stage3_cols[1]] - data[stage1_cols[1]]) ** 2 +
        (data[stage3_cols[2]] - data[stage1_cols[2]]) ** 2
    )

    return data

# train_data에 적용
train_data = calculate_distances(train_data)

# test_data에 적용
test_data = calculate_distances(test_data)

In [262]:
# 필요한 열 이름
stage1_stage2_col = 'HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill1'
stage2_stage3_col = 'HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill1'
stage1_stage3_col = 'HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill1'

# 삼각형의 넓이와 높이를 계산하는 함수
def calculate_triangle_features(data):
    a = data[stage1_stage2_col]
    b = data[stage2_stage3_col]
    c = data[stage1_stage3_col]

    # 헤론의 공식에 따른 삼각형의 넓이 계산
    s = (a + b + c) / 2
    area = np.sqrt(s * (s - a) * (s - b) * (s - c))

    # 높이 계산 (밑변을 c로 가정)
    height = (2 * area) / c

    # 결과를 새로운 열에 저장
    # data['HEAD NORMAL DISTANCE_TRIANGLE_area_Fill1'] = area
    data['HEAD NORMAL DISTANCE_TRIANGLE_height_Fill1'] = height

    return data

# train_data에 적용
train_data = calculate_triangle_features(train_data)

# test_data에 적용
test_data = calculate_triangle_features(test_data)

In [263]:
# 제거할 변수 목록
columns_to_drop = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'
    , 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1'
    , 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1'

    , 'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1'
    , 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1'
    , 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1'

    , 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1'
    , 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1'
    , 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1'
]

# 변수 제거
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

In [264]:
import numpy as np

# 각 스테이지의 좌표 열 정의
stage1_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
               'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2',
               'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2']

stage2_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
               'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2',
               'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2']

stage3_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
               'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2',
               'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2']

# 거리 계산 함수
def calculate_distances(data):
    data['HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2'] = np.sqrt(
        (data[stage2_cols[0]] - data[stage1_cols[0]]) ** 2 +
        (data[stage2_cols[1]] - data[stage1_cols[1]]) ** 2 +
        (data[stage2_cols[2]] - data[stage1_cols[2]]) ** 2
    )

    data['HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill2'] = np.sqrt(
        (data[stage3_cols[0]] - data[stage2_cols[0]]) ** 2 +
        (data[stage3_cols[1]] - data[stage2_cols[1]]) ** 2 +
        (data[stage3_cols[2]] - data[stage2_cols[2]]) ** 2
    )

    data['HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill2'] = np.sqrt(
        (data[stage3_cols[0]] - data[stage1_cols[0]]) ** 2 +
        (data[stage3_cols[1]] - data[stage1_cols[1]]) ** 2 +
        (data[stage3_cols[2]] - data[stage1_cols[2]]) ** 2
    )

    return data

# train_data에 적용
train_data = calculate_distances(train_data)

# test_data에 적용
test_data = calculate_distances(test_data)

In [265]:
# 제거할 변수 목록
columns_to_drop = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'
    , 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2'
    , 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2'

    , 'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2'
    , 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2'
    , 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2'

    , 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2'
    , 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2'
    , 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2'
]

# 변수 제거
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

In [266]:
# 'HEAD NORMAL'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='HEAD NORMAL').columns

print("\n HEAD NORMAL 포함 변수>")
for col in Process_Desc_col:
    print(col)


 HEAD NORMAL 포함 변수>
HEAD NORMAL DISTANCE_STAGE1_STAGE2_Dam
HEAD NORMAL DISTANCE_STAGE2_STAGE3_Dam
HEAD NORMAL DISTANCE_STAGE1_STAGE3_Dam
HEAD NORMAL DISTANCE_TRIANGLE_height_Dam
HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill1
HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill1
HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill1
HEAD NORMAL DISTANCE_TRIANGLE_height_Fill1
HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2
HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill2
HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill2


In [267]:
# 삭제할 열 이름 정의
columns_to_drop = [
    'HEAD NORMAL DISTANCE_STAGE1_STAGE2_Dam'
    , 'HEAD NORMAL DISTANCE_STAGE2_STAGE3_Dam'
    # , 'HEAD NORMAL DISTANCE_STAGE1_STAGE3_Dam'

    , 'HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill1'
    , 'HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill1'
    # , 'HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill1'

    # , 'HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2'
    # , 'HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill2'
    # , 'HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill2'
]

# train_data에서 열 삭제
train_data = train_data.drop(columns=columns_to_drop)

# test_data에서 열 삭제
test_data = test_data.drop(columns=columns_to_drop)

In [268]:
# 'HEAD NORMAL'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='HEAD NORMAL').columns

print("\n HEAD NORMAL 포함 변수>")
for col in Process_Desc_col:
    print(col)


 HEAD NORMAL 포함 변수>
HEAD NORMAL DISTANCE_STAGE1_STAGE3_Dam
HEAD NORMAL DISTANCE_TRIANGLE_height_Dam
HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill1
HEAD NORMAL DISTANCE_TRIANGLE_height_Fill1
HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2
HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill2
HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill2


### 3. RESIN(처리x)

In [269]:
# 'RESIN' 또는 'Dispense Volume'을 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(regex='RESIN|Dispense Volume').columns

print("\n'RESIN' 또는 'Dispense Volume' 포함 변수>")
for col in Process_Desc_col:
    print(col)


'RESIN' 또는 'Dispense Volume' 포함 변수>
DISCHARGED SPEED OF RESIN Collect Result_Dam
DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam
DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam
DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam
Dispense Volume(Stage1) Collect Result_Dam
Dispense Volume(Stage2) Collect Result_Dam
Dispense Volume(Stage3) Collect Result_Dam
DISCHARGED SPEED OF RESIN Collect Result_Fill1
DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1
DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1
DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1
Dispense Volume(Stage1) Collect Result_Fill1
Dispense Volume(Stage2) Collect Result_Fill1
Dispense Volume(Stage3) Collect Result_Fill1


In [270]:
summarize_group(train_data, [
# 'Dispenser_1'
# , 'Dispenser_2'
# , 'DISCHARGED SPEED OF RESIN Collect Result_Dam'
# , 'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam'
# , 'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'
# , 'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam'
 'Dispense Volume(Stage1) Collect Result_Dam'
, 'Dispense Volume(Stage2) Collect Result_Dam'
, 'Dispense Volume(Stage3) Collect Result_Dam'
# , 'DISCHARGED SPEED OF RESIN Collect Result_Fill1'
# , 'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1'
# , 'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1'
# , 'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1'
# , 'Dispense Volume(Stage1) Collect Result_Fill1'
# , 'Dispense Volume(Stage2) Collect Result_Fill1'
# , 'Dispense Volume(Stage3) Collect Result_Fill1'
])

Grouped by: Dispense Volume(Stage1) Collect Result_Dam, Dispense Volume(Stage2) Collect Result_Dam, Dispense Volume(Stage3) Collect Result_Dam

                  group  'AdNormal' count     ratio  Total
0    (0.67, 0.26, 1.49)               135  0.579399    233
1    (0.67, 0.27, 1.49)              4588  0.610025   7521
2    (0.67, 0.28, 1.49)                 0  0.000000      2
3    (0.67, 0.33, 1.49)               862  0.651057   1324
4    (0.67, 0.34, 1.49)              7500  0.590133  12709
..                  ...               ...       ...    ...
155  (1.63, 0.92, 1.49)               697  0.465909   1496
156  (1.63, 0.93, 1.49)               744  0.587214   1267
157  (1.63, 0.94, 1.49)               547  0.541048   1011
158  (2.34, 0.71, 1.49)                 0  0.000000      1
159  (2.34, 0.72, 1.49)                 0  0.000000      3

[160 rows x 4 columns]


In [271]:
# # 파생 변수 생성 함수
# def create_time_speed_product(df):
#     stages = ['Stage1', 'Stage2', 'Stage3']
#     for stage in stages:
#         time_col = f'DISCHARGED TIME OF RESIN({stage}) Collect Result_Dam'
#         speed_col = 'DISCHARGED SPEED OF RESIN Collect Result_Dam'
#         new_col_name = f'RESIN Time_x_Speed_{stage}_Dam'
#         df[new_col_name] = df[time_col] * df[speed_col]

# # 함수 적용
# create_time_speed_product(train_data)
# create_time_speed_product(test_data)

In [272]:
# summarize_group(train_data, [
# 'Dispenser_1'
# , 'Dispenser_2'
# , 'DISCHARGED SPEED OF RESIN Collect Result_Dam'

# ])

In [273]:
# # 파생 변수 생성 함수
# def create_volume_time_ratio(df):
#     stages = ['Stage1', 'Stage2', 'Stage3']
#     for stage in stages:
#         time_col = f'DISCHARGED TIME OF RESIN({stage}) Collect Result_Dam'
#         volume_col = f'Dispense Volume({stage}) Collect Result_Dam'
#         new_col_name = f'RESIN Volume_Time_Ratio_{stage}_Dam'
#         df[new_col_name] = df[volume_col] / df[time_col]

# # 함수 적용
# create_volume_time_ratio(train_data)
# create_volume_time_ratio(test_data)

In [274]:
# # 출력 옵션을 설정
# pd.set_option('display.max_rows', None)


# # 출력 옵션을 원래대로
# pd.reset_option('display.max_rows')

In [275]:
# summarize_group(train_data, [
# 'Dispenser_1'
# , 'Dispenser_2'
# , 'DISCHARGED SPEED OF RESIN Collect Result_Dam'
# , 'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam'
# , 'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'
# , 'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam'
# , 'Dispense Volume(Stage1) Collect Result_Dam'
# , 'Dispense Volume(Stage2) Collect Result_Dam'
# , 'Dispense Volume(Stage3) Collect Result_Dam'
# # , 'DISCHARGED SPEED OF RESIN Collect Result_Fill1'
# # , 'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1'
# # , 'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1'
# # , 'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1'
# # , 'Dispense Volume(Stage1) Collect Result_Fill1'
# # , 'Dispense Volume(Stage2) Collect Result_Fill1'
# # , 'Dispense Volume(Stage3) Collect Result_Fill1'
# # , 'RESIN Time_x_Speed_Stage1_Dam'
# # , 'RESIN Time_x_Speed_Stage2_Dam'
# # , 'RESIN Time_x_Speed_Stage3_Dam'
# #  'RESIN Volume_Time_Ratio_Stage1_Dam'
# # , 'RESIN Volume_Time_Ratio_Stage2_Dam'
# # , 'RESIN Volume_Time_Ratio_Stage3_Dam'
# ])

In [276]:
# # 'RESIN' 또는 'Dispense Volume'을 포함하는 열 이름 필터링
# Process_Desc_col = train_data.filter(regex='RESIN|Dispense Volume').columns

# print("\n'RESIN' 또는 'Dispense Volume' 포함 변수>")
# for col in Process_Desc_col:
#     print(col)

### 4. Distance Speed Collect Result_Dam

Dam 공정의 Circle, Line 길이 변수들 처리

In [277]:
# 'Distance Speed Collect Result_Dam'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='Distance Speed Collect Result_Dam').columns

print("\n Distance Speed Collect Result_Dam 포함 변수>")
for col in Process_Desc_col:
    print(col)


 Distance Speed Collect Result_Dam 포함 변수>
Stage1 Circle1 Distance Speed Collect Result_Dam
Stage1 Circle2 Distance Speed Collect Result_Dam
Stage1 Circle3 Distance Speed Collect Result_Dam
Stage1 Circle4 Distance Speed Collect Result_Dam
Stage1 Line1 Distance Speed Collect Result_Dam
Stage1 Line2 Distance Speed Collect Result_Dam
Stage1 Line3 Distance Speed Collect Result_Dam
Stage1 Line4 Distance Speed Collect Result_Dam
Stage2 Circle1 Distance Speed Collect Result_Dam
Stage2 Circle2 Distance Speed Collect Result_Dam
Stage2 Circle3 Distance Speed Collect Result_Dam
Stage2 Circle4 Distance Speed Collect Result_Dam
Stage2 Line1 Distance Speed Collect Result_Dam
Stage2 Line2 Distance Speed Collect Result_Dam
Stage2 Line3 Distance Speed Collect Result_Dam
Stage2 Line4 Distance Speed Collect Result_Dam
Stage3 Circle1 Distance Speed Collect Result_Dam
Stage3 Circle2 Distance Speed Collect Result_Dam
Stage3 Circle3 Distance Speed Collect Result_Dam
Stage3 Circle4 Distance Speed Collect Resu

Stage 별 Speed 값들의 평균 계산

In [278]:
def add_stage_totals(data, stages, suffix='_Distance_Speed_avg_Dam'):
    for stage in stages:
        stage_cols = data.filter(like=stage).columns
        data[f'{stage}{suffix}'] = data[stage_cols].sum(axis=1) / 8

stages = ['Stage1', 'Stage2', 'Stage3']

# train_data에 대해 파생변수 추가
add_stage_totals(train_data, stages)

# test_data에 대해 파생변수 추가
add_stage_totals(test_data, stages)

In [279]:
# 제거할 변수 목록
columns_to_drop = [
'Stage1 Circle1 Distance Speed Collect Result_Dam'
, 'Stage1 Circle2 Distance Speed Collect Result_Dam'
, 'Stage1 Circle3 Distance Speed Collect Result_Dam'
, 'Stage1 Circle4 Distance Speed Collect Result_Dam'
, 'Stage1 Line1 Distance Speed Collect Result_Dam'
, 'Stage1 Line2 Distance Speed Collect Result_Dam'
, 'Stage1 Line3 Distance Speed Collect Result_Dam'
, 'Stage1 Line4 Distance Speed Collect Result_Dam'
, 'Stage2 Circle1 Distance Speed Collect Result_Dam'
, 'Stage2 Circle2 Distance Speed Collect Result_Dam'
, 'Stage2 Circle3 Distance Speed Collect Result_Dam'
, 'Stage2 Circle4 Distance Speed Collect Result_Dam'
, 'Stage2 Line1 Distance Speed Collect Result_Dam'
, 'Stage2 Line2 Distance Speed Collect Result_Dam'
, 'Stage2 Line3 Distance Speed Collect Result_Dam'
, 'Stage2 Line4 Distance Speed Collect Result_Dam'
, 'Stage3 Circle1 Distance Speed Collect Result_Dam'
, 'Stage3 Circle2 Distance Speed Collect Result_Dam'
, 'Stage3 Circle3 Distance Speed Collect Result_Dam'
, 'Stage3 Circle4 Distance Speed Collect Result_Dam'
, 'Stage3 Line1 Distance Speed Collect Result_Dam'
, 'Stage3 Line2 Distance Speed Collect Result_Dam'
, 'Stage3 Line3 Distance Speed Collect Result_Dam'
, 'Stage3 Line4 Distance Speed Collect Result_Dam'
]

# 변수 제거
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

In [280]:
# 'Distance_Speed_avg_Dam'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='Distance_Speed_avg_Dam').columns

print("\n Distance_Speed_avg_Dam 포함 변수>")
for col in Process_Desc_col:
    print(col)


 Distance_Speed_avg_Dam 포함 변수>
Stage1_Distance_Speed_avg_Dam
Stage2_Distance_Speed_avg_Dam
Stage3_Distance_Speed_avg_Dam


### 5. THICKNESS

In [281]:
# 'THICKNESS'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='THICKNESS').columns

print("\n THICKNESS 포함 변수>")
for col in Process_Desc_col:
    print(col)


 THICKNESS 포함 변수>
THICKNESS 1 Collect Result_Dam
THICKNESS 2 Collect Result_Dam
THICKNESS 3 Collect Result_Dam


In [282]:
# 새로운 파생변수 생성 함수
def create_total_thickness_dam(data):
    data['THICKNESS_total_Dam'] = (
        data['THICKNESS 1 Collect Result_Dam']**2 
        + data['THICKNESS 2 Collect Result_Dam']**2 
        + data['THICKNESS 3 Collect Result_Dam']**2
    )
    # 기존 변수 삭제
    data.drop(columns=[
        'THICKNESS 1 Collect Result_Dam',
        'THICKNESS 2 Collect Result_Dam',
        'THICKNESS 3 Collect Result_Dam'
    ], inplace=True)
    return data

train_data = create_total_thickness_dam(train_data)
test_data = create_total_thickness_dam(test_data)

In [283]:
# 'THICKNESS'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='THICKNESS').columns

print("\n THICKNESS 포함 변수>")
for col in Process_Desc_col:
    print(col)


 THICKNESS 포함 변수>
THICKNESS_total_Dam


### 6. AutoClave

In [284]:
# '_AutoClave'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='_AutoClave').columns

# 필터링된 열 이름 출력
print("<AutoClave 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<AutoClave 공정 관련 변수>
1st Pressure Collect Result_AutoClave
1st Pressure Unit Time_AutoClave
2nd Pressure Collect Result_AutoClave
2nd Pressure Unit Time_AutoClave
3rd Pressure Collect Result_AutoClave
3rd Pressure Unit Time_AutoClave
Chamber Temp. Collect Result_AutoClave
Chamber Temp. Unit Time_AutoClave
Chamber_Temp_OKNG_AutoClave


In [285]:
# 파생변수 생성
train_data['1st_Pressure_x_AutoClave'] = train_data['1st Pressure Collect Result_AutoClave'] * train_data['1st Pressure Unit Time_AutoClave'] 
test_data['1st_Pressure_x_AutoClave'] = test_data['1st Pressure Collect Result_AutoClave'] * test_data['1st Pressure Unit Time_AutoClave'] 

train_data['2nd_Pressure_x_AutoClave'] = train_data['2nd Pressure Collect Result_AutoClave'] * train_data['2nd Pressure Unit Time_AutoClave'] 
test_data['2nd_Pressure_x_AutoClave'] = test_data['2nd Pressure Collect Result_AutoClave'] * test_data['2nd Pressure Unit Time_AutoClave'] 

train_data['3rd_Pressure_x_AutoClave'] = train_data['3rd Pressure Collect Result_AutoClave'] * train_data['3rd Pressure Unit Time_AutoClave'] 
test_data['3rd_Pressure_x_AutoClave'] = test_data['3rd Pressure Collect Result_AutoClave'] * test_data['3rd Pressure Unit Time_AutoClave'] 

train_data['All_Pressure_x_AutoClave'] = train_data['1st_Pressure_x_AutoClave'] + train_data['2nd_Pressure_x_AutoClave'] + train_data['3rd_Pressure_x_AutoClave']
test_data['All_Pressure_x_AutoClave'] = test_data['1st_Pressure_x_AutoClave'] + test_data['2nd_Pressure_x_AutoClave'] + test_data['3rd_Pressure_x_AutoClave']

train_data['All_Pressure_avg_AutoClave'] = train_data['All_Pressure_x_AutoClave'] / train_data['Chamber Temp. Unit Time_AutoClave']
test_data['All_Pressure_avg_AutoClave'] = test_data['All_Pressure_x_AutoClave'] / test_data['Chamber Temp. Unit Time_AutoClave']

train_data['Chamber_Temp_x_AutoClave'] = train_data['Chamber Temp. Collect Result_AutoClave'] * train_data['Chamber Temp. Unit Time_AutoClave']
test_data['Chamber_Temp_x_AutoClave'] = test_data['Chamber Temp. Collect Result_AutoClave'] * test_data['Chamber Temp. Unit Time_AutoClave']

train_data['All_Pressure_frac_Chamber_Temp_AutoClave'] = train_data['All_Pressure_x_AutoClave'] / train_data['Chamber_Temp_x_AutoClave']
test_data['All_Pressure_frac_Chamber_Temp_AutoClave'] = test_data['All_Pressure_x_AutoClave'] / test_data['Chamber_Temp_x_AutoClave']


In [286]:
# '_AutoClave'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='_AutoClave').columns

# 필터링된 열 이름 출력
print("<AutoClave 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<AutoClave 공정 관련 변수>
1st Pressure Collect Result_AutoClave
1st Pressure Unit Time_AutoClave
2nd Pressure Collect Result_AutoClave
2nd Pressure Unit Time_AutoClave
3rd Pressure Collect Result_AutoClave
3rd Pressure Unit Time_AutoClave
Chamber Temp. Collect Result_AutoClave
Chamber Temp. Unit Time_AutoClave
Chamber_Temp_OKNG_AutoClave
1st_Pressure_x_AutoClave
2nd_Pressure_x_AutoClave
3rd_Pressure_x_AutoClave
All_Pressure_x_AutoClave
All_Pressure_avg_AutoClave
Chamber_Temp_x_AutoClave
All_Pressure_frac_Chamber_Temp_AutoClave


In [287]:
# 제거할 변수 목록
columns_to_drop = [
# '1st Pressure Collect Result_AutoClave'
'1st Pressure Unit Time_AutoClave'
# , '2nd Pressure Collect Result_AutoClave'
, '2nd Pressure Unit Time_AutoClave'
# , '3rd Pressure Collect Result_AutoClave'
, '3rd Pressure Unit Time_AutoClave'
# , 'Chamber Temp. Collect Result_AutoClave'
# , 'Chamber Temp. Unit Time_AutoClave'

# , '1st_Pressure_x_AutoClave'
# , '2nd_Pressure_x_AutoClave'
# , '3rd_Pressure_x_AutoClave'
, 'All_Pressure_x_AutoClave'
# , 'All_Pressure_avg_AutoClave'
# , 'Chamber_Temp_x_AutoClave'
# , 'All_Pressure_frac_Chamber_Temp_AutoClave'
]

# 변수 제거
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

In [288]:
# '_AutoClave'를 포함하는 열 이름 필터링
Process_Desc_col = train_data.filter(like='_AutoClave').columns

# 필터링된 열 이름 출력
print("<AutoClave 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<AutoClave 공정 관련 변수>
1st Pressure Collect Result_AutoClave
2nd Pressure Collect Result_AutoClave
3rd Pressure Collect Result_AutoClave
Chamber Temp. Collect Result_AutoClave
Chamber Temp. Unit Time_AutoClave
Chamber_Temp_OKNG_AutoClave
1st_Pressure_x_AutoClave
2nd_Pressure_x_AutoClave
3rd_Pressure_x_AutoClave
All_Pressure_avg_AutoClave
Chamber_Temp_x_AutoClave
All_Pressure_frac_Chamber_Temp_AutoClave


### 7. ETC..

7-1. workorder

In [289]:
# 타겟 변수를 0과 1로 변환
train_data['target_binary'] = train_data['target'].apply(lambda x: 1 if x == 'AbNormal' else 0)

# Workorder 변수의 값에 대한 타겟 변수 비율 계산
workorder_target_ratio = train_data.groupby('Workorder')['target_binary'].mean()

# 파생 변수 생성 함수
def create_derived_variable(row, ratio_dict, threshold):
    return 1 if ratio_dict.get(row['Workorder'], 0) >= threshold else 0

# 파생 변수 생성
train_data['Workorder_0.9'] = train_data.apply(create_derived_variable, axis=1, ratio_dict=workorder_target_ratio, threshold=0.9)
train_data['Workorder_0.6'] = train_data.apply(create_derived_variable, axis=1, ratio_dict=workorder_target_ratio, threshold=0.6)

test_data['Workorder_0.9'] = test_data.apply(create_derived_variable, axis=1, ratio_dict=workorder_target_ratio, threshold=0.9)
test_data['Workorder_0.6'] = test_data.apply(create_derived_variable, axis=1, ratio_dict=workorder_target_ratio, threshold=0.6)

# 불필요한 변수 제거
train_data.drop(['target_binary'], axis=1, inplace=True)

7-2. Machine Tact time

In [290]:
### 총시간 대비 비율 변수
def calculate_total_time_and_ratios(data):
    data['total_time'] = (
        data['Machine Tact time Collect Result_Dam'] +
        data['Machine Tact time Collect Result_Fill1'] +
        data['Machine Tact time Collect Result_Fill2'] +
        data['Chamber Temp. Unit Time_AutoClave']
    )
    data['time_ratio_Dam'] = (data['Machine Tact time Collect Result_Dam'] / data['total_time']).round(3)
    data['time_ratio_Fill1'] = (data['Machine Tact time Collect Result_Fill1'] / data['total_time']).round(3)
    data['time_ratio_Fill2'] = (data['Machine Tact time Collect Result_Fill2'] / data['total_time']).round(3)
    data['time_ratio_AutoClave'] = (data['Chamber Temp. Unit Time_AutoClave'] / data['total_time']).round(3)
    return data

# train_data와 test_data에 함수 적용
train_data = calculate_total_time_and_ratios(train_data)
test_data = calculate_total_time_and_ratios(test_data)

In [291]:
# 변수 제거
train_data.drop(columns=[
    'total_time'
    , 'Machine Tact time Collect Result_Dam'
    , 'Machine Tact time Collect Result_Fill1'
    , 'Machine Tact time Collect Result_Fill2'
    , 'Chamber Temp. Unit Time_AutoClave'], inplace=True)

test_data.drop(columns=[
    'total_time'
    , 'Machine Tact time Collect Result_Dam'
    , 'Machine Tact time Collect Result_Fill1'
    , 'Machine Tact time Collect Result_Fill2'
    , 'Chamber Temp. Unit Time_AutoClave'], inplace=True)

---

In [292]:
train_data.info()
print('---')
# test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76312 entries, 0 to 76311
Data columns (total 60 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Model.Suffix                                           76312 non-null  object 
 1   Workorder                                              76312 non-null  object 
 2   DISCHARGED SPEED OF RESIN Collect Result_Dam           76312 non-null  int64  
 3   DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam    76312 non-null  float64
 4   DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam    76312 non-null  float64
 5   DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam    76312 non-null  float64
 6   Dispense Volume(Stage1) Collect Result_Dam             76312 non-null  float64
 7   Dispense Volume(Stage2) Collect Result_Dam             76312 non-null  float64
 8   Dispense Volume(Stage3) Collect Result_Dam    

In [293]:
# 각 변수별로 결측값이 존재하는지 확인하는 코드
missing_values = train_data.isnull().sum()

# 결측값이 존재하는 변수와 그 개수 출력
missing_values = missing_values[missing_values > 0]
print(missing_values)

# 결측값이 존재하는 변수명을 리스트에 담기
missing_columns = missing_values.index.tolist()
# print("결측값이 존재하는 변수명:", missing_columns)

Series([], dtype: int64)


In [294]:
for col in train_data.columns :
    cnt = train_data[col].value_counts()
    print(f"{cnt}\n")

Model.Suffix
AJX75334501    63014
AJX75334502     6974
AJX75334505     4595
AJX75334503      797
AJX75334507      586
AJX75334506      250
AJX75334508       96
Name: count, dtype: int64

Workorder
3HPXX066-1    707
3G1XC632-1    539
3M1XH425-1    501
3I1XA992-1    479
3G1X8293-1    459
             ... 
4BPM0084-1      2
3FPXX064-3      2
4CPM0166-1      1
3H1XB714-1      1
3HPM0061-1      1
Name: count, Length: 663, dtype: int64

DISCHARGED SPEED OF RESIN Collect Result_Dam
10    42472
16    33819
15       21
Name: count, dtype: int64

DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam
13.2    19517
9.7     11743
9.6     10875
21.3     9883
17.0     9563
14.9     7292
14.2     2214
14.7     1932
13.1     1530
13.8      488
13.5      334
13.6      310
11.6      271
14.8      142
21.2      120
14.3       55
13.7       21
17.1       21
10.6        1
Name: count, dtype: int64

DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam
4.9     21639
3.9      7825
8.4      5857
6.6      4947
8.3 

---

## 타겟 인코딩

In [295]:
# # 'target' 열의 변수 타입을 object로 변경
# # -> test 데이터는 float64 타입으로 되어있음 
# test_data['target'] = test_data['target'].astype('object')

# # object 타입의 변수 출력
# train_object_columns = train_data.select_dtypes(include=['object']).columns
# test_object_columns = test_data.select_dtypes(include=['object']).columns

# print(train_object_columns, f" train_object_columns 갯수 : {len(train_object_columns)}")
# print(test_object_columns, f" test_object_columns 갯수 : {len(test_object_columns)}")

# # 각 object 변수의 고유 값 개수 출력
# print("\nTrain Data:")
# for col in train_object_columns:
#     unique_count = train_data[col].nunique()
#     print(f"{col} unique 값 갯수: {unique_count}")

# print("\nTest Data:")
# for col in test_object_columns:
#     unique_count = test_data[col].nunique()
#     print(f"{col} unique 값 갯수: {unique_count}")

In [296]:
# # 필요한 라이브러리 임포트
# import pandas as pd
# import category_encoders as ce

# # 타겟 변수와 범주형 변수 지정
# ## Target Encoding의 smoothing 파라미터는 default로 auto로 설정되어 있음
# target = 'target'  # 타겟 변수 이름으로 변경
# categorical_columns = [
#     'Model.Suffix',
#     'Workorder',
# ]  # 범주형 변수 이름으로 변경

# # 타겟 값을 숫자로 변환
# target_mapping = {'Normal': 0, 'AbNormal': 1}
# train_data[target] = train_data[target].map(target_mapping)
# test_data[target] = test_data[target].map(target_mapping)

# # 열이 존재하는지 확인
# missing_columns = [col for col in categorical_columns if col not in train_data.columns]
# if missing_columns:
#     raise ValueError(f"train_data에 다음 열이 존재하지 않습니다: {missing_columns}")

# # 타겟 인코더 생성 및 학습
# encoder = ce.TargetEncoder(cols=categorical_columns)
# train_data = encoder.fit_transform(train_data, train_data[target])

# # Set ID 열을 별도로 저장
# set_id = test_data['Set ID']

# # 테스트 데이터 인코딩 (Set ID 열 제외)
# test_data = test_data.drop(columns=['Set ID'])
# test_data = encoder.transform(test_data)

# # Set ID 열을 맨 앞에 추가
# test_data.insert(0, 'Set ID', set_id)

# # categorical_columns에 해당하는 열의 데이터 값만 확인
# print(train_data[categorical_columns].head(3))
# print(test_data[categorical_columns].head(3))

# # 역 매핑 딕셔너리 생성
# reverse_target_mapping = {v: k for k, v in target_mapping.items()}

# # 타겟 값을 원래대로 변환
# train_data[target] = train_data[target].map(reverse_target_mapping)
# test_data[target] = test_data[target].map(reverse_target_mapping)

# print("--- train_data ---")

# # 변환된 타겟 값 확인
# print(train_data[[target]].value_counts())

In [297]:
train_data.info()
print('---')
# test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76312 entries, 0 to 76311
Data columns (total 60 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Model.Suffix                                           76312 non-null  object 
 1   Workorder                                              76312 non-null  object 
 2   DISCHARGED SPEED OF RESIN Collect Result_Dam           76312 non-null  int64  
 3   DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam    76312 non-null  float64
 4   DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam    76312 non-null  float64
 5   DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam    76312 non-null  float64
 6   Dispense Volume(Stage1) Collect Result_Dam             76312 non-null  float64
 7   Dispense Volume(Stage2) Collect Result_Dam             76312 non-null  float64
 8   Dispense Volume(Stage3) Collect Result_Dam    

In [298]:
# train_data.info()
print('---')
test_data.info()

---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 61 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Set ID                                                 17361 non-null  object 
 1   Model.Suffix                                           17361 non-null  object 
 2   Workorder                                              17361 non-null  object 
 3   DISCHARGED SPEED OF RESIN Collect Result_Dam           17361 non-null  int64  
 4   DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam    17361 non-null  float64
 5   DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam    17361 non-null  float64
 6   DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam    17361 non-null  float64
 7   Dispense Volume(Stage1) Collect Result_Dam             17361 non-null  float64
 8   Dispense Volume(Stage2) Collect Result_Dam

## 데이터 분할

In [299]:
# df_train, df_val = train_test_split(
#     train_data,
#     test_size=0.2,
#     stratify=train_data["target"],
#     shuffle=True,
#     random_state=RANDOM_STATE,
# )

# def print_stats(df: pd.DataFrame):
#     num_normal = len(df[df["target"] == "Normal"])
#     num_abnormal = len(df[df["target"] == "AbNormal"])

#     print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {num_abnormal/num_normal}")


# # Print statistics
# print(f"  \tAbnormal\tNormal")
# print_stats(df_train)
# print_stats(df_val)

In [300]:
# train_data.info()

### 상관계수

상관계수 0.7 이상 drop

공통 변수

In [301]:
# com_variables = [
#     'Model.Suffix'
#     , 'Workorder'
#     , 'WorkMode Collect Result'
#     , 'Dispenser_1'
#     , 'Dispenser_2'
#     , 'Receip_No_Collect_Result'
#     , 'PalletID_Collect_Result'
#     , 'Production_Qty_Collect_Result'
#     , 'Judge_Value_OK'
#     , 'Workorder_0.9'
#     , 'Workorder_0.6'
# ]

# # 변수들로만 이루어진 DataFrame 생성
# filtered_data = train_data[com_variables]

In [302]:
# # 상관계수 행렬 계산
# correlation_matrix = filtered_data.corr()

# # 자기자신을 제외하고 특정 값 이상인 조합 찾기
# strong_correlations = correlation_matrix[(correlation_matrix >= 0.7) & (correlation_matrix != 1)]

# # 리스트로 변환
# strong_correlations_pairs = strong_correlations.stack().reset_index()
# strong_correlations_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# # 결과 출력
# strong_correlations_pairs = strong_correlations_pairs[strong_correlations_pairs['Correlation'] >= 0.7]
# print(strong_correlations_pairs)

In [303]:
# # 변수 삭제
# train_data.drop(['PalletID_Collect_Result'], axis=1, inplace=True)
# test_data.drop(['PalletID_Collect_Result'], axis=1, inplace=True)

In [304]:
# # 공통 변수 리스트
# com_variables_train = [
#     'target'
#     , 'Model.Suffix'
#     , 'Workorder'
#     , 'WorkMode Collect Result'
#     , 'Dispenser_1'
#     , 'Dispenser_2'
#     , 'Receip_No_Collect_Result'
#     # , 'PalletID_Collect_Result'
#     , 'Production_Qty_Collect_Result'
#     , 'Judge_Value_OK'
#     , 'Workorder_0.9'
#     , 'Workorder_0.6'
# ]

# com_variables_test = [
#     'target'
#     , 'Set ID'
#     , 'Model.Suffix'
#     , 'Workorder'
#     , 'WorkMode Collect Result'
#     , 'Dispenser_1'
#     , 'Dispenser_2'
#     , 'Receip_No_Collect_Result'
#     # , 'PalletID_Collect_Result'
#     , 'Production_Qty_Collect_Result'
#     , 'Judge_Value_OK'
#     , 'Workorder_0.9'
#     , 'Workorder_0.6'
# ]

Dam

In [305]:
# # 열 이름 필터링
# Process_Desc_col = train_data.filter(like='_Dam').columns

# # 필터링된 열 이름 출력
# print("<Dam 공정 관련 변수>")
# for col in Process_Desc_col:
#     print(col)

In [306]:
# # 새로운 변수 목록
# variables = [
#     "DISCHARGED SPEED OF RESIN Collect Result_Dam",
#     "DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam",
#     "DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam",
#     "DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam",
#     "Dispense Volume(Stage1) Collect Result_Dam",
#     "Dispense Volume(Stage2) Collect Result_Dam",
#     "Dispense Volume(Stage3) Collect Result_Dam",
#     "Head Clean Position Z Collect Result_Dam",
#     "Head Purge Position Z Collect Result_Dam",
#     "Head Zero Position Y Collect Result_Dam",
#     "Head Zero Position Z Collect Result_Dam",
#     "CURE_Time_Dam",
#     "HEAD NORMAL DISTANCE_STAGE1_STAGE3_Dam",
#     "HEAD NORMAL DISTANCE_TRIANGLE_height_Dam",
#     "Stage1_Distance_Speed_avg_Dam",
#     "Stage2_Distance_Speed_avg_Dam",
#     "Stage3_Distance_Speed_avg_Dam",
#     "THICKNESS_total_Dam",
#     "time_ratio_Dam"
# ]

# # 변수들로만 이루어진 DataFrame 생성
# filtered_data = train_data[variables]

In [307]:
# # 상관계수 행렬 계산
# correlation_matrix = filtered_data.corr()

# # 자기자신을 제외하고 특정 값 이상인 조합 찾기
# strong_correlations = correlation_matrix[(correlation_matrix >= 0.7) & (correlation_matrix != 1)]

# # 리스트로 변환
# strong_correlations_pairs = strong_correlations.stack().reset_index()
# strong_correlations_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# # 결과 출력
# strong_correlations_pairs = strong_correlations_pairs[strong_correlations_pairs['Correlation'] >= 0.7]
# print(strong_correlations_pairs)

In [308]:
# # 드랍할 열 목록
# columns_to_drop = [
#     "DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam"
#     , "DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam"
#     , "DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam"
#     , "Dispense Volume(Stage1) Collect Result_Dam"
#     , "Head Clean Position Z Collect Result_Dam"
#     , "Head Zero Position Z Collect Result_Dam"
#     , "Stage3_Distance_Speed_avg_Dam"
# ]

# # 열 삭제
# train_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')
# test_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

AutoCalve

In [309]:
# # 열 이름 필터링
# Process_Desc_col = train_data.filter(like='_AutoClave').columns

# # 필터링된 열 이름 출력
# print("<AutoClave 공정 관련 변수>")
# for col in Process_Desc_col:
#     print(col)

In [310]:
# # 새로운 변수 목록
# variables = [
#     "1st Pressure Collect Result_AutoClave"
#     , "2nd Pressure Collect Result_AutoClave"
#     , "3rd Pressure Collect Result_AutoClave"
#     , "Chamber Temp. Collect Result_AutoClave"
#     , "Chamber_Temp_OKNG_AutoClave"
#     , "1st_Pressure_x_AutoClave"
#     , "2nd_Pressure_x_AutoClave"
#     , "3rd_Pressure_x_AutoClave"
#     , "All_Pressure_avg_AutoClave"
#     , "Chamber_Temp_x_AutoClave"
#     , "All_Pressure_frac_Chamber_Temp_AutoClave"
#     , "time_ratio_AutoClave"
# ]

# # 변수들로만 이루어진 DataFrame 생성
# filtered_data = train_data[variables]

In [311]:
# # 상관계수 행렬 계산
# correlation_matrix = filtered_data.corr()

# # 자기자신을 제외하고 특정 값 이상인 조합 찾기
# strong_correlations = correlation_matrix[(correlation_matrix >= 0.7) & (correlation_matrix != 1)]

# # 리스트로 변환
# strong_correlations_pairs = strong_correlations.stack().reset_index()
# strong_correlations_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# # 결과 출력
# strong_correlations_pairs = strong_correlations_pairs[strong_correlations_pairs['Correlation'] >= 0.7]
# print(strong_correlations_pairs)

In [312]:
# # 드랍할 열 목록
# columns_to_drop = [
#     "2nd_Pressure_x_AutoClave"
#     , 'Chamber_Temp_OKNG_AutoClave'
#     , 'Chamber_Temp_x_AutoClave'
# ]

# # 열 삭제
# train_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')
# test_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

Fill1

In [313]:
# # 열 이름 필터링
# Process_Desc_col = train_data.filter(like='_Fill1').columns

# # 필터링된 열 이름 출력
# print("<Fill1 공정 관련 변수>")
# for col in Process_Desc_col:
#     print(col)

In [314]:
# # 새로운 변수 목록
# variables = [
#     "DISCHARGED SPEED OF RESIN Collect Result_Fill1",
#     "DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1",
#     "DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1",
#     "DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1",
#     "Dispense Volume(Stage1) Collect Result_Fill1",
#     "Dispense Volume(Stage2) Collect Result_Fill1",
#     "Dispense Volume(Stage3) Collect Result_Fill1",
#     "Head Purge Position Z Collect Result_Fill1",
#     "HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill1",
#     "HEAD NORMAL DISTANCE_TRIANGLE_height_Fill1",
#     "time_ratio_Fill1"
# ]

# # 변수들로만 이루어진 DataFrame 생성
# filtered_data = train_data[variables]

In [315]:
# # 상관계수 행렬 계산
# correlation_matrix = filtered_data.corr()

# # 자기자신을 제외하고 특정 값 이상인 조합 찾기
# strong_correlations = correlation_matrix[(correlation_matrix >= 0.7) & (correlation_matrix != 1)]

# # 리스트로 변환
# strong_correlations_pairs = strong_correlations.stack().reset_index()
# strong_correlations_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# # 결과 출력
# strong_correlations_pairs = strong_correlations_pairs[strong_correlations_pairs['Correlation'] >= 0.7]
# print(strong_correlations_pairs)

In [316]:
# # 드랍할 열 목록
# columns_to_drop = [
#     "Dispense Volume(Stage1) Collect Result_Fill1"
#     , "Dispense Volume(Stage2) Collect Result_Fill1"
# ]

# # 열 삭제
# train_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')
# test_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

Fill2

In [317]:
# # 열 이름 필터링
# Process_Desc_col = train_data.filter(like='_Fill2').columns

# # 필터링된 열 이름 출력
# print("<Fill1 공정 관련 변수>")
# for col in Process_Desc_col:
#     print(col)

In [318]:
# # 새로운 변수 목록
# variables = [
#     "Head Purge Position Z Collect Result_Fill2"
#     , "CURE_Time_Fill2"
#     , "HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2"
#     , "HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill2"
#     , "HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill2"
#     , "time_ratio_Fill2"
# ]

# # 변수들로만 이루어진 DataFrame 생성
# filtered_data = train_data[variables]

In [319]:
# # 상관계수 행렬 계산
# correlation_matrix = filtered_data.corr()

# # 자기자신을 제외하고 특정 값 이상인 조합 찾기
# strong_correlations = correlation_matrix[(correlation_matrix >= 0.7) & (correlation_matrix != 1)]

# # 리스트로 변환
# strong_correlations_pairs = strong_correlations.stack().reset_index()
# strong_correlations_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# # 결과 출력
# strong_correlations_pairs = strong_correlations_pairs[strong_correlations_pairs['Correlation'] >= 0.7]
# print(strong_correlations_pairs)

In [320]:
# # 드랍할 열 목록
# columns_to_drop = [
#     "HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2",
#     "HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill2"
# ]

# # 열 삭제
# train_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')
# test_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

공정별 변수 + 공통 변수 결합  
-> 총 4개의 데이터셋을 구성하고 각 데이터셋 마다 train,test 를 만듬

In [321]:
# # CSV 파일로 저장
# train_data.to_csv('train_data_0816.csv', index=False)
# test_data.to_csv('test_data_0816.csv', index=False)

In [322]:
train_data.columns

Index(['Model.Suffix', 'Workorder',
       'DISCHARGED SPEED OF RESIN Collect Result_Dam',
       'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam',
       'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam',
       'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
       'Dispense Volume(Stage1) Collect Result_Dam',
       'Dispense Volume(Stage2) Collect Result_Dam',
       'Dispense Volume(Stage3) Collect Result_Dam',
       'Head Clean Position Z Collect Result_Dam',
       'Head Purge Position Z Collect Result_Dam',
       'Head Zero Position Y Collect Result_Dam',
       'Head Zero Position Z Collect Result_Dam', 'WorkMode Collect Result',
       '1st Pressure Collect Result_AutoClave',
       '2nd Pressure Collect Result_AutoClave',
       '3rd Pressure Collect Result_AutoClave',
       'Chamber Temp. Collect Result_AutoClave',
       'DISCHARGED SPEED OF RESIN Collect Result_Fill1',
       'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
       'DISCHARGED TI

In [323]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder 객체 생성
label_encoder = LabelEncoder()

# 'target' 컬럼에 Label Encoding 적용
train_data['target'] = label_encoder.fit_transform(train_data['target'])

In [324]:
# 공통 변수 리스트
com_variables_train = [
    'target'
    , 'Model.Suffix'
    , 'Workorder'
    , 'WorkMode Collect Result'
    , 'Dispenser_1'
    , 'Dispenser_2'
    , 'Receip_No_Collect_Result'
    # , 'PalletID_Collect_Result'
    , 'Production_Qty_Collect_Result'
    , 'Judge_Value_OK'
    , 'Workorder_0.9'
    , 'Workorder_0.6'
]

com_variables_test = [
    'target'
    , 'Set ID'
    , 'Model.Suffix'
    , 'Workorder'
    , 'WorkMode Collect Result'
    , 'Dispenser_1'
    , 'Dispenser_2'
    , 'Receip_No_Collect_Result'
    # , 'PalletID_Collect_Result'
    , 'Production_Qty_Collect_Result'
    , 'Judge_Value_OK'
    , 'Workorder_0.9'
    , 'Workorder_0.6'
]

In [325]:
# 공정 이름 필터링 후 공통 변수와 결합
def create_dataset(train_data, test_data, process_name, com_variables_train, com_variables_test):
    # 열 이름 필터링
    Process_Desc_col = train_data.filter(like=process_name).columns
    
    # train 데이터셋 생성
    final_columns_train = list(Process_Desc_col) + com_variables_train
    train_dataset = train_data[final_columns_train]
    
    # test 데이터셋 생성
    final_columns_test = list(Process_Desc_col) + com_variables_test
    test_dataset = test_data[final_columns_test]
    
    return train_dataset, test_dataset

# 공통 변수 정의
## com_variables_train = [...]  -> 이전 코드에서 정의한 변수 사용
## com_variables_test = [...]   -> 이전 코드에서 정의한 변수 사용

# 데이터셋 생성
train_data_dam, test_data_dam = create_dataset(train_data, test_data, '_Dam', com_variables_train, com_variables_test)
train_data_fill1, test_data_fill1 = create_dataset(train_data, test_data, '_Fill1', com_variables_train, com_variables_test)
train_data_fill2, test_data_fill2 = create_dataset(train_data, test_data, '_Fill2', com_variables_train, com_variables_test)
train_data_autoclave, test_data_autoclave = create_dataset(train_data, test_data, '_AutoClave', com_variables_train, com_variables_test)

In [326]:
train_data_autoclave.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76312 entries, 0 to 76311
Data columns (total 23 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   1st Pressure Collect Result_AutoClave     76312 non-null  float64
 1   2nd Pressure Collect Result_AutoClave     76312 non-null  float64
 2   3rd Pressure Collect Result_AutoClave     76312 non-null  float64
 3   Chamber Temp. Collect Result_AutoClave    76312 non-null  int64  
 4   Chamber_Temp_OKNG_AutoClave               76312 non-null  int64  
 5   1st_Pressure_x_AutoClave                  76312 non-null  float64
 6   2nd_Pressure_x_AutoClave                  76312 non-null  float64
 7   3rd_Pressure_x_AutoClave                  76312 non-null  float64
 8   All_Pressure_avg_AutoClave                76312 non-null  float64
 9   Chamber_Temp_x_AutoClave                  76312 non-null  int64  
 10  All_Pressure_frac_Chamber_Temp_Aut

## 3. 모델 학습

### 모델 정의

In [327]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt

import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetClassifier

In [328]:
import seaborn as sns
from pprint import pprint

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [329]:
import random

np.random.seed(42)
random.seed(42)
os.environ['PYTHONHASHSEED'] = str(42)

In [330]:
train_data = train_data.dropna(axis=1)
test_data = test_data.dropna(axis=1)

In [331]:
train_data_dam = train_data_dam.dropna(axis=1)
test_data_dam = test_data_dam.dropna(axis=1)

In [332]:
train_data_autoclave = train_data_autoclave.dropna(axis=1)
test_data_autoclave = test_data_autoclave.dropna(axis=1)

In [333]:
train_data_fill1 = train_data_fill1.dropna(axis=1)
test_data_fill1 = test_data_fill1.dropna(axis=1)

In [334]:
train_data_fill2 = train_data_fill2.dropna(axis=1)
test_data_fill2 = test_data_fill2.dropna(axis=1)

In [335]:
X_train = train_data.drop(columns=['target'])
y_train = train_data['target']

In [336]:
X_train_dam = train_data_dam.drop(columns=['target'])

In [337]:
X_train_autoclave = train_data_autoclave.drop(columns=['target'])

In [338]:
X_train_fill1 = train_data_fill1.drop(columns=['target'])

In [339]:
X_train_fill2 = train_data_fill2.drop(columns=['target'])

In [340]:
nunique = X_train_dam.nunique()
types = X_train_dam.dtypes

categorical_columns_dam = []
categorical_dims_dam =  {}
for col in X_train_dam.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, X_train_dam[col].nunique())
        l_enc = LabelEncoder()
        X_train_dam[col] = X_train_dam[col].fillna("VV_likely")
        X_train_dam[col] = l_enc.fit_transform(X_train_dam[col].values)
        categorical_columns_dam.append(col)
        categorical_dims_dam[col] = len(l_enc.classes_)
    else:
        X_train_dam.fillna(train_data.loc[:, col].mean(), inplace=True)

Model.Suffix 7
Workorder 663


In [341]:
nunique = X_train_autoclave.nunique()
types = X_train_autoclave.dtypes

categorical_columns_autoclave = []
categorical_dims_autoclave =  {}
for col in X_train_autoclave.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, X_train_autoclave[col].nunique())
        l_enc = LabelEncoder()
        X_train_autoclave[col] = X_train_autoclave[col].fillna("VV_likely")
        X_train_autoclave[col] = l_enc.fit_transform(X_train_autoclave[col].values)
        categorical_columns_autoclave.append(col)
        categorical_dims_autoclave[col] = len(l_enc.classes_)
    else:
        X_train_autoclave.fillna(train_data.loc[:, col].mean(), inplace=True)

Model.Suffix 7
Workorder 663


In [342]:
nunique = X_train_fill1.nunique()
types = X_train_fill1.dtypes

categorical_columns_fill1 = []
categorical_dims_fill1 =  {}
for col in X_train_fill1.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, X_train_fill1[col].nunique())
        l_enc = LabelEncoder()
        X_train_fill1[col] = X_train_fill1[col].fillna("VV_likely")
        X_train_fill1[col] = l_enc.fit_transform(X_train_fill1[col].values)
        categorical_columns_fill1.append(col)
        categorical_dims_fill1[col] = len(l_enc.classes_)
    else:
        X_train_fill1.fillna(train_data.loc[:, col].mean(), inplace=True)

Model.Suffix 7
Workorder 663


In [343]:
nunique = X_train_fill2.nunique()
types = X_train_fill2.dtypes

categorical_columns_fill2 = []
categorical_dims_fill2 =  {}
for col in X_train_fill2.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, X_train_fill2[col].nunique())
        l_enc = LabelEncoder()
        X_train_fill2[col] = X_train_fill2[col].fillna("VV_likely")
        X_train_fill2[col] = l_enc.fit_transform(X_train_fill2[col].values)
        categorical_columns_fill2.append(col)
        categorical_dims_fill2[col] = len(l_enc.classes_)
    else:
        X_train_fill2.fillna(train_data.loc[:, col].mean(), inplace=True)

Model.Suffix 7
Workorder 663


In [344]:
# features = [ col for col in X_train.columns] 
# cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]
# cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

In [345]:
# cat_idxs

In [346]:
from sklearn.model_selection import train_test_split

x_train_dam, x_valid_dam, y_train_dam, y_valid_dam = train_test_split(X_train_dam, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=34)

In [347]:
x_train_autoclave, x_valid_autoclave, y_train_autoclave, y_valid_autoclave = train_test_split(X_train_autoclave, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=34)

In [348]:
x_train_fill1, x_valid_fill1, y_train_fill1, y_valid_fill1 = train_test_split(X_train_fill1, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=34)

In [349]:
x_train_fill2, x_valid_fill2, y_train_fill2, y_valid_fill2 = train_test_split(X_train_fill2, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=34)

In [350]:
features = [ col for col in X_train_dam.columns] 
cat_idxs_dam = [ i for i, f in enumerate(features) if f in categorical_columns_dam]
cat_dims_dam = [ categorical_dims_dam[f] for i, f in enumerate(features) if f in categorical_columns_dam]

In [351]:
features = [ col for col in X_train_autoclave.columns] 
cat_idxs_autoclave = [ i for i, f in enumerate(features) if f in categorical_columns_autoclave]
cat_dims_autoclave = [ categorical_dims_autoclave[f] for i, f in enumerate(features) if f in categorical_columns_autoclave]

In [352]:
features = [ col for col in X_train_fill1.columns] 
cat_idxs_fill1 = [ i for i, f in enumerate(features) if f in categorical_columns_fill1]
cat_dims_fill1 = [ categorical_dims_fill1[f] for i, f in enumerate(features) if f in categorical_columns_fill1]

In [353]:
features = [ col for col in X_train_fill2.columns] 
cat_idxs_fill2 = [ i for i, f in enumerate(features) if f in categorical_columns_fill2]
cat_dims_fill2 = [ categorical_dims_fill2[f] for i, f in enumerate(features) if f in categorical_columns_fill2]

In [354]:
x_train_dam_np = x_train_dam.values
y_train_dam_np = y_train_dam.values
x_valid_dam_np = x_valid_dam.values
y_valid_dam_np = y_valid_dam.values

In [355]:
x_train_autoclave_np = x_train_autoclave.values
y_train_autoclave_np = y_train_autoclave.values
x_valid_autoclave_np = x_valid_autoclave.values
y_valid_autoclave_np = y_valid_autoclave.values

In [356]:
x_train_fill1_np = x_train_fill1.values
y_train_fill1_np = y_train_fill1.values
x_valid_fill1_np = x_valid_fill1.values
y_valid_fill1_np = y_valid_fill1.values

In [357]:
x_train_fill2_np = x_train_fill2.values
y_train_fill2_np = y_train_fill2.values
x_valid_fill2_np = x_valid_fill2.values
y_valid_fill2_np = y_valid_fill2.values

## 모델 학습

In [358]:
clf_dam = TabNetClassifier(cat_idxs=cat_idxs_dam,
                       cat_dims=cat_dims_dam,
                       cat_emb_dim=1000,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-2),
                       scheduler_params={"step_size":50,
                                         "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='sparsemax' # "sparsemax", entmax
                      )



In [359]:
clf_autoclave = TabNetClassifier(cat_idxs=cat_idxs_autoclave,
                       cat_dims=cat_dims_autoclave,
                       cat_emb_dim=1000,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-2),
                       scheduler_params={"step_size":50,
                                         "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='sparsemax' # "sparsemax", entmax
                      )

In [360]:
clf_fill1 = TabNetClassifier(cat_idxs=cat_idxs_fill1,
                       cat_dims=cat_dims_fill1,
                       cat_emb_dim=1000,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-2),
                       scheduler_params={"step_size":50,
                                         "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='sparsemax' # "sparsemax", entmax
                      )

In [361]:
clf_fill2 = TabNetClassifier(cat_idxs=cat_idxs_fill2,
                       cat_dims=cat_dims_fill2,
                       cat_emb_dim=1000,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-2),
                       scheduler_params={"step_size":50,
                                         "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='sparsemax' # "sparsemax", entmax
                      )

In [362]:
max_epochs = 20

clf_dam.fit(
    X_train=x_train_dam_np, y_train=y_train_dam_np,
    eval_set=[(x_train_dam_np, y_train_dam_np), (x_valid_dam_np, y_valid_dam_np)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=max_epochs , patience=20,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

epoch 0  | loss: 0.67643 | train_auc: 0.49813 | valid_auc: 0.49687 |  0:00:06s
epoch 1  | loss: 0.62983 | train_auc: 0.48857 | valid_auc: 0.47888 |  0:00:12s
epoch 2  | loss: 0.61738 | train_auc: 0.50533 | valid_auc: 0.51137 |  0:00:18s
epoch 3  | loss: 0.6071  | train_auc: 0.57517 | valid_auc: 0.57593 |  0:00:25s
epoch 4  | loss: 0.60302 | train_auc: 0.60596 | valid_auc: 0.60097 |  0:00:32s
epoch 5  | loss: 0.59715 | train_auc: 0.64235 | valid_auc: 0.63968 |  0:00:38s
epoch 6  | loss: 0.59435 | train_auc: 0.6711  | valid_auc: 0.66862 |  0:00:45s
epoch 7  | loss: 0.58868 | train_auc: 0.68477 | valid_auc: 0.68038 |  0:00:52s
epoch 8  | loss: 0.59066 | train_auc: 0.72471 | valid_auc: 0.71799 |  0:00:58s
epoch 9  | loss: 0.58598 | train_auc: 0.73844 | valid_auc: 0.73421 |  0:01:05s
epoch 10 | loss: 0.58192 | train_auc: 0.74822 | valid_auc: 0.73952 |  0:01:12s
epoch 11 | loss: 0.5816  | train_auc: 0.75045 | valid_auc: 0.74216 |  0:01:20s
epoch 12 | loss: 0.57577 | train_auc: 0.74348 | vali



In [363]:
max_epochs = 15

clf_autoclave.fit(
    X_train=x_train_autoclave_np, y_train=y_train_autoclave_np,
    eval_set=[(x_train_autoclave_np, y_train_autoclave_np), (x_valid_autoclave_np, y_valid_autoclave_np)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=max_epochs , patience=20,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

epoch 0  | loss: 0.69067 | train_auc: 0.5094  | valid_auc: 0.51058 |  0:00:06s
epoch 1  | loss: 0.62775 | train_auc: 0.62353 | valid_auc: 0.61948 |  0:00:14s
epoch 2  | loss: 0.60684 | train_auc: 0.68849 | valid_auc: 0.68797 |  0:00:21s
epoch 3  | loss: 0.59325 | train_auc: 0.71183 | valid_auc: 0.70772 |  0:00:27s
epoch 4  | loss: 0.59295 | train_auc: 0.72812 | valid_auc: 0.72378 |  0:00:35s
epoch 5  | loss: 0.5876  | train_auc: 0.73337 | valid_auc: 0.72711 |  0:00:41s
epoch 6  | loss: 0.57801 | train_auc: 0.73861 | valid_auc: 0.73156 |  0:00:49s
epoch 7  | loss: 0.5723  | train_auc: 0.74163 | valid_auc: 0.73952 |  0:00:55s
epoch 8  | loss: 0.57397 | train_auc: 0.74831 | valid_auc: 0.74171 |  0:01:03s
epoch 9  | loss: 0.56766 | train_auc: 0.75602 | valid_auc: 0.74845 |  0:01:10s
epoch 10 | loss: 0.56527 | train_auc: 0.76364 | valid_auc: 0.75755 |  0:01:17s
epoch 11 | loss: 0.56044 | train_auc: 0.75905 | valid_auc: 0.75217 |  0:01:24s
epoch 12 | loss: 0.55769 | train_auc: 0.76669 | vali



In [364]:
max_epochs = 20

clf_fill1.fit(
    X_train=x_train_fill1_np, y_train=y_train_fill1_np,
    eval_set=[(x_train_fill1_np, y_train_fill1_np), (x_valid_fill1_np, y_valid_fill1_np)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=max_epochs , patience=20,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

epoch 0  | loss: 0.69156 | train_auc: 0.5353  | valid_auc: 0.53666 |  0:00:06s
epoch 1  | loss: 0.63506 | train_auc: 0.63476 | valid_auc: 0.63407 |  0:00:12s
epoch 2  | loss: 0.6252  | train_auc: 0.67395 | valid_auc: 0.67736 |  0:00:19s
epoch 3  | loss: 0.6121  | train_auc: 0.69833 | valid_auc: 0.69719 |  0:00:25s
epoch 4  | loss: 0.60752 | train_auc: 0.71089 | valid_auc: 0.70808 |  0:00:32s
epoch 5  | loss: 0.59857 | train_auc: 0.72425 | valid_auc: 0.72095 |  0:00:38s
epoch 6  | loss: 0.59295 | train_auc: 0.73484 | valid_auc: 0.73059 |  0:00:45s
epoch 7  | loss: 0.58486 | train_auc: 0.74289 | valid_auc: 0.73719 |  0:00:51s
epoch 8  | loss: 0.58278 | train_auc: 0.74542 | valid_auc: 0.73732 |  0:00:58s
epoch 9  | loss: 0.57893 | train_auc: 0.75178 | valid_auc: 0.74393 |  0:01:04s
epoch 10 | loss: 0.57513 | train_auc: 0.75565 | valid_auc: 0.74685 |  0:01:11s
epoch 11 | loss: 0.57324 | train_auc: 0.75996 | valid_auc: 0.75438 |  0:01:18s
epoch 12 | loss: 0.56956 | train_auc: 0.7658  | vali



In [365]:
max_epochs = 20

clf_fill2.fit(
    X_train=x_train_fill2_np, y_train=y_train_fill2_np,
    eval_set=[(x_train_fill2_np, y_train_fill2_np), (x_valid_fill2_np, y_valid_fill2_np)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=max_epochs , patience=20,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

epoch 0  | loss: 0.65976 | train_auc: 0.58885 | valid_auc: 0.57916 |  0:00:06s
epoch 1  | loss: 0.61758 | train_auc: 0.69396 | valid_auc: 0.68947 |  0:00:12s
epoch 2  | loss: 0.60082 | train_auc: 0.70493 | valid_auc: 0.70408 |  0:00:18s
epoch 3  | loss: 0.59057 | train_auc: 0.71693 | valid_auc: 0.71538 |  0:00:24s
epoch 4  | loss: 0.58757 | train_auc: 0.73517 | valid_auc: 0.73539 |  0:00:30s
epoch 5  | loss: 0.58306 | train_auc: 0.74086 | valid_auc: 0.73805 |  0:00:37s
epoch 6  | loss: 0.57637 | train_auc: 0.74044 | valid_auc: 0.73767 |  0:00:43s
epoch 7  | loss: 0.57355 | train_auc: 0.74828 | valid_auc: 0.7434  |  0:00:49s
epoch 8  | loss: 0.57361 | train_auc: 0.74415 | valid_auc: 0.74052 |  0:00:55s
epoch 9  | loss: 0.57275 | train_auc: 0.74675 | valid_auc: 0.74067 |  0:01:02s
epoch 10 | loss: 0.5689  | train_auc: 0.75053 | valid_auc: 0.74543 |  0:01:08s
epoch 11 | loss: 0.56676 | train_auc: 0.75462 | valid_auc: 0.74871 |  0:01:14s
epoch 12 | loss: 0.56579 | train_auc: 0.75993 | vali



In [366]:
test_data_dam

Unnamed: 0,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,Dispense Volume(Stage1) Collect Result_Dam,Dispense Volume(Stage2) Collect Result_Dam,Dispense Volume(Stage3) Collect Result_Dam,Head Clean Position Z Collect Result_Dam,Head Purge Position Z Collect Result_Dam,Head Zero Position Y Collect Result_Dam,...,Model.Suffix,Workorder,WorkMode Collect Result,Dispenser_1,Dispenser_2,Receip_No_Collect_Result,Production_Qty_Collect_Result,Judge_Value_OK,Workorder_0.9,Workorder_0.6
0,10,17.0,4.9,17.0,1.19,0.34,1.19,130.85,130.85,300.0,...,AJX75334501,3J1XF767-1,1.0,0,1,1.0,195.0,0,0,1
1,16,14.2,8.3,14.2,0.99,0.58,0.99,124.00,130.85,300.0,...,AJX75334501,4B1XD472-2,0.0,0,1,1.0,256.0,1,0,0
2,10,9.7,4.9,9.7,0.67,0.34,0.67,133.50,133.50,300.0,...,AJX75334501,3H1XE355-1,1.0,1,0,1.0,98.0,0,0,0
3,10,21.3,10.6,21.3,1.49,0.74,1.49,130.85,130.85,300.0,...,AJX75334501,3L1XA128-1,0.0,0,1,1.0,0.0,0,0,0
4,16,13.2,7.5,13.2,0.92,0.52,0.92,130.85,130.85,300.0,...,AJX75334501,4A1XA639-1,0.0,1,0,1.0,215.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,10,21.3,10.6,21.3,1.49,0.74,1.49,130.85,130.85,300.0,...,AJX75334501,3K1XB597-1,0.0,0,1,1.0,131.0,0,0,0
17357,16,13.2,7.6,13.2,0.92,0.53,0.92,130.85,130.85,300.0,...,AJX75334501,4A1XB974-1,0.0,0,1,1.0,279.0,1,0,0
17358,16,13.2,6.6,13.2,1.45,0.72,1.45,130.85,130.85,300.0,...,AJX75334501,3L1XA998-1,0.0,1,0,1.0,66.0,0,0,1
17359,10,9.7,3.9,9.7,0.67,0.27,0.67,133.50,133.50,303.5,...,AJX75334501,3F1XC376-1,1.0,1,0,1.0,117.0,0,0,1


In [367]:
test_data_dam = test_data_dam.drop(columns = ['Set ID'])
test_data_autoclave = test_data_autoclave.drop(columns = ['Set ID'])
test_data_fill1 = test_data_fill1.drop(columns = ['Set ID'])
test_data_fill2 = test_data_fill2.drop(columns = ['Set ID'])

In [368]:
nunique = test_data_dam.nunique()
types = test_data_dam.dtypes

for col in test_data_dam.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, test_data_dam[col].nunique())
        l_enc = LabelEncoder()
        test_data_dam[col] = l_enc.fit_transform(test_data_dam[col].values)
    else:
        test_data_dam.fillna(train_data.loc[:, col].mean(), inplace=True)

Model.Suffix 7
Workorder 662


In [369]:
nunique = test_data_autoclave.nunique()
types = test_data_autoclave.dtypes

for col in test_data_autoclave.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, test_data_autoclave[col].nunique())
        l_enc = LabelEncoder()
        test_data_autoclave[col] = l_enc.fit_transform(test_data_autoclave[col].values)
    else:
        test_data_autoclave.fillna(train_data.loc[:, col].mean(), inplace=True)

Model.Suffix 7
Workorder 662


In [370]:
nunique = test_data_fill1.nunique()
types = test_data_fill1.dtypes

for col in test_data_fill1.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, test_data_fill1[col].nunique())
        l_enc = LabelEncoder()
        test_data_fill1[col] = l_enc.fit_transform(test_data_fill1[col].values)
    else:
        test_data_fill1.fillna(train_data.loc[:, col].mean(), inplace=True)

Model.Suffix 7
Workorder 662


In [371]:
nunique = test_data_fill2.nunique()
types = test_data_fill2.dtypes

for col in test_data_fill2.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, test_data_fill2[col].nunique())
        l_enc = LabelEncoder()
        test_data_fill2[col] = l_enc.fit_transform(test_data_fill2[col].values)
    else:
        test_data_fill2.fillna(train_data.loc[:, col].mean(), inplace=True)

Model.Suffix 7
Workorder 662


In [372]:
x_test_dam_np = test_data_dam.values

In [373]:
x_test_autoclave_np = test_data_autoclave.values

In [374]:
x_test_fill1_np = test_data_fill1.values

In [375]:
x_test_fill2_np = test_data_fill2.values

In [424]:
preds_dam = clf_dam.predict_proba(x_test_dam_np)

In [377]:
preds_autoclave = clf_autoclave.predict_proba(x_test_autoclave_np)

In [378]:
preds_fill1 = clf_fill1.predict_proba(x_test_fill1_np)

In [379]:
preds_fill2 = clf_fill2.predict_proba(x_test_fill2_np)

In [425]:
n=0
j=0
for i in preds_dam:
    if i[0] > i[1]:
        n = n+1
    else:
        j = j+1

In [426]:
len(preds_dam)

17361

In [427]:
print(n)
print(j)

6447
10914


In [383]:
n=0
j=0
for i in preds_autoclave:
    if i[0] > i[1]:
        n = n+1
    else:
        j = j+1

In [384]:
print(n)
print(j)

6543
10818


In [385]:
n=0
j=0
for i in preds_fill1:
    if i[0] > i[1]:
        n = n+1
    else:
        j = j+1

In [386]:
print(n)
print(j)

6329
11032


In [387]:
n=0
j=0
for i in preds_fill2:
    if i[0] > i[1]:
        n = n+1
    else:
        j = j+1

In [388]:
print(n)
print(j)

6072
11289


In [465]:
df_target_dam = np.where(preds_dam[:,1] >= 0.5, 1, 0)

In [466]:
df_target_total = pd.DataFrame(df_target_dam, columns=['target_dam'])

In [467]:
df_target_total.value_counts()

target_dam
1             10914
0              6447
Name: count, dtype: int64

In [468]:
df_target_autoclave = np.where(preds_autoclave[:,1] >= 0.5, 1, 0)

In [469]:
df_target_fill1 = np.where(preds_fill1[:,1] >= 0.5, 1, 0)

In [470]:
df_target_fill2 = np.where(preds_fill2[:,1] >= 0.5, 1, 0)

In [471]:
df_target_autoclave = pd.DataFrame(df_target_autoclave, columns=['target_autoclave'])

In [473]:
df_target_fill1 = pd.DataFrame(df_target_fill1, columns=['target_fill1'])

In [472]:
df_target_fill2 = pd.DataFrame(df_target_fill2, columns=['target_fill2'])

In [475]:
df_target_all = pd.concat([df_target_total, df_target_autoclave, df_target_fill1, df_target_fill2], axis=1)

In [478]:
for col in df_target_all.columns :
    cnt = df_target_all[col].value_counts()
    print(f"{cnt}\n")

target_dam
1    10914
0     6447
Name: count, dtype: int64

target_autoclave
1    10818
0     6543
Name: count, dtype: int64

target_fill1
1    11032
0     6329
Name: count, dtype: int64

target_fill2
1    11289
0     6072
Name: count, dtype: int64



In [496]:
df_target_all['sum'] = df_target_all[['target_dam', 'target_autoclave', 'target_fill1', 'target_fill2']].sum(axis=1)

# 1의 개수가 2개 이상이면 "Normal", 그렇지 않으면 "AbNormal"
df_target_all['final'] = df_target_all['sum'].apply(lambda x: 'Normal' if x >= 1 else 'AbNormal')

# 중간 계산에 사용한 'sum_ones' 열 제거 (선택 사항)
df_target_all.drop(columns=['sum'], inplace=True)

In [497]:
df_target_all['final'].value_counts()

final
Normal      13937
AbNormal     3424
Name: count, dtype: int64

In [498]:
submission = pd.read_csv('submission.csv')

In [500]:
submission['target'] = df_target_all['final']

In [501]:
submission['target'].value_counts()

target
Normal      13937
AbNormal     3424
Name: count, dtype: int64

In [502]:
submission.to_csv("submission.csv", index=False)

In [None]:
#일단 0.173874점 기록했음... 좀 더 올릴수는 있을거라고 생각함돠

In [505]:
# from sklearn.metrics import accuracy_score

# accuracy = accuracy_score(y_valid_dam_np, preds_dam[:,1])

In [506]:
# from sklearn.metrics import f1_score

# f1_score = f1_score(y_valid_dam_np, preds_dam[:,1], average='micro')

In [507]:
# accuracy

In [508]:
# f1_score

In [509]:
# correct_predictions = np.sum(y_valid_dam_np ==  df_target)

In [510]:
# incorrect_predictions = np.sum(y_valid_dam_np != df_target)

In [511]:
# correct_predictions

In [512]:
# incorrect_predictions

In [437]:
# true_positives = np.sum((y_valid_dam_np == 1) & (df_target == 1))
# true_negatives = np.sum((y_valid_dam_np == 0) & (df_target == 0))

In [438]:
# true_positives

In [439]:
# true_negatives

In [440]:
# y_valid_dam.value_counts()

In [441]:
# false_positives = np.sum((y_valid_dam_np == 1) & (df_target == 0))

# # y_pred가 0이고 y_true가 1인 경우의 개수 (False Negatives)
# false_negatives = np.sum((y_valid_dam_np == 0) & (df_target == 1))

In [442]:
# false_positives

In [443]:
# false_negatives