# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [132]:
import numpy as np
import pandas as pd

### 데이터 읽어오기


In [133]:
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")

## 2. 데이터 전처리

### 2-1. 데이터 결측값 처리

In [134]:
import pandas as pd

def shift_row_values(row, start_col_index, move_limit, total_columns):
    """Pull values forward in 3-column strides wherever a gap is found.

    Scans positions ``start_col_index .. total_columns - 1`` of ``row``. When
    a position holds a missing value or the string ``"OK"``, every value that
    sits 3 positions further along (same stride, up to the end of the row) is
    moved 3 positions forward and its old slot is set to ``None``.

    All access is positional (``.iloc``): the row is indexed by column labels,
    so plain ``row[int]`` would depend on pandas' deprecated positional
    fallback for integer keys.

    Args:
        row: One DataFrame row as a Series; it is modified in place.
        start_col_index: Positional index at which scanning starts.
        move_limit: Maximum number of single-value moves for this row.
        total_columns: Number of positions in the row.

    Returns:
        The same ``row`` object after shifting.
    """
    move_count = 0  # number of single-value moves performed so far
    for col_index in range(start_col_index, total_columns):
        current = row.iloc[col_index]
        if pd.isna(current) or current == "OK":
            # Gap found: shift every 3rd value after it one stride forward.
            for shift_index in range(col_index, total_columns - 3, 3):
                row.iloc[shift_index] = row.iloc[shift_index + 3]
                row.iloc[shift_index + 3] = None  # vacate the source slot
                move_count += 1

                if move_count >= move_limit:
                    break
        # The inner break only leaves the stride loop; stop scanning as well.
        if move_count >= move_limit:
            break
    return row

def shift_values(data, start_col_index, move_limit):
    """Run ``shift_row_values`` over every row of ``data``.

    Args:
        data: DataFrame whose rows are processed one by one.
        start_col_index: Positional column index where scanning starts.
        move_limit: Maximum number of moves allowed per row.

    Returns:
        The result of the row-wise ``apply``.
    """
    n_columns = data.shape[1]
    return data.apply(
        shift_row_values,
        axis=1,
        args=(start_col_index, move_limit, n_columns),
    )

# (start column name, maximum number of moves) for each shifting pass
variables_with_limits = list(zip(
    (
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
    ),
    (52, 22, 22),
))

# Apply the shifting pass once per configured start column, then save
def process_data(data, variables_with_limits, output_file):
    """Shift values for each configured start column and write a CSV.

    Args:
        data: DataFrame to process.
        variables_with_limits: Iterable of ``(start column name, move limit)``.
        output_file: Path of the CSV file to write (without the index).
    """
    for column_name, limit in variables_with_limits:
        # Positional index of the column where this pass starts
        first_position = data.columns.get_loc(column_name)
        data = shift_values(data, first_position, limit)
    data.to_csv(output_file, index=False)
    print(f'데이터가 성공적으로 수정되고 저장되었습니다: {output_file}')

# Process both splits and write the cleaned CSV files
for _frame, _csv_path in (
    (train_data, './data/clean_train_data.csv'),
    (test_data, './data/clean_test_data.csv'),
):
    process_data(_frame, variables_with_limits, _csv_path)

데이터가 성공적으로 수정되고 저장되었습니다: ./data/clean_train_data.csv
데이터가 성공적으로 수정되고 저장되었습니다: ./data/clean_test_data.csv


In [135]:
# Reload the cleaned CSV files written by the previous step
train_data, test_data = [
    pd.read_csv(_path)
    for _path in ('./data/clean_train_data.csv', './data/clean_test_data.csv')
]

### 2-2. 기본 전처리  

In [136]:
# Map every column name containing '?' to the same name with 'Θ' instead
train_new_columns = {
    name: name.replace('?', 'Θ')
    for name in train_data.filter(like='?').columns
}
test_new_columns = {
    name: name.replace('?', 'Θ')
    for name in test_data.filter(like='?').columns
}

# Rename in place on both frames
for _frame, _mapping in ((train_data, train_new_columns), (test_data, test_new_columns)):
    _frame.rename(columns=_mapping, inplace=True)

# Column labels that now contain 'Θ'
train_Process_Desc_col = train_data.filter(like='Θ').columns
test_Process_Desc_col = test_data.filter(like='Θ').columns

In [137]:
# Set the target columns aside so they can be restored after the drop below
target_train, target_test = train_data['target'], test_data['target']

# Remove columns that are entirely NaN, then write target back
train_data = train_data.dropna(axis=1, how='all')
train_data['target'] = target_train

test_data = test_data.dropna(axis=1, how='all')
test_data['target'] = target_test

In [138]:
# Drop every column whose name contains 'Wip Line' (names taken from train)
wip_line_columns = train_data.filter(like='Wip Line').columns

for _frame in (train_data, test_data):
    _frame.drop(columns=wip_line_columns, inplace=True)

In [139]:
# Drop every column whose name contains 'Process Desc' (names taken from train)
Process_Desc_col = train_data.filter(like='Process Desc').columns

for _frame in (train_data, test_data):
    _frame.drop(columns=Process_Desc_col, inplace=True)

In [140]:
# Drop every column whose name contains 'Insp. Seq No' (names taken from train)
Insp_Seq_No_col = train_data.filter(like='Insp. Seq No').columns

for _frame in (train_data, test_data):
    _frame.drop(columns=Insp_Seq_No_col, inplace=True)

In [141]:
# Drop every column whose name contains 'Insp Judge Code' (names taken from train)
Insp_Judge_Code_col = train_data.filter(like='Insp Judge Code').columns

for _frame in (train_data, test_data):
    _frame.drop(columns=Insp_Judge_Code_col, inplace=True)

### 2. 제품 구분

사용 변수: Receip No, Workorder, Model.Suffix

In [142]:
### Receip_No
# Derived feature: the shared value when the three Receip No columns agree,
# otherwise the string 'diff'
_receip_no_columns = [
    'Receip No Collect Result_Dam',
    'Receip No Collect Result_Fill1',
    'Receip No Collect Result_Fill2',
]


def _receip_no_or_diff(row):
    """Return the Dam Receip No if all three columns match, else 'diff'."""
    dam_value = row['Receip No Collect Result_Dam']
    if dam_value == row['Receip No Collect Result_Fill1'] == row['Receip No Collect Result_Fill2']:
        return dam_value
    return 'diff'


train_data['Receip_No'] = train_data.apply(_receip_no_or_diff, axis=1)
test_data['Receip_No'] = test_data.apply(_receip_no_or_diff, axis=1)

# Drop the source columns
train_data, test_data = [
    _frame.drop(columns=_receip_no_columns) for _frame in (train_data, test_data)
]

In [143]:
### model_receip
# Derived feature: Model.Suffix_Dam and Receip_No joined with '_'
_train_receip_text = train_data['Receip_No'].astype(str)
_test_receip_text = test_data['Receip_No'].astype(str)
train_data['model_receip'] = train_data['Model.Suffix_Dam'] + '_' + _train_receip_text
test_data['model_receip'] = test_data['Model.Suffix_Dam'] + '_' + _test_receip_text

# Drop the Model.Suffix source columns
_model_suffix_columns = [
    'Model.Suffix_Dam',
    'Model.Suffix_AutoClave',
    'Model.Suffix_Fill1',
    'Model.Suffix_Fill2',
]
train_data, test_data = [
    _frame.drop(columns=_model_suffix_columns) for _frame in (train_data, test_data)
]

In [144]:
# ### workorder_receip
# # Workorder -뒤의 번호 구분을 제거
# train_data['cleaned_workorder'] = train_data['Workorder_Dam'].str.split('-').str[0]
# test_data['cleaned_workorder'] = test_data['Workorder_Dam'].str.split('-').str[0]

# # 파생변수 생성: Receip No와 workorder의 조합
# train_data['workorder_receip'] = train_data['cleaned_workorder'] + '_' + train_data['Receip_No'].astype(str)
# test_data['workorder_receip'] = test_data['cleaned_workorder'] + '_' + test_data['Receip_No'].astype(str)

# # 필요없는 변수 삭제
# train_data = train_data.drop(columns=['Receip_No', 'cleaned_workorder', 'Workorder_Dam', 'Workorder_AutoClave', 'Workorder_Fill1', 'Workorder_Fill2'])
# test_data = test_data.drop(columns=['Receip_No', 'cleaned_workorder', 'Workorder_Dam', 'Workorder_AutoClave', 'Workorder_Fill1', 'Workorder_Fill2'])

In [145]:
### cleaned_workorder
# Keep only the first four characters of Workorder_Dam
train_data['cleaned_workorder'] = train_data['Workorder_Dam'].str.slice(stop=4)
test_data['cleaned_workorder'] = test_data['Workorder_Dam'].str.slice(stop=4)

# Drop Receip_No and the Workorder source columns
_workorder_columns = [
    'Receip_No',
    'Workorder_Dam',
    'Workorder_AutoClave',
    'Workorder_Fill1',
    'Workorder_Fill2',
]
train_data, test_data = [
    _frame.drop(columns=_workorder_columns) for _frame in (train_data, test_data)
]

### 3. 공통 변수 (dam, fill1, fill2)

- workmode

In [146]:
# Keep the Dam WorkMode column under a process-neutral name and drop the
# Fill1 / Fill2 copies
_workmode_rename = {'WorkMode Collect Result_Dam': 'WorkMode Collect Result'}
_workmode_extra = ['WorkMode Collect Result_Fill1', 'WorkMode Collect Result_Fill2']

train_data = train_data.rename(columns=_workmode_rename).drop(columns=_workmode_extra)
test_data = test_data.rename(columns=_workmode_rename).drop(columns=_workmode_extra)

In [147]:
# Recode WorkMode Collect Result: 7 becomes 1, missing values become 0
for _frame in (train_data, test_data):
    _frame['WorkMode Collect Result'] = (
        _frame['WorkMode Collect Result'].replace(7, 1).fillna(0)
    )

- equipment  
(dispenser1 & dispenser2 변수를 만들 경우 다른 변수들에 의해  
이미 설명이 되는 변수라 상관계수가 너무 높아서 제거하게 됨.  
따라서 equipment가 같은지만 판단하는 파생변수 사용)

In [148]:
# Sub-frames holding only the columns whose name contains 'Equipment'
new_train = train_data.filter(like='Equipment')
new_test = test_data.filter(like='Equipment')

# Labels of those columns for each split
Equipment_col, Equipment_col2 = new_train.columns, new_test.columns

# Equipment_same_num derived feature
def determine_equipment_same_num(row):
    """Return 1 when all dispensers carry the same number, else 0.

    A row qualifies when Dam, Fill1 and Fill2 are all dispenser #1 or all
    dispenser #2, and the autoclave equipment is 'Auto Clave Out'.

    Args:
        row: Mapping-like row with the four ``Equipment_*`` keys.
    """
    keys = ('Equipment_Dam', 'Equipment_AutoClave', 'Equipment_Fill1', 'Equipment_Fill2')
    accepted_lines = (
        ('Dam dispenser #1', 'Auto Clave Out', 'Fill1 dispenser #1', 'Fill2 dispenser #1'),
        ('Dam dispenser #2', 'Auto Clave Out', 'Fill1 dispenser #2', 'Fill2 dispenser #2'),
    )
    for expected in accepted_lines:
        # all() stops at the first mismatch, so keys are read in the listed order
        if all(row[key] == value for key, value in zip(keys, expected)):
            return 1
    return 0

# Compute the feature row by row on the Equipment-only sub-frames
for _frame, _equipment in ((train_data, new_train), (test_data, new_test)):
    _frame['Equipment_same_num'] = _equipment.apply(determine_equipment_same_num, axis=1)

# Drop the raw Equipment columns
_equipment_columns = ['Equipment_Dam', 'Equipment_AutoClave', 'Equipment_Fill1', 'Equipment_Fill2']
train_data, test_data = [
    _frame.drop(columns=_equipment_columns) for _frame in (train_data, test_data)
]

- palletID

In [149]:
# Derived feature: the shared value when the three PalletID columns agree,
# otherwise 'diff'
def create_palletid_collect_result(df):
    """Add ``PalletID_Collect_Result`` to ``df`` in place.

    Each row gets the Dam PalletID when the Dam, Fill1 and Fill2 values are
    all equal, and the string ``'diff'`` otherwise.

    Args:
        df: DataFrame holding the three ``PalletID Collect Result_*`` columns.
    """
    def pick(row):
        dam_value = row['PalletID Collect Result_Dam']
        if dam_value == row['PalletID Collect Result_Fill1'] == row['PalletID Collect Result_Fill2']:
            return dam_value
        return 'diff'

    df['PalletID_Collect_Result'] = df.apply(pick, axis=1)

# Add the PalletID feature to both splits
for _frame in (train_data, test_data):
    create_palletid_collect_result(_frame)

In [150]:
# Raw PalletID columns to remove
columns_to_drop = [
    'PalletID Collect Result_Dam',
    'PalletID Collect Result_Fill1',
    'PalletID Collect Result_Fill2',
]

# Remove them from both splits in place
for _frame in (train_data, test_data):
    _frame.drop(columns=columns_to_drop, inplace=True)

- production Qty

In [151]:
# Derived feature: the shared value when the three Production Qty columns
# agree, otherwise 0
def create_palletid_collect_result(df):
    """Add ``Production_Qty_Collect_Result`` to ``df`` in place.

    Each row gets the Dam Production Qty when the Dam, Fill1 and Fill2 values
    are all equal, and ``0`` otherwise.

    NOTE(review): despite its name this function handles Production Qty, and
    defining it rebinds the PalletID helper of the same name.

    Args:
        df: DataFrame holding the three ``Production Qty Collect Result_*``
            columns.
    """
    def pick(row):
        dam_value = row['Production Qty Collect Result_Dam']
        if dam_value == row['Production Qty Collect Result_Fill1'] == row['Production Qty Collect Result_Fill2']:
            return dam_value
        return 0

    df['Production_Qty_Collect_Result'] = df.apply(pick, axis=1)

# Add the Production Qty feature to both splits
for _frame in (train_data, test_data):
    create_palletid_collect_result(_frame)

In [152]:
# Raw Production Qty columns to remove
columns_to_drop = [
    'Production Qty Collect Result_Dam',
    'Production Qty Collect Result_Fill1',
    'Production Qty Collect Result_Fill2',
]

# Remove them from both splits in place
for _frame in (train_data, test_data):
    _frame.drop(columns=columns_to_drop, inplace=True)

### 4. CURE 변수
- dam -> distance 파생변수 (standby는 단일값이고, start와 end는 값이 여러 개지만 distance 파생변수로 만들었을 때 더 의미 있었음)
- fill2 -> 변수값 범주화 (start, end, standby를 각각 범주화했을 때가, 합쳐서 distance를 만들었을 때보다 더 의미 있었음)

In [153]:
### dam
# Inputs for the cure start/end positions
start_x_col = 'CURE START POSITION X Collect Result_Dam'
# NOTE: a fixed numeric start Z value, not a column name (the name is kept
# for compatibility) -- presumably the start Z is constant in this data; confirm
start_z_col = 33.5
end_x_col = 'CURE END POSITION X Collect Result_Dam'
end_z_col = 'CURE END POSITION Z Collect Result_Dam'


def _cure_distance_dam(frame):
    """Return the X/Z distance between cure start and end for ``frame``."""
    return np.sqrt(
        (frame[end_x_col] - frame[start_x_col]) ** 2 +
        (frame[end_z_col] - start_z_col) ** 2
    )


# Each split is computed from its own rows. The test feature used to be
# derived from train_data, which filled it with index-aligned train values.
train_data['CURE_DISTANCE_Dam'] = _cure_distance_dam(train_data)
test_data['CURE_DISTANCE_Dam'] = _cure_distance_dam(test_data)

In [154]:
### fill2
# Combine the UV cure X and Z coordinates into one text value
def create_coordinate_columns(data):
    """Add 'X,Z' string columns for the Fill2 cure end/start/standby positions.

    Both coordinates are converted to strings and joined with a comma. The
    frame is modified in place and nothing is returned.

    Args:
        data: DataFrame holding the Fill2 CURE END/START/STANDBY X and Z columns.
    """
    xz_sources = {
        'cure_end_position_XZ_Fill2': (
            'CURE END POSITION X Collect Result_Fill2',
            'CURE END POSITION Z Collect Result_Fill2',
        ),
        'cure_start_position_XZ_Fill2': (
            'CURE START POSITION X Collect Result_Fill2',
            'CURE START POSITION Z Collect Result_Fill2',
        ),
        'cure_standby_position_XZ_Fill2': (
            'CURE STANDBY POSITION X Collect Result_Fill2',
            'CURE STANDBY POSITION Z Collect Result_Fill2',
        ),
    }
    for combined_name, (x_name, z_name) in xz_sources.items():
        data[combined_name] = data[x_name].astype(str) + ',' + data[z_name].astype(str)

# Add the combined coordinate columns to both splits
for _frame in (train_data, test_data):
    create_coordinate_columns(_frame)

In [155]:
# Raw CURE position columns (Dam and Fill2) to remove
columns_to_drop = [
    # Dam
    'CURE END POSITION X Collect Result_Dam',
    'CURE END POSITION Z Collect Result_Dam',
    'CURE END POSITION Θ Collect Result_Dam',
    'CURE START POSITION X Collect Result_Dam',
    'CURE START POSITION Z Collect Result_Dam',
    'CURE START POSITION Θ Collect Result_Dam',
    # Fill2
    'CURE END POSITION X Collect Result_Fill2',
    'CURE END POSITION Z Collect Result_Fill2',
    'CURE END POSITION Θ Collect Result_Fill2',
    'CURE START POSITION X Collect Result_Fill2',
    'CURE START POSITION Z Collect Result_Fill2',
    'CURE START POSITION Θ Collect Result_Fill2',
    'CURE STANDBY POSITION X Collect Result_Fill2',
    'CURE STANDBY POSITION Z Collect Result_Fill2',
    'CURE STANDBY POSITION Θ Collect Result_Fill2',
]

# Remove them from both splits in place
for _frame in (train_data, test_data):
    _frame.drop(columns=columns_to_drop, inplace=True)

### 5. HEAD 변수

- dam

In [156]:
# (X, Y, Z) head-coordinate column names for each Dam stage
stage1_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam',
]
stage2_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',
]
stage3_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam',
]


# Pairwise distances between the stage coordinates
def calculate_distances(data):
    """Add the three pairwise Euclidean distances between Dam stages.

    Args:
        data: DataFrame holding the columns named in ``stage1_cols``,
            ``stage2_cols`` and ``stage3_cols``; it is modified in place.

    Returns:
        The same ``data`` object with three distance columns added.
    """
    def separation(from_cols, to_cols):
        dx, dy, dz = (data[to] - data[frm] for frm, to in zip(from_cols, to_cols))
        return np.sqrt(dx ** 2 + dy ** 2 + dz ** 2)

    data['HEAD NORMAL DISTANCE_STAGE1_STAGE2_Dam'] = separation(stage1_cols, stage2_cols)
    data['HEAD NORMAL DISTANCE_STAGE2_STAGE3_Dam'] = separation(stage2_cols, stage3_cols)
    data['HEAD NORMAL DISTANCE_STAGE1_STAGE3_Dam'] = separation(stage1_cols, stage3_cols)
    return data

# Add the Dam stage distances to both splits
train_data, test_data = [calculate_distances(_frame) for _frame in (train_data, test_data)]

In [157]:
# Names of the distance columns used as triangle sides
stage1_stage2_col = 'HEAD NORMAL DISTANCE_STAGE1_STAGE2_Dam'
stage2_stage3_col = 'HEAD NORMAL DISTANCE_STAGE2_STAGE3_Dam'
stage1_stage3_col = 'HEAD NORMAL DISTANCE_STAGE1_STAGE3_Dam'

# Area and height of the triangle spanned by the three stage positions
def calculate_triangle_features(data):
    """Add the Dam triangle area and height columns.

    The three stage-to-stage distances are the triangle sides. The area
    comes from Heron's formula; the height is taken over the
    stage1-stage3 side.

    Args:
        data: DataFrame holding the three distance columns; it is modified
            in place.

    Returns:
        The same ``data`` object with the area and height columns added.
    """
    a = data[stage1_stage2_col]
    b = data[stage2_stage3_col]
    c = data[stage1_stage3_col]

    # Heron's formula. For (near-)collinear points floating-point rounding
    # can push the radicand slightly below zero, which would turn a
    # zero-area triangle into NaN, so the radicand is clamped at 0.
    s = (a + b + c) / 2
    radicand = s * (s - a) * (s - b) * (s - c)
    area = np.sqrt(radicand.clip(lower=0))

    # Height over base c (a zero-length base still yields NaN/inf here)
    height = (2 * area) / c

    data['HEAD NORMAL DISTANCE_TRIANGLE_area_Dam'] = area
    data['HEAD NORMAL DISTANCE_TRIANGLE_height_Dam'] = height

    return data

# Add the Dam triangle features to both splits
train_data, test_data = [
    calculate_triangle_features(_frame) for _frame in (train_data, test_data)
]

In [158]:
# Dam head-coordinate columns and two of the three distances to remove
columns_to_drop = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL DISTANCE_STAGE1_STAGE2_Dam',
    'HEAD NORMAL DISTANCE_STAGE2_STAGE3_Dam',
]

# Remove them from both splits in place
for _frame in (train_data, test_data):
    _frame.drop(columns=columns_to_drop, inplace=True)

In [159]:
# Drop the Dam 'Head Zero Position Z' column from both splits
for _frame in (train_data, test_data):
    _frame.drop(columns='Head Zero Position Z Collect Result_Dam', inplace=True)

- fill1

In [160]:
# (X, Y, Z) head-coordinate column names for each Fill1 stage
stage1_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
]
stage2_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
]
stage3_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',
]


# Pairwise distances between the stage coordinates
def calculate_distances(data):
    """Add the three pairwise Euclidean distances between Fill1 stages.

    Args:
        data: DataFrame holding the columns named in ``stage1_cols``,
            ``stage2_cols`` and ``stage3_cols``; it is modified in place.

    Returns:
        The same ``data`` object with three distance columns added.
    """
    def separation(from_cols, to_cols):
        dx, dy, dz = (data[to] - data[frm] for frm, to in zip(from_cols, to_cols))
        return np.sqrt(dx ** 2 + dy ** 2 + dz ** 2)

    data['HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill1'] = separation(stage1_cols, stage2_cols)
    data['HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill1'] = separation(stage2_cols, stage3_cols)
    data['HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill1'] = separation(stage1_cols, stage3_cols)
    return data

# Add the Fill1 stage distances to both splits
train_data, test_data = [calculate_distances(_frame) for _frame in (train_data, test_data)]

In [161]:
# Names of the distance columns used as triangle sides
stage1_stage2_col = 'HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill1'
stage2_stage3_col = 'HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill1'
stage1_stage3_col = 'HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill1'

# Area and height of the triangle spanned by the three stage positions
def calculate_triangle_features(data):
    """Add the Fill1 triangle area and height columns.

    The three stage-to-stage distances are the triangle sides. The area
    comes from Heron's formula; the height is taken over the
    stage1-stage3 side.

    Args:
        data: DataFrame holding the three distance columns; it is modified
            in place.

    Returns:
        The same ``data`` object with the area and height columns added.
    """
    a = data[stage1_stage2_col]
    b = data[stage2_stage3_col]
    c = data[stage1_stage3_col]

    # Heron's formula. For (near-)collinear points floating-point rounding
    # can push the radicand slightly below zero, which would turn a
    # zero-area triangle into NaN, so the radicand is clamped at 0.
    s = (a + b + c) / 2
    radicand = s * (s - a) * (s - b) * (s - c)
    area = np.sqrt(radicand.clip(lower=0))

    # Height over base c (a zero-length base still yields NaN/inf here)
    height = (2 * area) / c

    data['HEAD NORMAL DISTANCE_TRIANGLE_area_Fill1'] = area
    data['HEAD NORMAL DISTANCE_TRIANGLE_height_Fill1'] = height

    return data

# Add the Fill1 triangle features to both splits
train_data, test_data = [
    calculate_triangle_features(_frame) for _frame in (train_data, test_data)
]

In [162]:
# Fill1 head-coordinate columns and two of the three distances to remove
columns_to_drop = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill1',
    'HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill1',
]

# Remove them from both splits in place
for _frame in (train_data, test_data):
    _frame.drop(columns=columns_to_drop, inplace=True)

- fill2

In [163]:
# (X, Y, Z) head-coordinate column names for each Fill2 stage
stage1_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2',
]
stage2_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2',
]
stage3_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2',
]


# Pairwise distances between the stage coordinates
def calculate_distances(data):
    """Add the three pairwise Euclidean distances between Fill2 stages.

    Args:
        data: DataFrame holding the columns named in ``stage1_cols``,
            ``stage2_cols`` and ``stage3_cols``; it is modified in place.

    Returns:
        The same ``data`` object with three distance columns added.
    """
    def separation(from_cols, to_cols):
        dx, dy, dz = (data[to] - data[frm] for frm, to in zip(from_cols, to_cols))
        return np.sqrt(dx ** 2 + dy ** 2 + dz ** 2)

    data['HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2'] = separation(stage1_cols, stage2_cols)
    data['HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill2'] = separation(stage2_cols, stage3_cols)
    data['HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill2'] = separation(stage1_cols, stage3_cols)
    return data

# Add the Fill2 stage distances to both splits
train_data, test_data = [calculate_distances(_frame) for _frame in (train_data, test_data)]

In [164]:
# Fill2 head-coordinate columns to remove
columns_to_drop = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2',
]

# Remove them from both splits in place
for _frame in (train_data, test_data):
    _frame.drop(columns=columns_to_drop, inplace=True)

### 6. Resin 변수

- dam

In [165]:
# volume * time derived features - Dam
# (new column, dispense volume column, discharged time column) per stage
_dam_stage_products = (
    ('volume_time_multip_stage1_Dam',
     'Dispense Volume(Stage1) Collect Result_Dam',
     'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam'),
    ('volume_time_multip_stage2_Dam',
     'Dispense Volume(Stage2) Collect Result_Dam',
     'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'),
    ('volume_time_multip_stage3_Dam',
     'Dispense Volume(Stage3) Collect Result_Dam',
     'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam'),
)

for _frame in (train_data, test_data):
    for _product, _volume, _time in _dam_stage_products:
        _frame[_product] = _frame[_volume] * _frame[_time]

    # Mean of the three per-stage products
    _frame['volume_time_multip_avg_Dam'] = (
        _frame['volume_time_multip_stage1_Dam']
        + _frame['volume_time_multip_stage2_Dam']
        + _frame['volume_time_multip_stage3_Dam']
    ) / 3

In [166]:
# Dam resin source columns and per-stage products to remove
columns_to_drop = [
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam',
    'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam',
    'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
    'Dispense Volume(Stage1) Collect Result_Dam',
    'Dispense Volume(Stage2) Collect Result_Dam',
    'Dispense Volume(Stage3) Collect Result_Dam',
    'volume_time_multip_stage1_Dam',
    'volume_time_multip_stage2_Dam',
    'volume_time_multip_stage3_Dam',
]

# Columns that are already absent are skipped (errors='ignore')
for _frame in (train_data, test_data):
    _frame.drop(columns=columns_to_drop, inplace=True, errors='ignore')

- fill1

In [167]:
# volume * time derived features - Fill1
# (new column, dispense volume column, discharged time column) per stage
_fill1_stage_products = (
    ('volume_time_multip_stage1_Fill1',
     'Dispense Volume(Stage1) Collect Result_Fill1',
     'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1'),
    ('volume_time_multip_stage2_Fill1',
     'Dispense Volume(Stage2) Collect Result_Fill1',
     'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1'),
    ('volume_time_multip_stage3_Fill1',
     'Dispense Volume(Stage3) Collect Result_Fill1',
     'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1'),
)

for _frame in (train_data, test_data):
    for _product, _volume, _time in _fill1_stage_products:
        _frame[_product] = _frame[_volume] * _frame[_time]

    # Mean of the three per-stage products
    _frame['volume_time_multip_avg_Fill1'] = (
        _frame['volume_time_multip_stage1_Fill1']
        + _frame['volume_time_multip_stage2_Fill1']
        + _frame['volume_time_multip_stage3_Fill1']
    ) / 3

In [168]:
# Fill1 resin source columns and per-stage products to remove
columns_to_drop = [
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
    'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1',
    'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1',
    'Dispense Volume(Stage1) Collect Result_Fill1',
    'Dispense Volume(Stage2) Collect Result_Fill1',
    'Dispense Volume(Stage3) Collect Result_Fill1',
    'volume_time_multip_stage1_Fill1',
    'volume_time_multip_stage2_Fill1',
    'volume_time_multip_stage3_Fill1',
]

# Columns that are already absent are skipped (errors='ignore')
for _frame in (train_data, test_data):
    _frame.drop(columns=columns_to_drop, inplace=True, errors='ignore')

### 7. Circle, Line 변수

In [169]:
### circle
# Give the Circle1 speed column of each stage a short name
_circle_rename = {
    'Stage1 Circle1 Distance Speed Collect Result_Dam': 'Stage1_Circle_Distance_Speed_Dam',
    'Stage2 Circle1 Distance Speed Collect Result_Dam': 'Stage2_Circle_Distance_Speed_Dam',
    'Stage3 Circle1 Distance Speed Collect Result_Dam': 'Stage3_Circle_Distance_Speed_Dam',
}

for _frame in (train_data, test_data):
    _frame.rename(columns=_circle_rename, inplace=True)

In [170]:
# Circle2-4 speed columns of every stage to remove
columns_to_drop = [
    # Stage1
    'Stage1 Circle2 Distance Speed Collect Result_Dam',
    'Stage1 Circle3 Distance Speed Collect Result_Dam',
    'Stage1 Circle4 Distance Speed Collect Result_Dam',
    # Stage2
    'Stage2 Circle2 Distance Speed Collect Result_Dam',
    'Stage2 Circle3 Distance Speed Collect Result_Dam',
    'Stage2 Circle4 Distance Speed Collect Result_Dam',
    # Stage3
    'Stage3 Circle2 Distance Speed Collect Result_Dam',
    'Stage3 Circle3 Distance Speed Collect Result_Dam',
    'Stage3 Circle4 Distance Speed Collect Result_Dam',
]

# Remove them from both splits in place
for _frame in (train_data, test_data):
    _frame.drop(columns=columns_to_drop, inplace=True)

In [171]:
### line
# Derived features merging line1 & line3 and line2 & line4
def check_distance_speed(data, stage):
    """Add the merged line-speed columns for one stage, in place.

    For the line pairs (1, 3) and (2, 4) a new column holds the shared speed
    when both lines have the same value in a row, and the string ``'diff'``
    otherwise.

    Args:
        data: DataFrame holding the ``Stage{stage} Line{n} Distance Speed
            Collect Result_Dam`` columns for n = 1..4.
        stage: Stage number used to build the column names.
    """
    for line1, line2 in ((1, 3), (2, 4)):
        line1_name = f'Stage{stage} Line{line1} Distance Speed Collect Result_Dam'
        line2_name = f'Stage{stage} Line{line2} Distance Speed Collect Result_Dam'
        new_col_name = f'stage{stage}_line{line1}{line2}_distance_speed_Dam'

        # Column names are bound as defaults so each pair uses its own names
        def same_or_diff(row, left=line1_name, right=line2_name):
            if row[left] == row[right]:
                return row[left]
            return 'diff'

        data[new_col_name] = data.apply(same_or_diff, axis=1)

# Build the merged line-speed features for stages 1-3 on both splits
for stage in (1, 2, 3):
    for _frame in (train_data, test_data):
        check_distance_speed(_frame, stage)

In [172]:
# Cast the line2&4 merged columns to object dtype on both splits
_line24_columns = (
    'stage1_line24_distance_speed_Dam',
    'stage2_line24_distance_speed_Dam',
    'stage3_line24_distance_speed_Dam',
)

for _frame in (train_data, test_data):
    for _name in _line24_columns:
        _frame[_name] = _frame[_name].astype(object)

In [173]:
# Raw Line1-4 speed columns of every stage to remove
columns_to_drop = [
    # Stage1
    'Stage1 Line1 Distance Speed Collect Result_Dam',
    'Stage1 Line2 Distance Speed Collect Result_Dam',
    'Stage1 Line3 Distance Speed Collect Result_Dam',
    'Stage1 Line4 Distance Speed Collect Result_Dam',
    # Stage2
    'Stage2 Line1 Distance Speed Collect Result_Dam',
    'Stage2 Line2 Distance Speed Collect Result_Dam',
    'Stage2 Line3 Distance Speed Collect Result_Dam',
    'Stage2 Line4 Distance Speed Collect Result_Dam',
    # Stage3
    'Stage3 Line1 Distance Speed Collect Result_Dam',
    'Stage3 Line2 Distance Speed Collect Result_Dam',
    'Stage3 Line3 Distance Speed Collect Result_Dam',
    'Stage3 Line4 Distance Speed Collect Result_Dam',
]

# Remove them from both splits in place
for _frame in (train_data, test_data):
    _frame.drop(columns=columns_to_drop, inplace=True)

### 8. Thickness 변수

In [174]:
# Row-wise mean of the three Dam thickness columns
_thickness_columns = [
    'THICKNESS 1 Collect Result_Dam',
    'THICKNESS 2 Collect Result_Dam',
    'THICKNESS 3 Collect Result_Dam',
]

for _frame in (train_data, test_data):
    _frame['average_thickness_Dam'] = _frame[_thickness_columns].mean(axis=1)

In [175]:
# Raw thickness columns to remove
columns_to_drop = [
    'THICKNESS 1 Collect Result_Dam',
    'THICKNESS 2 Collect Result_Dam',
    'THICKNESS 3 Collect Result_Dam',
]

# Remove them from both splits in place
for _frame in (train_data, test_data):
    _frame.drop(columns=columns_to_drop, inplace=True)

### 9. Autoclave 관련 변수

In [176]:
# Pressure * unit-time products for the train split
# (new column, pressure column, unit time column)
_pressure_terms = (
    ('1st_pressure_time_AutoClave',
     '1st Pressure Collect Result_AutoClave',
     '1st Pressure 1st Pressure Unit Time_AutoClave'),
    ('2nd_pressure_time_AutoClave',
     '2nd Pressure Collect Result_AutoClave',
     '2nd Pressure Unit Time_AutoClave'),
    ('3rd_pressure_time_AutoClave',
     '3rd Pressure Collect Result_AutoClave',
     '3rd Pressure Unit Time_AutoClave'),
)

for _product, _pressure, _unit_time in _pressure_terms:
    train_data[_product] = train_data[_pressure] * train_data[_unit_time]

# Mean of the three products
train_data['avg_pressure_time_AutoClave'] = (
    train_data['1st_pressure_time_AutoClave']
    + train_data['2nd_pressure_time_AutoClave']
    + train_data['3rd_pressure_time_AutoClave']
) / 3

In [177]:
# Pressure * unit-time products for the test split
# (new column, pressure column, unit time column)
_pressure_terms = (
    ('1st_pressure_time_AutoClave',
     '1st Pressure Collect Result_AutoClave',
     '1st Pressure 1st Pressure Unit Time_AutoClave'),
    ('2nd_pressure_time_AutoClave',
     '2nd Pressure Collect Result_AutoClave',
     '2nd Pressure Unit Time_AutoClave'),
    ('3rd_pressure_time_AutoClave',
     '3rd Pressure Collect Result_AutoClave',
     '3rd Pressure Unit Time_AutoClave'),
)

for _product, _pressure, _unit_time in _pressure_terms:
    test_data[_product] = test_data[_pressure] * test_data[_unit_time]

# Mean of the three products
test_data['avg_pressure_time_AutoClave'] = (
    test_data['1st_pressure_time_AutoClave']
    + test_data['2nd_pressure_time_AutoClave']
    + test_data['3rd_pressure_time_AutoClave']
) / 3

In [178]:
# Raw autoclave pressure and unit-time columns to remove
columns_to_drop = [
    '1st Pressure Collect Result_AutoClave',
    '1st Pressure 1st Pressure Unit Time_AutoClave',
    '2nd Pressure Collect Result_AutoClave',
    '2nd Pressure Unit Time_AutoClave',
    '3rd Pressure Collect Result_AutoClave',
    '3rd Pressure Unit Time_AutoClave',
]

# Remove them from both splits in place
for _frame in (train_data, test_data):
    _frame.drop(columns=columns_to_drop, inplace=True)

### 10. Time 변수

In [179]:
# Share of the total time taken by each process
def calculate_total_time_and_ratios(data):
    """Add ``total_time`` and the per-process time ratios.

    ``total_time`` is the sum of the Dam, Fill1 and Fill2 machine tact times
    and the autoclave chamber unit time. Each ratio is that process's time
    divided by ``total_time``, rounded to 3 decimals.

    Args:
        data: DataFrame holding the four time columns; it is modified in place.

    Returns:
        The same ``data`` object with the new columns added.
    """
    ratio_sources = {
        'time_ratio_Dam': 'Machine Tact time Collect Result_Dam',
        'time_ratio_Fill1': 'Machine Tact time Collect Result_Fill1',
        'time_ratio_Fill2': 'Machine Tact time Collect Result_Fill2',
        'time_ratio_AutoClave': 'Chamber Temp. Unit Time_AutoClave',
    }
    dam_time, fill1_time, fill2_time, clave_time = (
        data[source] for source in ratio_sources.values()
    )
    data['total_time'] = dam_time + fill1_time + fill2_time + clave_time

    for ratio_name, source in ratio_sources.items():
        data[ratio_name] = (data[source] / data['total_time']).round(3)
    return data

# Apply the function to train_data and test_data
train_data = calculate_total_time_and_ratios(train_data)
test_data = calculate_total_time_and_ratios(test_data)

In [180]:
# Drop the total time and the raw per-process time columns from the train set.
train_data.drop(
    [
        'total_time',
        'Machine Tact time Collect Result_Dam',
        'Machine Tact time Collect Result_Fill1',
        'Machine Tact time Collect Result_Fill2',
        'Chamber Temp. Unit Time_AutoClave',
    ],
    axis=1,
    inplace=True,
)

# Drop the same columns from the test set.
test_data.drop(
    [
        'total_time',
        'Machine Tact time Collect Result_Dam',
        'Machine Tact time Collect Result_Fill1',
        'Machine Tact time Collect Result_Fill2',
        'Chamber Temp. Unit Time_AutoClave',
    ],
    axis=1,
    inplace=True,
)

### 11. 변수 확인

In [181]:
# Columns to remove from both frames.
columns_to_drop = [
    'Chamber Temp. Judge Value_AutoClave',
    'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
    'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave',
]

train_data = train_data.drop(columns_to_drop, axis=1)
test_data = test_data.drop(columns_to_drop, axis=1)

In [182]:
# Remove columns that hold exactly one distinct value and contain no missing values.
def drop_single_value_columns(df):
    """Drop constant, fully populated columns from ``df``.

    A column is dropped when it has exactly one distinct value and no nulls.
    The ``'target'`` column is never dropped.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A tuple ``(df_without_columns, dropped_column_names)``.
    """
    constant_cols = []
    for name in df.columns:
        if name == 'target':
            continue
        series = df[name]
        if series.nunique() == 1 and series.isnull().sum() == 0:
            constant_cols.append(name)
    return df.drop(columns=constant_cols), constant_cols

# Decide which constant columns to remove using train_data only, then remove
# those same columns from test_data. Evaluating test_data on its own could
# select a different set of columns and leave the two frames with different
# feature sets.
train_data, train_cols_dropped = drop_single_value_columns(train_data)
test_cols_dropped = [col for col in train_cols_dropped if col in test_data.columns]
test_data = test_data.drop(columns=test_cols_dropped)

# print("삭제된 train_data 열 이름:", train_cols_dropped)
print("삭제된 train_data 열 개수:", len(train_cols_dropped))

# print("삭제된 test_data 열 이름:", test_cols_dropped)
print("삭제된 test_data 열 개수:", len(test_cols_dropped))

삭제된 train_data 열 개수: 37
삭제된 test_data 열 개수: 37


### 12. target encoding

In [183]:
# Collect the object-dtype columns of each frame and print them with their count.
train_object_columns = train_data.select_dtypes(include=['object']).columns
test_object_columns = test_data.select_dtypes(include=['object']).columns

print(train_object_columns, f" train_object_columns 갯수 : {len(train_object_columns)}")
print(test_object_columns, f" test_object_columns 갯수 : {len(test_object_columns)}")

# Print the number of distinct values of every object column, train first, then test.
for header, frame, object_columns in (
    ("\nTrain Data:", train_data, train_object_columns),
    ("\nTest Data:", test_data, test_object_columns),
):
    print(header)
    for col in object_columns:
        unique_count = frame[col].nunique()
        print(f"{col} unique 값 갯수: {unique_count}")

Index(['target', 'model_receip', 'cleaned_workorder',
       'PalletID_Collect_Result', 'cure_end_position_XZ_Fill2',
       'cure_start_position_XZ_Fill2', 'cure_standby_position_XZ_Fill2',
       'stage1_line13_distance_speed_Dam', 'stage1_line24_distance_speed_Dam',
       'stage2_line13_distance_speed_Dam', 'stage2_line24_distance_speed_Dam',
       'stage3_line13_distance_speed_Dam', 'stage3_line24_distance_speed_Dam'],
      dtype='object')  train_object_columns 갯수 : 13
Index(['Set ID', 'model_receip', 'cleaned_workorder',
       'PalletID_Collect_Result', 'cure_end_position_XZ_Fill2',
       'cure_start_position_XZ_Fill2', 'cure_standby_position_XZ_Fill2',
       'stage1_line13_distance_speed_Dam', 'stage1_line24_distance_speed_Dam',
       'stage2_line13_distance_speed_Dam', 'stage2_line24_distance_speed_Dam',
       'stage3_line13_distance_speed_Dam', 'stage3_line24_distance_speed_Dam'],
      dtype='object')  test_object_columns 갯수 : 13

Train Data:
target unique 값 갯수: 2
mode

In [184]:
# Binary label (1 for 'AbNormal', 0 for anything else) and its overall mean,
# i.e. the share of AbNormal rows in the training data.
train_data['target_01'] = train_data['target'].eq('AbNormal').astype('int64')
global_mean = train_data['target_01'].mean()

In [185]:
# Categorical columns to target-encode.
columns_to_encode = [
    'model_receip',
    'cleaned_workorder',
    'PalletID_Collect_Result',
    'cure_end_position_XZ_Fill2',
    'cure_start_position_XZ_Fill2',
    'cure_standby_position_XZ_Fill2',
    'stage1_line13_distance_speed_Dam',
    'stage1_line24_distance_speed_Dam',
    'stage2_line13_distance_speed_Dam',
    'stage2_line24_distance_speed_Dam',
    'stage3_line13_distance_speed_Dam',
    'stage3_line24_distance_speed_Dam'
]

# Mean of the binary target over all training rows. It is the smoothing prior
# and the fallback value for categories that have no statistics in train_data.
global_mean = train_data['target_01'].mean()

# Smoothing strength. In the formula below alpha is added to the category count,
# so it acts like `alpha` extra observations at the global mean; larger values
# pull small categories harder towards global_mean.
# The original note recommended 0.5 as the commonly used default, 0.3 when data
# is plentiful and category targets are well distributed, and 0.7 when data is
# scarce or some categories are rare.
# NOTE(review): that note reads as if alpha were a 0-1 blend weight; with this
# formula 0.5 is a very weak prior - confirm the intended strength.
alpha = 0.5

# NOTE(review): the values mapped onto train_data are computed from those same
# rows' targets, which can leak label information into the feature; consider
# out-of-fold encoding if validation scores look optimistic.
for column in columns_to_encode:
    # Per-category target mean and row count, from a single groupby pass.
    category_stats = train_data.groupby(column)['target_01'].agg(['mean', 'count'])
    target_mean = category_stats['mean']
    count = category_stats['count']

    # Smoothed per-category target mean.
    smoothed_values = (target_mean * count + global_mean * alpha) / (count + alpha)
    encoding_dict = smoothed_values.to_dict()

    # Apply the train-derived mapping to both frames. A value without an entry
    # in the mapping (unseen in train_data, or missing) gets global_mean, which
    # is what the smoothing formula yields for a count of zero.
    train_data[f'{column}_encoded'] = train_data[column].map(encoding_dict).fillna(global_mean)
    test_data[f'{column}_encoded'] = test_data[column].map(encoding_dict).fillna(global_mean)

In [186]:
# Columns to remove: the helper binary target and the original categorical columns.
columns_to_drop = [
    'target_01',
    'model_receip',
    'cleaned_workorder',
    'PalletID_Collect_Result',
    'cure_end_position_XZ_Fill2',
    'cure_start_position_XZ_Fill2',
    'cure_standby_position_XZ_Fill2',
    'stage1_line13_distance_speed_Dam',
    'stage1_line24_distance_speed_Dam',
    'stage2_line13_distance_speed_Dam',
    'stage2_line24_distance_speed_Dam',
    'stage3_line13_distance_speed_Dam',
    'stage3_line24_distance_speed_Dam',
]

# Drop from both frames; names that are absent from a frame are skipped.
train_data = train_data.drop(columns_to_drop, axis=1, errors='ignore')
test_data = test_data.drop(columns_to_drop, axis=1, errors='ignore')

In [187]:
# Print info for every column without truncation
train_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 50 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   CURE SPEED Collect Result_Dam                   40506 non-null  int64  
 1   DISCHARGED SPEED OF RESIN Collect Result_Dam    40506 non-null  int64  
 2   Head Clean Position Z Collect Result_Dam        40506 non-null  float64
 3   Head Purge Position Z Collect Result_Dam        40506 non-null  float64
 4   Head Zero Position Y Collect Result_Dam         40506 non-null  float64
 5   Stage1_Circle_Distance_Speed_Dam                40506 non-null  int64  
 6   Stage2_Circle_Distance_Speed_Dam                40506 non-null  int64  
 7   Stage3_Circle_Distance_Speed_Dam                40506 non-null  int64  
 8   WorkMode Collect Result                         40506 non-null  float64
 9   Chamber Temp. Collect Result_AutoClave 

In [188]:
# Print info for every column without truncation
test_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 51 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Set ID                                          17361 non-null  object 
 1   CURE SPEED Collect Result_Dam                   17361 non-null  int64  
 2   DISCHARGED SPEED OF RESIN Collect Result_Dam    17361 non-null  int64  
 3   Head Clean Position Z Collect Result_Dam        17361 non-null  float64
 4   Head Purge Position Z Collect Result_Dam        17361 non-null  float64
 5   Head Zero Position Y Collect Result_Dam         17361 non-null  float64
 6   Stage1_Circle_Distance_Speed_Dam                17361 non-null  int64  
 7   Stage2_Circle_Distance_Speed_Dam                17361 non-null  int64  
 8   Stage3_Circle_Distance_Speed_Dam                17361 non-null  int64  
 9   WorkMode Collect Result                

### 13. correlation

In [193]:
# Variables common to the dam, fill1 and fill2 subsets
var_dam_fill = [
    'Equipment_same_num',
    'PalletID_Collect_Result_encoded',
    'Production_Qty_Collect_Result',
    'WorkMode Collect Result'
]

In [194]:
# Variables common to every process.
# List used when checking correlations.
var_all_corr = [
    'model_receip_encoded',
    'cleaned_workorder_encoded',
]

# Train selection: the label followed by the common variables.
var_all_train = ['target', *var_all_corr]

# Test selection: the 'Set ID' and 'target' columns followed by the common variables.
var_all_test = ['Set ID', 'target', *var_all_corr]

- dam

In [195]:
# Variables to check for correlation: the common variables plus every column whose name contains '_Dam'.
combined_variables = [
    *var_dam_fill,
    *var_all_corr,
    *(name for name in train_data.columns if '_Dam' in name),
]
combined_variables

['Equipment_same_num',
 'PalletID_Collect_Result_encoded',
 'Production_Qty_Collect_Result',
 'WorkMode Collect Result',
 'model_receip_encoded',
 'cleaned_workorder_encoded',
 'CURE SPEED Collect Result_Dam',
 'DISCHARGED SPEED OF RESIN Collect Result_Dam',
 'Head Clean Position Z Collect Result_Dam',
 'Head Purge Position Z Collect Result_Dam',
 'Head Zero Position Y Collect Result_Dam',
 'Stage1_Circle_Distance_Speed_Dam',
 'Stage2_Circle_Distance_Speed_Dam',
 'Stage3_Circle_Distance_Speed_Dam',
 'CURE_DISTANCE_Dam',
 'HEAD NORMAL DISTANCE_STAGE1_STAGE3_Dam',
 'HEAD NORMAL DISTANCE_TRIANGLE_area_Dam',
 'HEAD NORMAL DISTANCE_TRIANGLE_height_Dam',
 'volume_time_multip_avg_Dam',
 'average_thickness_Dam',
 'time_ratio_Dam',
 'stage1_line13_distance_speed_Dam_encoded',
 'stage1_line24_distance_speed_Dam_encoded',
 'stage2_line13_distance_speed_Dam_encoded',
 'stage2_line24_distance_speed_Dam_encoded',
 'stage3_line13_distance_speed_Dam_encoded',
 'stage3_line24_distance_speed_Dam_encoded

In [196]:
# Explicit list of the variables used for the Dam correlation check.
variables = [
    'Equipment_same_num',
    'PalletID_Collect_Result_encoded',
    'Production_Qty_Collect_Result',
    'WorkMode Collect Result',
    'model_receip_encoded',
    'cleaned_workorder_encoded',
    'CURE SPEED Collect Result_Dam',
    'DISCHARGED SPEED OF RESIN Collect Result_Dam',
    'Head Clean Position Z Collect Result_Dam',
    'Head Purge Position Z Collect Result_Dam',
    'Head Zero Position Y Collect Result_Dam',
    'Stage1_Circle_Distance_Speed_Dam',
    'Stage2_Circle_Distance_Speed_Dam',
    'Stage3_Circle_Distance_Speed_Dam',
    'CURE_DISTANCE_Dam',
    'HEAD NORMAL DISTANCE_TRIANGLE_area_Dam',
    'HEAD NORMAL DISTANCE_TRIANGLE_height_Dam',
    'volume_time_multip_avg_Dam',
    'average_thickness_Dam',
    'time_ratio_Dam',
    'stage1_line13_distance_speed_Dam_encoded',
    'stage1_line24_distance_speed_Dam_encoded',
    'stage2_line13_distance_speed_Dam_encoded',
    'stage2_line24_distance_speed_Dam_encoded',
    'stage3_line13_distance_speed_Dam_encoded',
    'stage3_line24_distance_speed_Dam_encoded',
    'HEAD NORMAL DISTANCE_STAGE1_STAGE3_Dam',
]

# DataFrame restricted to those variables.
filtered_data = train_data[variables]

In [197]:
# Find variable pairs whose absolute correlation is at least 0.9, excluding each
# variable's correlation with itself.
correlation_matrix = filtered_data.corr()

# Exclude the diagonal by position rather than by value: comparing against 1
# would also discard two distinct variables that are perfectly correlated, and
# would let a diagonal entry through if it is not exactly 1.0.
off_diagonal = ~np.eye(len(correlation_matrix), dtype=bool)
strong_correlations = correlation_matrix.where((correlation_matrix.abs() >= 0.9) & off_diagonal)

# Convert to a long list of pairs. The matrix is symmetric, so each pair is
# listed in both directions. The explicit dropna() removes the masked-out cells
# regardless of how stack() treats NaN.
strong_correlations_pairs = strong_correlations.stack().dropna().reset_index()
strong_correlations_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# Print the result
print(strong_correlations_pairs)

                                  Variable 1  \
0           Stage1_Circle_Distance_Speed_Dam   
1           Stage1_Circle_Distance_Speed_Dam   
2           Stage2_Circle_Distance_Speed_Dam   
3           Stage3_Circle_Distance_Speed_Dam   
4           Stage3_Circle_Distance_Speed_Dam   
5     HEAD NORMAL DISTANCE_TRIANGLE_area_Dam   
6   HEAD NORMAL DISTANCE_TRIANGLE_height_Dam   
7                 volume_time_multip_avg_Dam   
8                 volume_time_multip_avg_Dam   
9   stage1_line13_distance_speed_Dam_encoded   
10  stage1_line13_distance_speed_Dam_encoded   
11  stage1_line13_distance_speed_Dam_encoded   
12  stage1_line24_distance_speed_Dam_encoded   
13  stage1_line24_distance_speed_Dam_encoded   
14  stage1_line24_distance_speed_Dam_encoded   
15  stage2_line13_distance_speed_Dam_encoded   
16  stage3_line13_distance_speed_Dam_encoded   
17  stage3_line13_distance_speed_Dam_encoded   
18  stage3_line13_distance_speed_Dam_encoded   
19  stage3_line24_distance_speed_Dam_enc

In [198]:
# Columns to drop.
columns_to_drop = [
    'Stage1_Circle_Distance_Speed_Dam',
    'Stage3_Circle_Distance_Speed_Dam',
    'HEAD NORMAL DISTANCE_TRIANGLE_area_Dam',
    'stage1_line24_distance_speed_Dam_encoded',
    'stage2_line13_distance_speed_Dam_encoded',
    'stage3_line13_distance_speed_Dam_encoded',
    'stage3_line24_distance_speed_Dam_encoded',
]

# Remove them in place from both frames; names that are absent are skipped.
train_data.drop(columns_to_drop, axis=1, errors='ignore', inplace=True)
test_data.drop(columns_to_drop, axis=1, errors='ignore', inplace=True)

In [200]:
# Columns whose name contains '_Dam'.
dam_variables = [name for name in train_data.columns if '_Dam' in name]

# Train subset: dam/fill common variables, train common variables, then the '_Dam' columns.
final_columns_train = [*var_dam_fill, *var_all_train, *dam_variables]
train_data_dam = train_data[final_columns_train]

# Test subset: same layout, using the test common variables.
final_columns_test = [*var_dam_fill, *var_all_test, *dam_variables]
test_data_dam = test_data[final_columns_test]

In [201]:
train_data_dam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 21 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Equipment_same_num                            40506 non-null  int64  
 1   PalletID_Collect_Result_encoded               40506 non-null  float64
 2   Production_Qty_Collect_Result                 40506 non-null  int64  
 3   WorkMode Collect Result                       40506 non-null  float64
 4   target                                        40506 non-null  object 
 5   model_receip_encoded                          40506 non-null  float64
 6   cleaned_workorder_encoded                     40506 non-null  float64
 7   CURE SPEED Collect Result_Dam                 40506 non-null  int64  
 8   DISCHARGED SPEED OF RESIN Collect Result_Dam  40506 non-null  int64  
 9   Head Clean Position Z Collect Result_Dam      40506 non-null 

In [202]:
test_data_dam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 22 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Equipment_same_num                            17361 non-null  int64  
 1   PalletID_Collect_Result_encoded               17361 non-null  float64
 2   Production_Qty_Collect_Result                 17361 non-null  int64  
 3   WorkMode Collect Result                       17361 non-null  float64
 4   Set ID                                        17361 non-null  object 
 5   target                                        0 non-null      float64
 6   model_receip_encoded                          17361 non-null  float64
 7   cleaned_workorder_encoded                     17361 non-null  float64
 8   CURE SPEED Collect Result_Dam                 17361 non-null  int64  
 9   DISCHARGED SPEED OF RESIN Collect Result_Dam  17361 non-null 

- fill1

In [203]:
# Variables to check for correlation: the common variables plus every column whose name contains '_Fill1'.
combined_variables = [
    *var_dam_fill,
    *var_all_corr,
    *(name for name in train_data.columns if '_Fill1' in name),
]
combined_variables

['Equipment_same_num',
 'PalletID_Collect_Result_encoded',
 'Production_Qty_Collect_Result',
 'WorkMode Collect Result',
 'model_receip_encoded',
 'cleaned_workorder_encoded',
 'DISCHARGED SPEED OF RESIN Collect Result_Fill1',
 'Head Purge Position Z Collect Result_Fill1',
 'HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill1',
 'HEAD NORMAL DISTANCE_TRIANGLE_area_Fill1',
 'HEAD NORMAL DISTANCE_TRIANGLE_height_Fill1',
 'volume_time_multip_avg_Fill1',
 'time_ratio_Fill1']

In [205]:
# Explicit list of the variables used for the Fill1 correlation check.
variables = [
    'Equipment_same_num',
    'PalletID_Collect_Result_encoded',
    'Production_Qty_Collect_Result',
    'WorkMode Collect Result',
    'model_receip_encoded',
    'cleaned_workorder_encoded',
    'DISCHARGED SPEED OF RESIN Collect Result_Fill1',
    'Head Purge Position Z Collect Result_Fill1',
    'HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill1',
    'HEAD NORMAL DISTANCE_TRIANGLE_area_Fill1',
    'HEAD NORMAL DISTANCE_TRIANGLE_height_Fill1',
    'volume_time_multip_avg_Fill1',
    'time_ratio_Fill1',
]

# DataFrame restricted to those variables.
filtered_data = train_data[variables]

In [206]:
# Find variable pairs whose absolute correlation is at least 0.9, excluding each
# variable's correlation with itself.
correlation_matrix = filtered_data.corr()

# Exclude the diagonal by position rather than by value: comparing against 1
# would also discard two distinct variables that are perfectly correlated, and
# would let a diagonal entry through if it is not exactly 1.0.
off_diagonal = ~np.eye(len(correlation_matrix), dtype=bool)
strong_correlations = correlation_matrix.where((correlation_matrix.abs() >= 0.9) & off_diagonal)

# Convert to a long list of pairs. The matrix is symmetric, so each pair is
# listed in both directions. The explicit dropna() removes the masked-out cells
# regardless of how stack() treats NaN.
strong_correlations_pairs = strong_correlations.stack().dropna().reset_index()
strong_correlations_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# Print the result
print(strong_correlations_pairs)

                                   Variable 1  \
0    HEAD NORMAL DISTANCE_TRIANGLE_area_Fill1   
1  HEAD NORMAL DISTANCE_TRIANGLE_height_Fill1   

                                   Variable 2  Correlation  
0  HEAD NORMAL DISTANCE_TRIANGLE_height_Fill1     0.923415  
1    HEAD NORMAL DISTANCE_TRIANGLE_area_Fill1     0.923415  


In [207]:
# Columns to drop.
columns_to_drop = [
    'HEAD NORMAL DISTANCE_TRIANGLE_area_Fill1',
]

# Remove them in place from both frames; names that are absent are skipped.
train_data.drop(columns_to_drop, axis=1, errors='ignore', inplace=True)
test_data.drop(columns_to_drop, axis=1, errors='ignore', inplace=True)

In [208]:
# Columns whose name contains '_Fill1'.
fill1_variables = [name for name in train_data.columns if '_Fill1' in name]

# Train subset: dam/fill common variables, train common variables, then the '_Fill1' columns.
final_columns_train = [*var_dam_fill, *var_all_train, *fill1_variables]
train_data_fill1 = train_data[final_columns_train]

# Test subset: same layout, using the test common variables.
final_columns_test = [*var_dam_fill, *var_all_test, *fill1_variables]
test_data_fill1 = test_data[final_columns_test]

In [209]:
train_data_fill1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 13 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Equipment_same_num                              40506 non-null  int64  
 1   PalletID_Collect_Result_encoded                 40506 non-null  float64
 2   Production_Qty_Collect_Result                   40506 non-null  int64  
 3   WorkMode Collect Result                         40506 non-null  float64
 4   target                                          40506 non-null  object 
 5   model_receip_encoded                            40506 non-null  float64
 6   cleaned_workorder_encoded                       40506 non-null  float64
 7   DISCHARGED SPEED OF RESIN Collect Result_Fill1  40506 non-null  float64
 8   Head Purge Position Z Collect Result_Fill1      40506 non-null  float64
 9   HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill

In [210]:
test_data_fill1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 14 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Equipment_same_num                              17361 non-null  int64  
 1   PalletID_Collect_Result_encoded                 17361 non-null  float64
 2   Production_Qty_Collect_Result                   17361 non-null  int64  
 3   WorkMode Collect Result                         17361 non-null  float64
 4   Set ID                                          17361 non-null  object 
 5   target                                          0 non-null      float64
 6   model_receip_encoded                            17361 non-null  float64
 7   cleaned_workorder_encoded                       17361 non-null  float64
 8   DISCHARGED SPEED OF RESIN Collect Result_Fill1  17361 non-null  float64
 9   Head Purge Position Z Collect Result_Fi

- fill2

In [211]:
# Variables to check for correlation: the common variables plus every column whose name contains '_Fill2'.
combined_variables = [
    *var_dam_fill,
    *var_all_corr,
    *(name for name in train_data.columns if '_Fill2' in name),
]
combined_variables

['Equipment_same_num',
 'PalletID_Collect_Result_encoded',
 'Production_Qty_Collect_Result',
 'WorkMode Collect Result',
 'model_receip_encoded',
 'cleaned_workorder_encoded',
 'CURE SPEED Collect Result_Fill2',
 'Head Purge Position Z Collect Result_Fill2',
 'HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2',
 'HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill2',
 'HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill2',
 'time_ratio_Fill2',
 'cure_end_position_XZ_Fill2_encoded',
 'cure_start_position_XZ_Fill2_encoded',
 'cure_standby_position_XZ_Fill2_encoded']

In [212]:
# Explicit list of the variables used for the Fill2 correlation check.
variables = [
    'Equipment_same_num',
    'PalletID_Collect_Result_encoded',
    'Production_Qty_Collect_Result',
    'WorkMode Collect Result',
    'model_receip_encoded',
    'cleaned_workorder_encoded',
    'CURE SPEED Collect Result_Fill2',
    'Head Purge Position Z Collect Result_Fill2',
    'HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2',
    'HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill2',
    'HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill2',
    'time_ratio_Fill2',
    'cure_end_position_XZ_Fill2_encoded',
    'cure_start_position_XZ_Fill2_encoded',
    'cure_standby_position_XZ_Fill2_encoded',
]

# DataFrame restricted to those variables.
filtered_data = train_data[variables]

In [213]:
# Find variable pairs whose absolute correlation is at least 0.9, excluding each
# variable's correlation with itself.
correlation_matrix = filtered_data.corr()

# Exclude the diagonal by position rather than by value: comparing against 1
# would also discard two distinct variables that are perfectly correlated, and
# would let a diagonal entry through if it is not exactly 1.0.
off_diagonal = ~np.eye(len(correlation_matrix), dtype=bool)
strong_correlations = correlation_matrix.where((correlation_matrix.abs() >= 0.9) & off_diagonal)

# Convert to a long list of pairs. The matrix is symmetric, so each pair is
# listed in both directions. The explicit dropna() removes the masked-out cells
# regardless of how stack() treats NaN.
strong_correlations_pairs = strong_correlations.stack().dropna().reset_index()
strong_correlations_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# Print the result
print(strong_correlations_pairs)

                                 Variable 1  \
0  HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2   
1  HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2   
2  HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill2   
3  HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill2   
4  HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill2   
5  HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill2   
6      cure_start_position_XZ_Fill2_encoded   
7    cure_standby_position_XZ_Fill2_encoded   

                                 Variable 2  Correlation  
0  HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill2     0.999993  
1  HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill2     0.999999  
2  HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2     0.999993  
3  HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill2     0.999997  
4  HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2     0.999999  
5  HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill2     0.999997  
6    cure_standby_position_XZ_Fill2_encoded     0.999366  
7      cure_start_position_XZ_Fill2_encoded     0.999366  


In [214]:
# Columns to drop.
columns_to_drop = [
    'HEAD NORMAL DISTANCE_STAGE2_STAGE3_Fill2',
    'HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill2',
    'cure_standby_position_XZ_Fill2_encoded',
]

# Remove them in place from both frames; names that are absent are skipped.
train_data.drop(columns_to_drop, axis=1, errors='ignore', inplace=True)
test_data.drop(columns_to_drop, axis=1, errors='ignore', inplace=True)

In [215]:
# Columns whose name contains '_Fill2'.
fill2_variables = [name for name in train_data.columns if '_Fill2' in name]

# Train subset: dam/fill common variables, train common variables, then the '_Fill2' columns.
final_columns_train = [*var_dam_fill, *var_all_train, *fill2_variables]
train_data_fill2 = train_data[final_columns_train]

# Test subset: same layout, using the test common variables.
final_columns_test = [*var_dam_fill, *var_all_test, *fill2_variables]
test_data_fill2 = test_data[final_columns_test]

In [216]:
train_data_fill2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 13 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Equipment_same_num                          40506 non-null  int64  
 1   PalletID_Collect_Result_encoded             40506 non-null  float64
 2   Production_Qty_Collect_Result               40506 non-null  int64  
 3   WorkMode Collect Result                     40506 non-null  float64
 4   target                                      40506 non-null  object 
 5   model_receip_encoded                        40506 non-null  float64
 6   cleaned_workorder_encoded                   40506 non-null  float64
 7   CURE SPEED Collect Result_Fill2             40506 non-null  int64  
 8   Head Purge Position Z Collect Result_Fill2  40506 non-null  float64
 9   HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2    40506 non-null  float64
 10  time_ratio

In [217]:
test_data_fill2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 14 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Equipment_same_num                          17361 non-null  int64  
 1   PalletID_Collect_Result_encoded             17361 non-null  float64
 2   Production_Qty_Collect_Result               17361 non-null  int64  
 3   WorkMode Collect Result                     17361 non-null  float64
 4   Set ID                                      17361 non-null  object 
 5   target                                      0 non-null      float64
 6   model_receip_encoded                        17361 non-null  float64
 7   cleaned_workorder_encoded                   17361 non-null  float64
 8   CURE SPEED Collect Result_Fill2             17361 non-null  int64  
 9   Head Purge Position Z Collect Result_Fill2  17361 non-null  float64
 10  HEAD NORMA

- autoclave

In [218]:
# Variables to check for correlation: the common encoded variables plus every column whose name contains '_AutoClave'.
combined_variables = [
    *var_all_corr,
    *(name for name in train_data.columns if '_AutoClave' in name),
]
combined_variables

['model_receip_encoded',
 'cleaned_workorder_encoded',
 'Chamber Temp. Collect Result_AutoClave',
 '1st_pressure_time_AutoClave',
 '2nd_pressure_time_AutoClave',
 '3rd_pressure_time_AutoClave',
 'avg_pressure_time_AutoClave',
 'time_ratio_AutoClave']

In [219]:
# Explicit list of the variables used for the AutoClave correlation check.
variables = [
    'model_receip_encoded',
    'cleaned_workorder_encoded',
    'Chamber Temp. Collect Result_AutoClave',
    '1st_pressure_time_AutoClave',
    '2nd_pressure_time_AutoClave',
    '3rd_pressure_time_AutoClave',
    'avg_pressure_time_AutoClave',
    'time_ratio_AutoClave',
]

# DataFrame restricted to those variables.
filtered_data = train_data[variables]

In [220]:
# Find variable pairs whose absolute correlation is at least 0.9, excluding each
# variable's correlation with itself.
correlation_matrix = filtered_data.corr()

# Exclude the diagonal by position rather than by value: comparing against 1
# would also discard two distinct variables that are perfectly correlated, and
# would let a diagonal entry through if it is not exactly 1.0.
off_diagonal = ~np.eye(len(correlation_matrix), dtype=bool)
strong_correlations = correlation_matrix.where((correlation_matrix.abs() >= 0.9) & off_diagonal)

# Convert to a long list of pairs. The matrix is symmetric, so each pair is
# listed in both directions. The explicit dropna() removes the masked-out cells
# regardless of how stack() treats NaN.
strong_correlations_pairs = strong_correlations.stack().dropna().reset_index()
strong_correlations_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# Print the result
print(strong_correlations_pairs)

                    Variable 1                   Variable 2  Correlation
0  2nd_pressure_time_AutoClave  avg_pressure_time_AutoClave     0.907845
1  avg_pressure_time_AutoClave  2nd_pressure_time_AutoClave     0.907845


In [221]:
# Columns to drop.
columns_to_drop = [
    'avg_pressure_time_AutoClave',
]

# Remove them in place from both frames; names that are absent are skipped.
train_data.drop(columns_to_drop, axis=1, errors='ignore', inplace=True)
test_data.drop(columns_to_drop, axis=1, errors='ignore', inplace=True)

In [222]:
# Columns whose name contains '_AutoClave'.
autoclave_variables = [name for name in train_data.columns if '_AutoClave' in name]

# Train subset: train common variables followed by the '_AutoClave' columns.
final_columns_train = [*var_all_train, *autoclave_variables]
train_data_autoclave = train_data[final_columns_train]

# Test subset: test common variables followed by the '_AutoClave' columns.
final_columns_test = [*var_all_test, *autoclave_variables]
test_data_autoclave = test_data[final_columns_test]

In [223]:
train_data_autoclave.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 8 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   target                                  40506 non-null  object 
 1   model_receip_encoded                    40506 non-null  float64
 2   cleaned_workorder_encoded               40506 non-null  float64
 3   Chamber Temp. Collect Result_AutoClave  40506 non-null  int64  
 4   1st_pressure_time_AutoClave             40506 non-null  float64
 5   2nd_pressure_time_AutoClave             40506 non-null  float64
 6   3rd_pressure_time_AutoClave             40506 non-null  float64
 7   time_ratio_AutoClave                    40506 non-null  float64
dtypes: float64(6), int64(1), object(1)
memory usage: 2.5+ MB


In [224]:
test_data_autoclave.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 9 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Set ID                                  17361 non-null  object 
 1   target                                  0 non-null      float64
 2   model_receip_encoded                    17361 non-null  float64
 3   cleaned_workorder_encoded               17361 non-null  float64
 4   Chamber Temp. Collect Result_AutoClave  17361 non-null  int64  
 5   1st_pressure_time_AutoClave             17361 non-null  float64
 6   2nd_pressure_time_AutoClave             17361 non-null  float64
 7   3rd_pressure_time_AutoClave             17361 non-null  float64
 8   time_ratio_AutoClave                    17361 non-null  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 1.2+ MB


In [225]:
train_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   CURE SPEED Collect Result_Dam                   40506 non-null  int64  
 1   DISCHARGED SPEED OF RESIN Collect Result_Dam    40506 non-null  int64  
 2   Head Clean Position Z Collect Result_Dam        40506 non-null  float64
 3   Head Purge Position Z Collect Result_Dam        40506 non-null  float64
 4   Head Zero Position Y Collect Result_Dam         40506 non-null  float64
 5   Stage2_Circle_Distance_Speed_Dam                40506 non-null  int64  
 6   WorkMode Collect Result                         40506 non-null  float64
 7   Chamber Temp. Collect Result_AutoClave          40506 non-null  int64  
 8   DISCHARGED SPEED OF RESIN Collect Result_Fill1  40506 non-null  float64
 9   Head Purge Position Z Collect Result_Fi

In [226]:
test_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 39 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Set ID                                          17361 non-null  object 
 1   CURE SPEED Collect Result_Dam                   17361 non-null  int64  
 2   DISCHARGED SPEED OF RESIN Collect Result_Dam    17361 non-null  int64  
 3   Head Clean Position Z Collect Result_Dam        17361 non-null  float64
 4   Head Purge Position Z Collect Result_Dam        17361 non-null  float64
 5   Head Zero Position Y Collect Result_Dam         17361 non-null  float64
 6   Stage2_Circle_Distance_Speed_Dam                17361 non-null  int64  
 7   WorkMode Collect Result                         17361 non-null  float64
 8   Chamber Temp. Collect Result_AutoClave          17361 non-null  int64  
 9   DISCHARGED SPEED OF RESIN Collect Resul

In [227]:
test_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 39 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Set ID                                          17361 non-null  object 
 1   CURE SPEED Collect Result_Dam                   17361 non-null  int64  
 2   DISCHARGED SPEED OF RESIN Collect Result_Dam    17361 non-null  int64  
 3   Head Clean Position Z Collect Result_Dam        17361 non-null  float64
 4   Head Purge Position Z Collect Result_Dam        17361 non-null  float64
 5   Head Zero Position Y Collect Result_Dam         17361 non-null  float64
 6   Stage2_Circle_Distance_Speed_Dam                17361 non-null  int64  
 7   WorkMode Collect Result                         17361 non-null  float64
 8   Chamber Temp. Collect Result_AutoClave          17361 non-null  int64  
 9   DISCHARGED SPEED OF RESIN Collect Resul

In [228]:
# Save the preprocessed DataFrames as CSV files (without the index column)
# so the modeling section can reload them from disk.
train_data.to_csv('./data/train_data_0825.csv', index=False)
test_data.to_csv('./data/test_data_0825.csv', index=False)

## 3. 모델링

공정별 데이터 구분

In [229]:
# Cut-off applied to the positive-class probability when evaluating a model,
# and the seed shared by the data split and the model constructors.
THRESHOLD = 0.3
RANDOM_STATE = 110

# Reload the preprocessed CSV files written at the end of the preprocessing section
train_data = pd.read_csv('./data/train_data_0825.csv')
test_data = pd.read_csv('./data/test_data_0825.csv')

In [230]:
# Variables shared by the Dam, Fill1 and Fill2 process subsets
# (they are prepended to each of those three column selections).
var_dam_fill = [
    'Equipment_same_num',
    'PalletID_Collect_Result_encoded',
    'Production_Qty_Collect_Result',
    'WorkMode Collect Result'
]

In [231]:

# Variables common to every process subset.
# Base list, kept separately for correlation checks.
var_all_corr = [
    'model_receip_encoded',
    'cleaned_workorder_encoded',
]

# train: the label column followed by the common variables
var_all_train = ['target', *var_all_corr]

# test: the row identifier followed by the same columns as train
var_all_test = ['Set ID', *var_all_train]

In [232]:
# Select every column whose name contains '_Dam'.
# The list is built from the train columns and reused for the test frame.
dam_variables = [var for var in train_data.columns if '_Dam' in var]

# train: Dam/Fill shared variables + target and common variables + Dam columns
final_columns_train = var_dam_fill + var_all_train + dam_variables
train_data_dam = train_data[final_columns_train]

# test: same layout, with 'Set ID' added through var_all_test
final_columns_test = var_dam_fill + var_all_test + dam_variables
test_data_dam = test_data[final_columns_test]

In [233]:
# Select every column whose name contains '_Fill1'.
# The list is built from the train columns and reused for the test frame.
fill1_variables = [var for var in train_data.columns if '_Fill1' in var]

# train: Dam/Fill shared variables + target and common variables + Fill1 columns
final_columns_train = var_dam_fill + var_all_train + fill1_variables
train_data_fill1 = train_data[final_columns_train]

# test: same layout, with 'Set ID' added through var_all_test
final_columns_test = var_dam_fill + var_all_test + fill1_variables
test_data_fill1 = test_data[final_columns_test]

In [234]:
# Select every column whose name contains '_Fill2'.
# The list is built from the train columns and reused for the test frame.
fill2_variables = [var for var in train_data.columns if '_Fill2' in var]

# train: Dam/Fill shared variables + target and common variables + Fill2 columns
final_columns_train = var_dam_fill + var_all_train + fill2_variables
train_data_fill2 = train_data[final_columns_train]

# test: same layout, with 'Set ID' added through var_all_test
final_columns_test = var_dam_fill + var_all_test + fill2_variables
test_data_fill2 = test_data[final_columns_test]

In [235]:
# Select every column whose name contains '_AutoClave'.
# The list is built from the train columns and reused for the test frame.
autoclave_variables = [var for var in train_data.columns if '_AutoClave' in var]

# train: target and common variables + AutoClave columns
# (unlike the Dam/Fill subsets, var_dam_fill is not included here)
final_columns_train = var_all_train + autoclave_variables
train_data_autoclave = train_data[final_columns_train]

# test: same layout, with 'Set ID' added through var_all_test
final_columns_test = var_all_test + autoclave_variables
test_data_autoclave = test_data[final_columns_test]

In [236]:
# Count the columns of the full frame and of each process subset (train)
num_columns_train_data = train_data.shape[1]
num_columns_train_data_dam = train_data_dam.shape[1]
num_columns_train_data_autoclave = train_data_autoclave.shape[1]
num_columns_train_data_fill1 = train_data_fill1.shape[1]
num_columns_train_data_fill2 = train_data_fill2.shape[1]

# Same counts for the test frames
num_columns_test_data = test_data.shape[1]
num_columns_test_data_dam = test_data_dam.shape[1]
num_columns_test_data_autoclave = test_data_autoclave.shape[1]
num_columns_test_data_fill1 = test_data_fill1.shape[1]
num_columns_test_data_fill2 = test_data_fill2.shape[1]

# Print the column count of every DataFrame as a sanity check on the selections
print("----train data-----")
print(f"train_data DataFrame의 칼럼 수: {num_columns_train_data}")
print(f"train_data_dam DataFrame의 칼럼 수: {num_columns_train_data_dam}")
print(f"train_data_autoclave DataFrame의 칼럼 수: {num_columns_train_data_autoclave}")
print(f"train_data_fill1 DataFrame의 칼럼 수: {num_columns_train_data_fill1}")
print(f"train_data_fill2 DataFrame의 칼럼 수: {num_columns_train_data_fill2}")
print("----test data-----")
print(f"test_data DataFrame의 칼럼 수: {num_columns_test_data}")
print(f"test_data_dam DataFrame의 칼럼 수: {num_columns_test_data_dam}")
print(f"test_data_autoclave DataFrame의 칼럼 수: {num_columns_test_data_autoclave}")
print(f"test_data_fill1 DataFrame의 칼럼 수: {num_columns_test_data_fill1}")
print(f"test_data_fill2 DataFrame의 칼럼 수: {num_columns_test_data_fill2}")

----train data-----
train_data DataFrame의 칼럼 수: 38
train_data_dam DataFrame의 칼럼 수: 21
train_data_autoclave DataFrame의 칼럼 수: 8
train_data_fill1 DataFrame의 칼럼 수: 13
train_data_fill2 DataFrame의 칼럼 수: 13
----test data-----
test_data DataFrame의 칼럼 수: 39
test_data_dam DataFrame의 칼럼 수: 22
test_data_autoclave DataFrame의 칼럼 수: 9
test_data_fill1 DataFrame의 칼럼 수: 14
test_data_fill2 DataFrame의 칼럼 수: 14


모델 정의

In [237]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.ensemble import VotingClassifier

# Probability cut-off used by train_and_evaluate_model when turning the
# positive-class probability into a 0/1 prediction.
THRESHOLD = 0.3

# Registry of supported model keys.
# The training helpers only read each entry's class (via __class__) to build
# a fresh instance, so the objects stored here are never fitted themselves.
models = {
    'et': ExtraTreesClassifier(),
    'rf': RandomForestClassifier(),
    'cat': CatBoostClassifier(),
    'lgbm': LGBMClassifier(),
    'xgb': XGBClassifier(),
    'dt': DecisionTreeClassifier(),
    'ada': AdaBoostClassifier()
}

def train_and_evaluate_model(model_name, data, **params):
    """Train one model on an 80/20 split of ``data`` and print validation metrics.

    Args:
        model_name: Key into the module-level ``models`` registry.
        data: DataFrame holding the features plus a ``target`` column whose
            values are ``'Normal'`` / ``'AbNormal'``.
        **params: Hyperparameters forwarded to the model's ``set_params``.

    Returns:
        The fitted model, or ``None`` (after printing a message) when
        ``model_name`` is not in the registry.
    """
    if model_name not in models:
        print(f"{model_name}은(는) 지원되지 않는 모델입니다.")
        return None

    # Hold out 20% of the rows for validation; labels are encoded as 0/1.
    x_train, x_val, y_train, y_val = train_test_split(
        data.drop("target", axis=1),
        data["target"].map({'Normal': 0, 'AbNormal': 1}),
        test_size=0.2,
        shuffle=True,
        random_state=RANDOM_STATE,
    )

    # Build a fresh instance so the registry entry itself is never fitted.
    model = type(models[model_name])()
    model.set_params(**params)
    model.fit(x_train, y_train)

    # Look up the module-level variable name bound to `data`, for the report.
    # Fall back to a placeholder instead of raising IndexError when `data` is
    # not bound to any global (e.g. a copy or slice passed inline); otherwise
    # the already-fitted model would be lost.
    data_name = next(
        (name for name, value in globals().items() if value is data),
        "<unnamed>",
    )

    # Positive-class probability, thresholded with the module-level THRESHOLD.
    y_val_pred_proba = model.predict_proba(x_val)[:, 1]
    y_val_pred = (y_val_pred_proba >= THRESHOLD).astype(int)

    # Validation metrics
    f1 = f1_score(y_val, y_val_pred, average="binary")
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred, zero_division=0)
    recall = recall_score(y_val, y_val_pred)
    conf_matrix = confusion_matrix(y_val, y_val_pred)

    # Report
    print(f'{model_name} 모델이 {data_name} 데이터로 학습한 결과:')
    print(f'F1 Score: {f1}')
    print('---')
    print('Confusion Matrix:')
    print(conf_matrix)
    print('---')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print('\n')

    return model

def fit_all_train_data_function(model_name, data, **params):
    """Fit one model on every row of ``data`` (no validation split).

    Args:
        model_name: Key into the module-level ``models`` registry.
        data: DataFrame holding the features plus a ``target`` column whose
            values are ``'Normal'`` / ``'AbNormal'``.
        **params: Hyperparameters forwarded to the model's ``set_params``.

    Returns:
        The fitted model, or ``None`` (after printing a message) when
        ``model_name`` is not in the registry.
    """
    if model_name not in models:
        print(f"{model_name}은(는) 지원되지 않는 모델입니다.")
        return None

    # Build a fresh instance so the registry entry itself is never fitted.
    model = type(models[model_name])()
    model.set_params(**params)

    # Train on all rows; labels are encoded as 0/1.
    model.fit(data.drop("target", axis=1), data["target"].map({'Normal': 0, 'AbNormal': 1}))

    # Look up the module-level variable name bound to `data`, for the log line.
    # Fall back to a placeholder instead of raising IndexError when `data` is
    # not bound to any global (e.g. a copy or slice passed inline); otherwise
    # the model that was just trained would be discarded.
    data_name = next(
        (name for name, value in globals().items() if value is data),
        "<unnamed>",
    )

    print(f'{model_name} 모델이 {data_name} 데이터로 학습 완료')
    return model

def voting_function(data, estimators, voting='hard', threshold=0.5):
    """Fit a VotingClassifier on an 80/20 split of ``data`` and print validation metrics.

    Args:
        data: DataFrame holding the features plus a ``target`` column whose
            values are ``'Normal'`` / ``'AbNormal'``.
        estimators: ``(name, estimator)`` pairs handed to ``VotingClassifier``.
        voting: ``'hard'`` or ``'soft'``.
        threshold: Probability cut-off for the positive class; only used
            when ``voting == 'soft'``.

    Returns:
        The fitted ``VotingClassifier``.
    """
    features = data.drop("target", axis=1)
    labels = data["target"].map({'Normal': 0, 'AbNormal': 1})
    x_train, x_val, y_train, y_val = train_test_split(
        features,
        labels,
        test_size=0.2,
        shuffle=True,
        random_state=RANDOM_STATE,
    )

    ensemble = VotingClassifier(estimators=estimators, voting=voting)
    ensemble.fit(x_train, y_train)

    if voting != 'soft':
        # Hard voting: take the majority-vote labels directly.
        y_val_pred = ensemble.predict(x_val)
    else:
        # Soft voting: threshold the averaged positive-class probability.
        positive_proba = ensemble.predict_proba(x_val)[:, 1]
        y_val_pred = (positive_proba >= threshold).astype(int)

    # Validation metrics
    f1 = f1_score(y_val, y_val_pred, average="binary")
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred, zero_division=0)
    recall = recall_score(y_val, y_val_pred)
    conf_matrix = confusion_matrix(y_val, y_val_pred)

    # Report
    print('Voting Classifier로 학습한 결과:')
    print(f'F1 Score: {f1}')
    print('---')
    print('Confusion Matrix:')
    print(conf_matrix)
    print('---')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print('\n')

    return ensemble

def voting(preds_or_probs, method='soft', threshold=0.3):
    """Combine per-model outputs into final 0/1 predictions.

    Parameters:
    preds_or_probs (list of np.array): one array per model, all of the same
        length; predicted labels for hard voting, predicted positive-class
        probabilities for soft voting
    method (str): 'soft' or 'hard'
    threshold (float): for soft voting, the averaged probability at or above
        which a sample is predicted positive; ignored for hard voting

    Returns:
    np.array: final predictions

    Raises:
    ValueError: if ``method`` is neither 'soft' nor 'hard', or if
        ``preds_or_probs`` is empty
    """
    if method not in ('soft', 'hard'):
        raise ValueError("method 인자는 'soft' 또는 'hard'여야 합니다.")

    # An empty input would otherwise average to NaN and silently yield a
    # scalar 0 for soft voting; fail loudly, as hard voting already does.
    if np.size(preds_or_probs) == 0:
        raise ValueError("preds_or_probs에는 최소 한 개 이상의 모델 출력이 필요합니다.")

    if method == 'soft':
        # Soft voting: average the probabilities across models, then threshold.
        soft_voting_probs = np.mean(preds_or_probs, axis=0)
        return (soft_voting_probs >= threshold).astype(int)

    # Hard voting: majority label per sample (ties resolve to the smaller label,
    # because argmax returns the first maximum of the bincount).
    preds = np.array(preds_or_probs)
    return np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=preds)

공정별 모델 구축

lightgbm

In [None]:
# LightGBM: fit one model per process subset (Dam, AutoClave, Fill1, Fill2)
# plus one on all columns, each on the full training data with its own
# hyperparameters, then soft-vote their test-set probabilities.
model_Dam = fit_all_train_data_function(
    'lgbm', train_data_dam
    , n_estimators=1467
    , num_leaves=2545
    , max_depth=37
    , learning_rate=0.04353920224587149 
    , min_child_samples=83
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

model_AutoClave = fit_all_train_data_function(
    'lgbm', train_data_autoclave
    , n_estimators=1563
    , num_leaves=1885
    , max_depth=15
    , learning_rate=0.07033655355880039 
    , min_child_samples=158
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

model_Fill1 = fit_all_train_data_function(
    'lgbm', train_data_fill1
    , n_estimators=1452
    , num_leaves=1581
    , max_depth=22
    , learning_rate=0.002000452888170992 
    , min_child_samples=43
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

model_Fill2 = fit_all_train_data_function(
    'lgbm', train_data_fill2
    , n_estimators=1632
    , num_leaves=1426
    , max_depth=8
    , learning_rate=0.07487990991624197 
    , min_child_samples=90
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

model_All = fit_all_train_data_function(
    'lgbm', train_data
    , n_estimators=2383
    , num_leaves=2528
    , max_depth=343
    , learning_rate=0.04661896043153508
    , min_child_samples=209
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

# Build the prediction inputs: drop the label and the row identifier
x_test_dam = test_data_dam.drop(["target", "Set ID"], axis=1)
x_test_autoclave = test_data_autoclave.drop(["target", "Set ID"], axis=1)
x_test_fill1 = test_data_fill1.drop(["target", "Set ID"], axis=1)
x_test_fill2 = test_data_fill2.drop(["target", "Set ID"], axis=1)
x_test_all = test_data.drop(["target", "Set ID"], axis=1)

# Positive-class probabilities from each model (input to soft voting)
probs = [
    model_Dam.predict_proba(x_test_dam)[:, 1]
    , model_AutoClave.predict_proba(x_test_autoclave)[:, 1]
    , model_Fill1.predict_proba(x_test_fill1)[:, 1]
    , model_Fill2.predict_proba(x_test_fill2)[:, 1]
    , model_All.predict_proba(x_test_all)[:, 1]
]

# Soft-voting result; the printed sum is the number of rows predicted AbNormal
final_predictions = voting(probs, method='soft', threshold=0.26)
print(sum(final_predictions))

# Load the submission template
# (original note: "df_test holds the preprocessed data")
# assumes the template rows are in the same order as test_data — TODO confirm
df_sub = pd.read_csv("./data/submission.csv")
df_sub["target"] = final_predictions

# Convert the 0/1 values in df_sub['target'] to string labels
df_sub['target'] = df_sub['target'].apply(lambda x: 'AbNormal' if x == 1 else 'Normal')

# Save the submission file
df_sub.to_csv("./data/data0817_lgbm.csv", index=False)

xgb

In [None]:
# XGBoost: fit one model per process subset (Dam, AutoClave, Fill1, Fill2)
# plus one on all columns, each on the full training data with its own
# hyperparameters, then soft-vote their test-set probabilities.
# NOTE(review): every call passes both `alpha` and `reg_alpha`; XGBoost's
# parameter docs appear to list `alpha` as an alias of `reg_alpha`, so one of
# the two values may be ignored — confirm which one is intended.
model_Dam = fit_all_train_data_function(
    'xgb', train_data_dam
    , n_estimators = 1509
    , learning_rate = 0.06418917852996714
    , max_depth = 6
    , alpha = 0.00017309557032608048
    , gamma = 0.00398067155434722
    , reg_alpha = 0.756006595834120
    , reg_lambda = 0.3962538649486449
    , colsample_bytree =0.8752205595930229
    , subsample = 0.224637741333797
    , objective = 'binary:logistic'
    , tree_method = 'exact'
    , random_state=RANDOM_STATE
)

model_AutoClave = fit_all_train_data_function(
    'xgb', train_data_autoclave,
    n_estimators = 1539, 
    learning_rate = 0.026860419341696404, 
    max_depth = 14, 
    alpha = 1.9237525550524492e-05, 
    gamma = 2.2016346534611754e-05, 
    reg_alpha = 0.9148863773292526, 
    reg_lambda = 0.6194458787523232, 
    colsample_bytree = 0.902872150299903, 
    subsample = 0.10750014546599479,
    objective = 'binary:logistic', 
    tree_method = "exact", 
    random_state=RANDOM_STATE
)

model_Fill1 = fit_all_train_data_function(
    'xgb', train_data_fill1,
    n_estimators = 1707, 
    learning_rate = 0.0321470219836192, 
    max_depth = 7, 
    alpha = 7.368872823521818e-05, 
    gamma = 0.0007930035188326916, 
    reg_alpha = 0.644199314174124, 
    reg_lambda = 0.588270569327407, 
    colsample_bytree = 0.883929103208459, 
    subsample = 0.2534703342501092,
    objective = 'binary:logistic',
    tree_method = 'exact',
    random_state=RANDOM_STATE
)

model_Fill2 = fit_all_train_data_function(
    'xgb', train_data_fill2,
    n_estimators = 1998, 
    learning_rate = 0.030898693059763598, 
    max_depth = 8, 
    alpha = 0.0017554538174868774, 
    gamma = 0.0007257577447593802, 
    reg_alpha = 0.7581280398368035, 
    reg_lambda = 0.5872331353519633, 
    colsample_bytree = 0.56275606593282, 
    subsample = 0.8342870707789082,
    objective = 'binary:logistic',
    tree_method = 'exact',
    random_state=RANDOM_STATE
)

model_All = fit_all_train_data_function(
    'xgb', train_data,
    n_estimators = 2287, 
    learning_rate = 0.046904208411195795, 
    max_depth = 8, 
    alpha = 1.9343531171735368e-05, 
    gamma = 0.002118564280859176, 
    reg_alpha = 0.6827713868263061, 
    reg_lambda = 0.05035980721174918, 
    colsample_bytree = 0.8959193125044248, 
    subsample = 0.43471952905681815,
    objective = 'binary:logistic',  
    tree_method = "exact", 
    random_state=RANDOM_STATE
)

# Build the prediction inputs: drop the label and the row identifier
x_test_dam = test_data_dam.drop(["target", "Set ID"], axis=1)
x_test_autoclave = test_data_autoclave.drop(["target", "Set ID"], axis=1)
x_test_fill1 = test_data_fill1.drop(["target", "Set ID"], axis=1)
x_test_fill2 = test_data_fill2.drop(["target", "Set ID"], axis=1)
x_test_all = test_data.drop(["target", "Set ID"], axis=1)


# Positive-class probabilities from each model (input to soft voting)
probs = [
    model_Dam.predict_proba(x_test_dam)[:, 1]
    , model_AutoClave.predict_proba(x_test_autoclave)[:, 1]
    , model_Fill1.predict_proba(x_test_fill1)[:, 1]
    , model_Fill2.predict_proba(x_test_fill2)[:, 1]
    , model_All.predict_proba(x_test_all)[:, 1]
]

# Soft-voting result; the printed sum is the number of rows predicted AbNormal
final_predictions = voting(probs, method='soft', threshold=0.24)
print(sum(final_predictions))

# Load the submission template
# (original note: "df_test holds the preprocessed data")
# assumes the template rows are in the same order as test_data — TODO confirm
df_sub = pd.read_csv("./data/submission.csv")
df_sub["target"] = final_predictions

# Convert the 0/1 values in df_sub['target'] to string labels
df_sub['target'] = df_sub['target'].apply(lambda x: 'AbNormal' if x == 1 else 'Normal')

# Save the submission file
df_sub.to_csv("./data/data0817_xgb.csv", index=False)

In [None]:
# CatBoost: fit one model per process subset (Dam, AutoClave, Fill1, Fill2)
# plus one on all columns, each on the full training data with its own
# hyperparameters, then soft-vote their test-set probabilities.
model_Dam = fit_all_train_data_function(
    'cat', train_data_dam,
    iterations=2083,
    learning_rate=0.023925705983940986,
    depth=11,
    l2_leaf_reg=0.05919257514332274,
    random_strength=7.259397831551647,
    bagging_temperature=5.39094676652102,
    border_count=234,
    scale_pos_weight=1.776413991309166,
    grow_policy='Lossguide',
    random_seed=RANDOM_STATE,
    eval_metric='F1',
    logging_level='Silent',
    boosting_type='Plain'
)

# NOTE(review): this call seeds with `random_state` while the other four use
# `random_seed` — presumably synonyms in CatBoost; confirm and unify.
model_AutoClave = fit_all_train_data_function(
    'cat', train_data_autoclave,
    iterations = 2786, 
    learning_rate = 0.016342560305036093, 
    depth = 8, 
    l2_leaf_reg = 3.7187150890684246, 
    random_strength = 0.13164684607188099, 
    bagging_temperature = 9.823498597792092, 
    border_count = 158, 
    scale_pos_weight = 1.8735070170496537,
    grow_policy = 'SymmetricTree',
    random_state = RANDOM_STATE,
    eval_metric = 'F1',
    logging_level = 'Silent',
    boosting_type = 'Plain'
)

model_Fill1 = fit_all_train_data_function(
    'cat', train_data_fill1,
    iterations=1489,
    learning_rate=0.011481405951174946,
    depth=6,
    l2_leaf_reg=0.12082259365361882,
    random_strength=2.5111358694495056,
    bagging_temperature=2.06264856742851,
    border_count=331,
    scale_pos_weight=2.3505422278535173,
    grow_policy='Lossguide',
    random_seed=RANDOM_STATE,
    eval_metric='F1',
    logging_level='Silent',
    boosting_type='Plain'
)

model_Fill2 = fit_all_train_data_function(
    'cat', train_data_fill2,
    iterations=481,
    learning_rate=0.018742270357007457,
    depth=5,
    l2_leaf_reg=1.0871571324663387,
    random_strength=3.49632241801363,
    bagging_temperature=5.717049796462913,
    border_count=183,
    scale_pos_weight=3.4406776189795383,
    grow_policy='SymmetricTree',
    random_seed=RANDOM_STATE,
    eval_metric='F1',
    logging_level='Silent',
    boosting_type='Plain'
)

model_All = fit_all_train_data_function(
    'cat', train_data,
    iterations=2752,
    learning_rate=0.01750223610140175,
    depth=7,
    l2_leaf_reg=1.0323106799772723,
    random_strength=11.120538157516553,
    bagging_temperature=9.844903580231264,
    border_count=140,
    scale_pos_weight=2.5890657068422374,
    grow_policy='Depthwise',
    random_seed=RANDOM_STATE,
    eval_metric='F1',
    logging_level='Silent',
    boosting_type='Plain'
)

# Build the prediction inputs: drop the label and the row identifier
x_test_dam = test_data_dam.drop(["target", "Set ID"], axis=1)
x_test_autoclave = test_data_autoclave.drop(["target", "Set ID"], axis=1)
x_test_fill1 = test_data_fill1.drop(["target", "Set ID"], axis=1)
x_test_fill2 = test_data_fill2.drop(["target", "Set ID"], axis=1)
x_test_all = test_data.drop(["target", "Set ID"], axis=1)

# Positive-class probabilities from each model (input to soft voting)
probs = [
    model_Dam.predict_proba(x_test_dam)[:, 1]
    , model_AutoClave.predict_proba(x_test_autoclave)[:, 1]
    , model_Fill1.predict_proba(x_test_fill1)[:, 1]
    , model_Fill2.predict_proba(x_test_fill2)[:, 1]
    , model_All.predict_proba(x_test_all)[:, 1]
]

# Soft-voting result; the printed sum is the number of rows predicted AbNormal
final_predictions = voting(probs, method='soft', threshold=0.3)
print(sum(final_predictions))

# Load the submission template
# (original note: "df_test holds the preprocessed data")
# assumes the template rows are in the same order as test_data — TODO confirm
df_sub = pd.read_csv("./data/submission.csv")
df_sub["target"] = final_predictions

# Convert the 0/1 values in df_sub['target'] to string labels
df_sub['target'] = df_sub['target'].apply(lambda x: 'AbNormal' if x == 1 else 'Normal')

# Save the submission file
df_sub.to_csv("./data/data0817_cat.csv", index=False)

et

In [None]:

# ExtraTrees: fit one model per process subset (Dam, AutoClave, Fill1, Fill2)
# plus one on all columns, each on the full training data with its own
# hyperparameters, then soft-vote their test-set probabilities.
model_Dam = fit_all_train_data_function(
    'et', train_data_dam
    , n_estimators = 2242
    , max_depth = 32
    , min_samples_split = 2
    , min_samples_leaf = 1
    , criterion = 'entropy'
    , bootstrap = False
    , random_state=RANDOM_STATE
)

model_AutoClave = fit_all_train_data_function(
    'et', train_data_autoclave,
    n_estimators = 2708,
    max_depth = 41,
    min_samples_split = 8,
    min_samples_leaf = 1,
    criterion = 'entropy',
    bootstrap = False,
    random_state=RANDOM_STATE
)

model_Fill1 = fit_all_train_data_function(
    'et', train_data_fill1,
    n_estimators = 1520,
    max_depth = 30,
    min_samples_split = 2,
    min_samples_leaf = 1,
    criterion = 'entropy',
    bootstrap = False,
    random_state=RANDOM_STATE
)

model_Fill2 = fit_all_train_data_function(
    'et', train_data_fill2
    , n_estimators = 1001
    , max_depth = 45
    , min_samples_split = 3
    , min_samples_leaf = 1
    , criterion = 'entropy'
    , bootstrap = False
    , random_state=RANDOM_STATE
)

model_All = fit_all_train_data_function(
    'et', train_data
    , n_estimators = 2884
    , max_depth = 56
    , min_samples_split = 3
    , min_samples_leaf = 1
    , criterion = 'entropy'
    , bootstrap = False
    , random_state=RANDOM_STATE
)

# Build the prediction inputs: drop the label and the row identifier
x_test_dam = test_data_dam.drop(["target", "Set ID"], axis=1)
x_test_autoclave = test_data_autoclave.drop(["target", "Set ID"], axis=1)
x_test_fill1 = test_data_fill1.drop(["target", "Set ID"], axis=1)
x_test_fill2 = test_data_fill2.drop(["target", "Set ID"], axis=1)
x_test_all = test_data.drop(["target", "Set ID"], axis=1)

# Positive-class probabilities from each model (input to soft voting)
probs = [
    model_Dam.predict_proba(x_test_dam)[:, 1]
    , model_AutoClave.predict_proba(x_test_autoclave)[:, 1]
    , model_Fill1.predict_proba(x_test_fill1)[:, 1]
    , model_Fill2.predict_proba(x_test_fill2)[:, 1]
    , model_All.predict_proba(x_test_all)[:, 1]
]

# Soft-voting result; the printed sum is the number of rows predicted AbNormal
final_predictions = voting(probs, method='soft', threshold=0.3)
print(sum(final_predictions))

# Load the submission template
# (original note: "df_test holds the preprocessed data")
# assumes the template rows are in the same order as test_data — TODO confirm
df_sub = pd.read_csv("./data/submission.csv")
df_sub["target"] = final_predictions

# Convert the 0/1 values in df_sub['target'] to string labels
df_sub['target'] = df_sub['target'].apply(lambda x: 'AbNormal' if x == 1 else 'Normal')

# Save the submission file
df_sub.to_csv("./data/data0817_et.csv", index=False)

rf

In [None]:
# RandomForest: fit one model per process subset (Dam, AutoClave, Fill1, Fill2)
# plus one on all columns, each on the full training data with its own
# hyperparameters, then soft-vote their test-set probabilities.
# The Dam and AutoClave models set bootstrap=False; the Fill1, Fill2 and
# all-column models set class_weight='balanced' instead.
model_Dam = fit_all_train_data_function(
    'rf', train_data_dam
    , n_estimators = 1330
    , max_depth = 36
    , min_samples_split = 6
    , min_samples_leaf = 1
    , criterion = 'entropy'
    , bootstrap = False
    , random_state=RANDOM_STATE
)

model_AutoClave = fit_all_train_data_function(
    'rf', train_data_autoclave
    , n_estimators = 1103
    , max_depth = 36
    , min_samples_split = 8
    , min_samples_leaf = 1
    , criterion = 'entropy'
    , bootstrap = False
    , random_state=RANDOM_STATE
)

model_Fill1 = fit_all_train_data_function(
    'rf', train_data_fill1
    , n_estimators = 1861
    , max_depth = 91
    , min_samples_split = 7
    , min_samples_leaf = 5
    , criterion = 'entropy'
    , class_weight = 'balanced'
    , random_state=RANDOM_STATE
)

model_Fill2 = fit_all_train_data_function(
    'rf', train_data_fill2
    , n_estimators = 2663
    , max_depth = 100
    , min_samples_split = 6
    , min_samples_leaf = 1
    , criterion = 'entropy'
    , class_weight = 'balanced'
    , random_state=RANDOM_STATE
)

model_All = fit_all_train_data_function(
    'rf', train_data
    , n_estimators = 1082
    , max_depth = 54
    , min_samples_split = 6
    , min_samples_leaf = 1
    , criterion = 'entropy'
    , class_weight = 'balanced'
    , random_state=RANDOM_STATE
)

# Build the prediction inputs: drop the label and the row identifier
x_test_dam = test_data_dam.drop(["target", "Set ID"], axis=1)
x_test_autoclave = test_data_autoclave.drop(["target", "Set ID"], axis=1)
x_test_fill1 = test_data_fill1.drop(["target", "Set ID"], axis=1)
x_test_fill2 = test_data_fill2.drop(["target", "Set ID"], axis=1)
x_test_all = test_data.drop(["target", "Set ID"], axis=1)

# Positive-class probabilities from each model (input to soft voting)
probs = [
    model_Dam.predict_proba(x_test_dam)[:, 1]
    , model_AutoClave.predict_proba(x_test_autoclave)[:, 1]
    , model_Fill1.predict_proba(x_test_fill1)[:, 1]
    , model_Fill2.predict_proba(x_test_fill2)[:, 1]
    , model_All.predict_proba(x_test_all)[:, 1]
]

# Soft-voting result; the printed sum is the number of rows predicted AbNormal
final_predictions = voting(probs, method='soft', threshold=0.3)
print(sum(final_predictions))

# Load the submission template
# (original note: "df_test holds the preprocessed data")
# assumes the template rows are in the same order as test_data — TODO confirm
df_sub = pd.read_csv("./data/submission.csv")
df_sub["target"] = final_predictions

# Convert the 0/1 values in df_sub['target'] to string labels
df_sub['target'] = df_sub['target'].apply(lambda x: 'AbNormal' if x == 1 else 'Normal')

# Save the submission file
df_sub.to_csv("./data/data0817_rf.csv", index=False)

ada

In [None]:
# AdaBoost: fit one model per process subset (Dam, AutoClave, Fill1, Fill2)
# plus one on all columns, each on the full training data, then soft-vote
# their test-set probabilities.
# `base_estimator` is rebound to a differently configured DecisionTreeClassifier
# before each fit and passed to AdaBoost through the `estimator` parameter.

# Base tree for the Dam model
base_estimator = DecisionTreeClassifier(
    max_depth=22,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features=0.9449544624225188,
    random_state=RANDOM_STATE
)

model_Dam = fit_all_train_data_function(
    'ada', train_data_dam
    , estimator=base_estimator
    , n_estimators=439
    , learning_rate=0.30985993372769294
    , random_state=RANDOM_STATE
)

# Base tree for the AutoClave model
base_estimator = DecisionTreeClassifier(
    max_depth=13,
    min_samples_split=34,
    min_samples_leaf=5,
    max_features=0.7473309094470472,
    random_state=RANDOM_STATE
)

model_AutoClave = fit_all_train_data_function(
    'ada', train_data_autoclave
    , estimator=base_estimator
    , n_estimators=570
    , learning_rate=0.2040105705276999
    , random_state=RANDOM_STATE
)

# Base tree for the Fill1 model
base_estimator = DecisionTreeClassifier(
    max_depth=14,
    min_samples_split=33,
    min_samples_leaf=8,
    max_features=0.7113128413756866,
    random_state=RANDOM_STATE
)

model_Fill1 = fit_all_train_data_function(
    'ada', train_data_fill1
    , estimator=base_estimator
    , n_estimators=913
    , learning_rate=0.055237331816147595
    , random_state=RANDOM_STATE
)

# Base tree for the Fill2 model
base_estimator = DecisionTreeClassifier(
    max_depth=7,
    min_samples_split=13,
    min_samples_leaf=8,
    max_features=0.6266118401157937,
    random_state=RANDOM_STATE
)

model_Fill2 = fit_all_train_data_function(
    'ada', train_data_fill2
    , estimator=base_estimator
    , n_estimators=293
    , learning_rate=0.620377973483163
    , random_state=RANDOM_STATE
)

# Base tree for the all-column model
base_estimator = DecisionTreeClassifier(
    max_depth=6,
    min_samples_split=28,
    min_samples_leaf=7,
    max_features=0.7331591188366589,
    random_state=RANDOM_STATE
)

model_All = fit_all_train_data_function(
    'ada', train_data
    , estimator=base_estimator
    , n_estimators=677
    , learning_rate=0.6713565955468803
    , random_state=RANDOM_STATE
)

# Build the prediction inputs: drop the label and the row identifier
x_test_dam = test_data_dam.drop(["target", "Set ID"], axis=1)
x_test_autoclave = test_data_autoclave.drop(["target", "Set ID"], axis=1)
x_test_fill1 = test_data_fill1.drop(["target", "Set ID"], axis=1)
x_test_fill2 = test_data_fill2.drop(["target", "Set ID"], axis=1)
x_test_all = test_data.drop(["target", "Set ID"], axis=1)

# Positive-class probabilities from each model (input to soft voting)
probs = [
    model_Dam.predict_proba(x_test_dam)[:, 1]
    , model_AutoClave.predict_proba(x_test_autoclave)[:, 1]
    , model_Fill1.predict_proba(x_test_fill1)[:, 1]
    , model_Fill2.predict_proba(x_test_fill2)[:, 1]
    , model_All.predict_proba(x_test_all)[:, 1]
]

# Soft-voting result; the printed sum is the number of rows predicted AbNormal
final_predictions = voting(probs, method='soft', threshold=0.3)
print(sum(final_predictions))

# Load the submission template
# (original note: "df_test holds the preprocessed data")
# assumes the template rows are in the same order as test_data — TODO confirm
df_sub = pd.read_csv("./data/submission.csv")
df_sub["target"] = final_predictions

# Convert the 0/1 values in df_sub['target'] to string labels
df_sub['target'] = df_sub['target'].apply(lambda x: 'AbNormal' if x == 1 else 'Normal')

# Save the submission file
df_sub.to_csv("./data/data0817_ada.csv", index=False)

### Hard voting

공정별로 구분하여 학습한 개별 모델들의 예측 결과에 대해 hard voting 수행

In [None]:

import pandas as pd
import numpy as np
from collections import Counter

def read_submission_files(file_paths):
    """
    Load submission CSVs and return their predictions as 0/1 arrays.

    Parameters:
    file_paths (list of str): paths of the submission files to read

    Returns:
    list of np.array: one array per file, 1 where the 'target' column
        equals 'AbNormal' and 0 otherwise
    """
    def _to_binary(label):
        return 1 if label == 'AbNormal' else 0

    return [
        pd.read_csv(path)['target'].apply(_to_binary).values
        for path in file_paths
    ]

def hard_voting(preds):
    """
    Majority-vote the per-model predictions and print how often each vote pattern occurs.

    Parameters:
    preds (list of np.array): one array of integer predictions per model

    Returns:
    np.array: majority label for each sample
    """
    vote_matrix = np.array(preds)

    # Tally each sample's vote pattern, e.g. "110" for three models,
    # in order of first appearance.
    pattern_counts = Counter()
    for sample_votes in vote_matrix.T:
        pattern_counts[''.join(str(vote) for vote in sample_votes)] += 1

    # Report how often each pattern occurred
    for pattern, occurrences in pattern_counts.items():
        print(f"Prediction {pattern}: {occurrences} times")

    def _majority(votes):
        # The most frequent label wins.
        return np.bincount(votes).argmax()

    return np.apply_along_axis(_majority, axis=0, arr=vote_matrix)

성능이 좋은 3가지 모델(lgbm, xgb, cat)의 제출 파일을 2번씩 넣어 줌으로써 가중치를 주는 효과를 냄  
종합적으로 [lgbm, xgb, cat, et, rf, ada]의 가중치는 [2,2,2,1,1,1] 

In [None]:
import pandas as pd

# Directory prefix shared by every per-model submission file
common_path = "./data/"

# Per-model submission files. lgbm, xgb and cat are listed twice so that each
# of them counts double in the majority vote.
file_names = [
    "data0817_lgbm.csv",
    "data0817_lgbm.csv",
    "data0817_xgb.csv",
    "data0817_xgb.csv",
    "data0817_cat.csv",
    "data0817_cat.csv",
    "data0817_et.csv",
    "data0817_rf.csv",
    "data0817_ada.csv",
    # Append more files here if needed.
]


def add_common_path(file_names, common_path):
    """Return every file name prefixed with ``common_path``."""
    prefixed = []
    for file_name in file_names:
        prefixed.append(common_path + file_name)
    return prefixed


# Full paths -> per-file 0/1 predictions -> majority vote
file_paths = add_common_path(file_names, common_path)
predictions = read_submission_files(file_paths)
final_predictions_hard = hard_voting(predictions)

# Name of the final submission file (change as needed)
output_file_name = "submission.csv"

# Reuse the first submission as a template and overwrite its target column
df_sub = pd.read_csv(file_paths[0])
df_sub["target"] = final_predictions_hard
df_sub["target"] = df_sub["target"].map(
    lambda vote: "AbNormal" if vote == 1 else "Normal"
)
df_sub.to_csv(output_file_name, index=False)

print(f"최종 제출 파일이 '{output_file_name}'로 저장되었습니다.")

In [None]:
# Show how many rows of the final submission carry each label.
df_sub['target'].value_counts()

In [None]:
# Preview the first 10 rows of the final submission.
df_sub.head(10)

우측 상단의 제출 버튼을 클릭해 결과를 확인하세요

.