# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [122]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 데이터 읽어오기


In [123]:
# Presumably the random seed for later steps; it is not referenced in the visible part of the notebook
RANDOM_STATE = 110

# Load the EDA versions of the train and test sets (paths are relative to the working directory)
train_data = pd.read_csv("train_data_forEDA.csv")
test_data = pd.read_csv("test_data_forEDA.csv")

---

반복적으로 쓰는 툴 함수화

In [124]:
def plot_box(dataframe, column_name):
    """
    Draw a horizontal box plot of one DataFrame column.

    Missing values are removed before plotting: ``plt.boxplot`` does not
    skip NaN, so a single missing value would leave the box undrawn.

    Parameters:
    dataframe (pd.DataFrame): source data
    column_name (str): name of the column to plot
    """
    # Plot only the observed values; the caller's frame is not modified.
    values = dataframe[column_name].dropna()

    plt.figure(figsize=(10, 6))
    plt.boxplot(values, vert=False)
    plt.xlabel(column_name)
    plt.title(f'Box Plot of {column_name}')
    plt.show()

In [125]:
import pandas as pd

def value_counts_ratio_count(df, col_name, target_name):
    """
    Print, for every distinct value of a column, the ratio and count of each
    target class plus the total number of rows with that value.

    The printed table shows the ratio columns first, then the count columns,
    then the total. Ratio and count columns share the class label as header.

    Parameters:
    df (pd.DataFrame): source data
    col_name (str): column whose distinct values are summarized
    target_name (str): name of the target column
    """
    grouped_target = df.groupby(col_name)[target_name]

    # Share of each target class per value; classes absent for a value become 0
    ratios = grouped_target.value_counts(normalize=True).unstack().fillna(0)

    # Number of rows of each target class per value
    counts = grouped_target.value_counts().unstack().fillna(0)

    # Total number of rows per value
    total_counts = df[col_name].value_counts().rename('Total_Count')

    # Ratio and count columns have the same labels, so both sides get a suffix
    result = ratios.join(counts, lsuffix='_ratio', rsuffix='_count')
    result = result.join(total_counts, on=col_name)

    result.index.name = 'variable'
    print(f"\n{col_name}별 {target_name} 비율 및 갯수\n")
    # Strip only the trailing '_ratio' / '_count' / '_Count' suffix so that a
    # class label which itself contains an underscore is shown in full.
    print(result.rename(columns=lambda label: label.rsplit('_', 1)[0]))

In [126]:
import pandas as pd

def summarize_grouped_data(df, group_by_columns, target_name='target', target_value='AbNormal'):
    """
    Summarize how often ``target_value`` occurs in each group of ``df``.

    The rows are grouped by ``group_by_columns``; for every group the number
    of rows whose ``target_name`` equals ``target_value``, that number as a
    share of the group, and the group size are collected. The table is
    printed and returned.

    Parameters:
    df (pd.DataFrame): source data
    group_by_columns (str or list): column name(s) to group by
    target_name (str): name of the target column (default 'target')
    target_value: target class to count (default 'AbNormal')

    Returns:
    pd.DataFrame: one row per group with the columns
        'group', "'AdNormal' count", 'ratio' and 'Total'.
        The misspelled count column name is kept so existing callers
        that read it keep working.

    Note: ``groupby`` is called with its default settings, so rows with a
    missing value in a grouping column are not part of any group.
    """
    # A bare string is a valid groupby key, but joining it directly would
    # split it into single characters in the header line.
    if isinstance(group_by_columns, str):
        header_labels = [group_by_columns]
    else:
        header_labels = [str(label) for label in group_by_columns]

    results = []
    for name, group in df.groupby(group_by_columns):
        group_count = group.shape[0]

        # Count and share of the requested target class inside this group
        adnormal_count = int((group[target_name] == target_value).sum())
        adnormal_ratio = adnormal_count / group_count

        results.append([name, adnormal_count, adnormal_ratio, group_count])

    results_df = pd.DataFrame(results, columns=['group', "'AdNormal' count", 'ratio', 'Total'])

    # Header line naming the grouping columns, then the table itself
    print(f"Grouped by: {', '.join(header_labels)}")
    print()
    print(results_df)

    return results_df

In [127]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_abnormal_ratio(dataframe, column_name, target_name, target_value, bins=20):
    """
    Bin a numeric column and plot, per bin, the share of rows whose target
    equals ``target_value``. Each bar is annotated with "ratio (row count)".

    The input frame is not modified; binning is done on a separate Series.

    Parameters:
    dataframe (pd.DataFrame): source data
    column_name (str): column to bin
    target_name (str): name of the target column
    target_value (str): target class whose share is plotted
    bins (int): number of bins passed to ``pd.cut`` (default 20)
    """
    bins_col = f'{column_name}_bins'

    # Bin on a stand-alone Series so no temporary column is left behind on
    # the caller's frame if plotting fails.
    binned = pd.cut(dataframe[column_name], bins=bins).rename(bins_col)
    is_target = dataframe[target_name].eq(target_value).astype(float)

    # observed=False keeps empty bins (ratio NaN, count 0), so the rows of
    # `ratios` line up one-to-one with the bin categories.
    ratios = pd.DataFrame({
        'ratio': is_target.groupby(binned, observed=False).mean(),
        'count': dataframe[target_name].groupby(binned, observed=False).count(),
    }).reset_index()

    plt.figure(figsize=(20, 10))
    barplot = sns.barplot(x=bins_col, y='ratio', data=ratios, color='skyblue')
    plt.xlabel(f'{column_name} (binned)')
    plt.ylabel(f'{target_value} Ratio')
    plt.title(f'{target_value} Ratio by {column_name} (binned)', pad=30)  # pad separates the title from the plot
    plt.xticks(rotation=45)
    plt.ylim(0, 1)

    # Annotate every bar with its ratio and the number of rows in the bin.
    for p in barplot.patches:
        height = p.get_height()
        if pd.isna(height):
            # Empty bin: there is no bar to annotate
            continue

        # Assumes seaborn centers the bar of the i-th category at x = i,
        # so the rounded center is the row index in `ratios`.
        position = int(round(p.get_x() + p.get_width() / 2))
        if not 0 <= position < len(ratios):
            continue

        count_value = ratios['count'].iloc[position]
        barplot.annotate(f'{format(height, ".2f")} ({count_value})',
                         (p.get_x() + p.get_width() / 2., height),
                         ha='center', va='center',
                         xytext=(0, 9),
                         textcoords='offset points')

    plt.show()


---

## Dam

## AutoClave

## Fill1

## Fill2

In [128]:
# Select the columns whose name contains '_Fill2'
Process_Desc_col = train_data.filter(like='_Fill2').columns

# Print the selected column names
print("<Fill2 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<Fill2 공정 관련 변수>
CURE END POSITION X Collect Result_Fill2
CURE END POSITION Z Collect Result_Fill2
CURE SPEED Collect Result_Fill2
CURE STANDBY POSITION Z Collect Result_Fill2
CURE START POSITION X Collect Result_Fill2
CURE START POSITION Z Collect Result_Fill2
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2
HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2
HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2
HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2
HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2
HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2
HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2
HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2
HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2
Head Purge Position Z Collect Result_Fill2
Machine Tact time Collect Result_Fill2
PalletID Collect Result_Fill2
Production Qty Collect Result_Fill2
Rec

In [129]:
# Select the columns whose name contains both '_Fill2' and 'CURE'
Process_Desc_col = train_data.filter(like='_Fill2').columns
filtered_columns = [col for col in Process_Desc_col if 'CURE' in col]


print("\n Fill2 공정 관련 변수 중 CURE 포함 변수>")
for col in filtered_columns:
    print(col)


 Fill2 공정 관련 변수 중 CURE 포함 변수>
CURE END POSITION X Collect Result_Fill2
CURE END POSITION Z Collect Result_Fill2
CURE SPEED Collect Result_Fill2
CURE STANDBY POSITION Z Collect Result_Fill2
CURE START POSITION X Collect Result_Fill2
CURE START POSITION Z Collect Result_Fill2


In [130]:
import numpy as np

# Column names of the CURE start and end positions
start_x_col = 'CURE START POSITION X Collect Result_Fill2'
start_z_col = 'CURE START POSITION Z Collect Result_Fill2'
end_x_col = 'CURE END POSITION X Collect Result_Fill2'
end_z_col = 'CURE END POSITION Z Collect Result_Fill2'
cure_position_cols = [start_x_col, start_z_col, end_x_col, end_z_col]


def _cure_distance(frame):
    """Return the straight-line distance between the CURE start and end positions (X and Z axes)."""
    return np.sqrt(
        (frame[end_x_col] - frame[start_x_col]) ** 2 +
        (frame[end_z_col] - frame[start_z_col]) ** 2
    )


train_data['CURE_DISTANCE_Fill2'] = _cure_distance(train_data)

# The raw position columns are dropped from test_data further down as well, so
# the feature has to be derived for test_data here or it can never be built.
# The guard keeps the cell working if the test file lacks these columns.
if all(col in test_data.columns for col in cure_position_cols):
    test_data['CURE_DISTANCE_Fill2'] = _cure_distance(test_data)

# Check the result
value_counts_ratio_count(train_data, 'CURE_DISTANCE_Fill2', 'target')


CURE_DISTANCE_Fill2별 target 비율 및 갯수

            AbNormal    Normal  AbNormal   Normal  Total
variable                                                
780.000000  0.055409  0.944591    1990.0  33925.0  35915
780.000641  0.077019  0.922981     309.0   3703.0   4012
780.064100  0.088235  0.911765      51.0    527.0    578
780.077560  0.000000  1.000000       0.0      1.0      1


In [131]:
# CURE time = travel distance / CURE speed
train_data['CURE_Time_Fill2'] = train_data['CURE_DISTANCE_Fill2'] / train_data['CURE SPEED Collect Result_Fill2']

# Give test_data the same feature whenever both inputs are available there,
# so the train and test frames keep the same engineered columns.
if {'CURE_DISTANCE_Fill2', 'CURE SPEED Collect Result_Fill2'} <= set(test_data.columns):
    test_data['CURE_Time_Fill2'] = test_data['CURE_DISTANCE_Fill2'] / test_data['CURE SPEED Collect Result_Fill2']

In [132]:
# Target ratio and count for each distinct CURE_Time_Fill2 value
value_counts_ratio_count(train_data, 'CURE_Time_Fill2', 'target')


CURE_Time_Fill2별 target 비율 및 갯수

           AbNormal    Normal  AbNormal   Normal  Total
variable                                               
10.400000  0.000000  1.000000       0.0      1.0      1
14.181818  0.070632  0.929368      19.0    250.0    269
14.716981  0.000000  1.000000       0.0     36.0     36
15.294118  0.071146  0.928854      18.0    235.0    253
15.295375  0.068592  0.931408      19.0    258.0    277
15.600000  0.054911  0.945089    1667.0  28691.0  30358
15.600013  0.077019  0.922981     309.0   3703.0   4012
15.601282  0.106312  0.893688      32.0    269.0    301
15.601551  0.000000  1.000000       0.0      1.0      1
16.250000  0.046212  0.953788     122.0   2518.0   2640
17.333333  0.069367  0.930633     161.0   2160.0   2321
19.500000  0.081081  0.918919       3.0     34.0     37


In [133]:
# Select the columns whose name contains both '_Fill2' and 'CURE'
Process_Desc_col = train_data.filter(like='_Fill2').columns
filtered_columns = [col for col in Process_Desc_col if 'CURE' in col]


print("\n Fill2 공정 관련 변수 중 CURE 포함 변수>")
for col in filtered_columns:
    print(col)


 Fill2 공정 관련 변수 중 CURE 포함 변수>
CURE END POSITION X Collect Result_Fill2
CURE END POSITION Z Collect Result_Fill2
CURE SPEED Collect Result_Fill2
CURE STANDBY POSITION Z Collect Result_Fill2
CURE START POSITION X Collect Result_Fill2
CURE START POSITION Z Collect Result_Fill2
CURE_DISTANCE_Fill2
CURE_Time_Fill2


In [134]:
# Target ratio and count for each distinct CURE start Z position
value_counts_ratio_count(train_data, 'CURE START POSITION Z Collect Result_Fill2', 'target')


CURE START POSITION Z Collect Result_Fill2별 target 비율 및 갯수

          AbNormal    Normal  AbNormal  Normal  Total
variable                                             
22        0.072495  0.927505        34     435    469
23        0.107843  0.892157        22     182    204
32        0.085866  0.914134       421    4482   4903
33        0.053622  0.946378      1873   33057  34930


In [135]:
# Z offset between the CURE start position and the CURE standby position (train set only)
train_data['CURE_STANDBY_DISTANCE_Fill2']  = train_data['CURE START POSITION Z Collect Result_Fill2'] - train_data['CURE STANDBY POSITION Z Collect Result_Fill2']

In [136]:
# Target ratio and count for each distinct start-to-standby Z offset
value_counts_ratio_count(train_data, 'CURE_STANDBY_DISTANCE_Fill2', 'target')


CURE_STANDBY_DISTANCE_Fill2별 target 비율 및 갯수

          AbNormal    Normal  AbNormal  Normal  Total
variable                                             
0         0.058016  0.941984      2350   38156  40506


In [137]:
# AbNormal count and ratio for each (start Z, standby Z) combination
summary_df = summarize_grouped_data(train_data, ['CURE START POSITION Z Collect Result_Fill2', 'CURE STANDBY POSITION Z Collect Result_Fill2'])

Grouped by: CURE START POSITION Z Collect Result_Fill2, CURE STANDBY POSITION Z Collect Result_Fill2

      group  'AdNormal' count     ratio  Total
0  (22, 22)                34  0.072495    469
1  (23, 23)                22  0.107843    204
2  (32, 32)               421  0.085866   4903
3  (33, 33)              1873  0.053622  34930


대기지점과 시작지점의 z 좌표가 동일 -> 동일 위치에서 시작한다고 볼 수 있다 (x, y 좌표는 고려하지 않음)

좌표 관련 변수로는 대기지점 z 좌표와, 시작지점 (x,z)과 종료지점 (x,z) 사이의 거리값, 이렇게 2개의 변수만 가져간다.

In [138]:
# Columns to remove from both frames
columns_to_drop = [
    'CURE END POSITION X Collect Result_Fill2',
    'CURE END POSITION Z Collect Result_Fill2',
    'CURE START POSITION X Collect Result_Fill2',
    'CURE START POSITION Z Collect Result_Fill2',
    'CURE_STANDBY_DISTANCE_Fill2',
    'CURE SPEED Collect Result_Fill2',
]

# errors='ignore' skips any label that a frame does not contain
train_data = train_data.drop(columns=columns_to_drop, errors='ignore')
test_data = test_data.drop(columns=columns_to_drop, errors='ignore')

In [139]:
# Give the CURE standby Z position column a shorter name, in place, on both frames
standby_rename = {'CURE STANDBY POSITION Z Collect Result_Fill2': 'CURE_STANDBY_POSITION_Fill2'}
for frame in (train_data, test_data):
    frame.rename(columns=standby_rename, inplace=True)

In [140]:
# Select the columns whose name contains both '_Fill2' and 'CURE'
Process_Desc_col = train_data.filter(like='_Fill2').columns
filtered_columns = [col for col in Process_Desc_col if 'CURE' in col]


print("\n Fill2 공정 관련 변수 중 CURE 포함 변수>")
for col in filtered_columns:
    print(col)


 Fill2 공정 관련 변수 중 CURE 포함 변수>
CURE_STANDBY_POSITION_Fill2
CURE_DISTANCE_Fill2
CURE_Time_Fill2


In [141]:
# Select the columns whose name contains '_Fill2'
Process_Desc_col = train_data.filter(like='_Fill2').columns

# Print the selected column names
print("<Fill2 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<Fill2 공정 관련 변수>
CURE_STANDBY_POSITION_Fill2
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2
HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2
HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2
HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2
HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2
HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2
HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2
HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2
HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2
Head Purge Position Z Collect Result_Fill2
Machine Tact time Collect Result_Fill2
PalletID Collect Result_Fill2
Production Qty Collect Result_Fill2
Receip No Collect Result_Fill2
WorkMode Collect Result_Fill2
CURE_DISTANCE_Fill2
CURE_Time_Fill2


In [142]:
# AbNormal count and ratio for each Stage1 head (X, Y, Z) coordinate combination
summary_df = summarize_grouped_data(train_data, ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'
                                                 , 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2'
                                                 , 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2'])

Grouped by: HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2, HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2, HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2

                    group  'AdNormal' count     ratio  Total
0  (304.8, 1324.2, 243.5)                44  0.049107    896
1  (305.0, 1324.2, 243.5)               825  0.056569  14584
2   (835.5, 428.0, 243.7)              1481  0.059178  25026


In [143]:
# AbNormal count and ratio for each Stage2 head (X, Y, Z) coordinate combination
summary_df = summarize_grouped_data(train_data, ['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2'
                                                 , 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2'
                                                 , 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2'])

Grouped by: HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2, HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2, HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2

                    group  'AdNormal' count     ratio  Total
0   (458.0, 427.9, 243.7)              1481  0.059178  25026
1  (499.8, 1324.2, 243.5)               869  0.056137  15480


In [144]:
# AbNormal count and ratio for each Stage3 head (X, Y, Z) coordinate combination
summary_df = summarize_grouped_data(train_data, ['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2'
                                                 , 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2'
                                                 , 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2'])

Grouped by: HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2, HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2, HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2

                    group  'AdNormal' count     ratio  Total
0   (156.0, 428.0, 243.7)              1481  0.059178  25026
1  (692.8, 1324.2, 243.5)                44  0.049107    896
2  (694.0, 1324.2, 243.5)               825  0.056569  14584


In [145]:
import numpy as np

# (X, Y, Z) coordinate columns of the head position at each stage
stage1_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
               'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2',
               'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2']

stage2_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
               'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2',
               'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2']

stage3_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
               'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2',
               'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2']


def _stage_distance(frame, from_cols, to_cols):
    """Return the Euclidean distance between two head positions given as (X, Y, Z) column triples."""
    return np.sqrt(
        (frame[to_cols[0]] - frame[from_cols[0]]) ** 2 +
        (frame[to_cols[1]] - frame[from_cols[1]]) ** 2 +
        (frame[to_cols[2]] - frame[from_cols[2]]) ** 2
    )


# New column name -> (from stage, to stage); dict order is the column creation order
stage_pairs = {
    'DISTANCE_STAGE1_STAGE2': (stage1_cols, stage2_cols),
    'DISTANCE_STAGE2_STAGE3': (stage2_cols, stage3_cols),
    'DISTANCE_STAGE1_STAGE3': (stage1_cols, stage3_cols),
}

# The raw coordinate columns are dropped from test_data further down as well,
# so the distances are derived for test_data here too. The guard keeps the
# cell working if the test file lacks these columns.
head_coord_cols = stage1_cols + stage2_cols + stage3_cols
frames = [train_data]
if all(col in test_data.columns for col in head_coord_cols):
    frames.append(test_data)

for frame in frames:
    for new_col, (from_cols, to_cols) in stage_pairs.items():
        frame[new_col] = _stage_distance(frame, from_cols, to_cols)

In [146]:
# Print the target breakdown for each of the three stage-to-stage distances
for distance_col in ('DISTANCE_STAGE1_STAGE2', 'DISTANCE_STAGE2_STAGE3', 'DISTANCE_STAGE1_STAGE3'):
    value_counts_ratio_count(train_data, distance_col, 'target')


DISTANCE_STAGE1_STAGE2별 target 비율 및 갯수

            AbNormal    Normal  AbNormal  Normal  Total
variable                                               
194.800000  0.056569  0.943431       825   13759  14584
195.000000  0.049107  0.950893        44     852    896
377.500013  0.059178  0.940822      1481   23545  25026

DISTANCE_STAGE2_STAGE3별 target 비율 및 갯수

            AbNormal    Normal  AbNormal  Normal  Total
variable                                               
193.000000  0.049107  0.950893        44     852    896
194.200000  0.056569  0.943431       825   13759  14584
302.000017  0.059178  0.940822      1481   23545  25026

DISTANCE_STAGE1_STAGE3별 target 비율 및 갯수

          AbNormal    Normal  AbNormal  Normal  Total
variable                                             
388.0     0.049107  0.950893        44     852    896
389.0     0.056569  0.943431       825   13759  14584
679.5     0.059178  0.940822      1481   23545  25026


In [147]:
# AbNormal count and ratio for each combination of the three stage-to-stage distances
summary_df = summarize_grouped_data(train_data, ['DISTANCE_STAGE1_STAGE2', 'DISTANCE_STAGE2_STAGE3', 'DISTANCE_STAGE1_STAGE3'])

Grouped by: DISTANCE_STAGE1_STAGE2, DISTANCE_STAGE2_STAGE3, DISTANCE_STAGE1_STAGE3

                                             group  'AdNormal' count  \
0                            (194.8, 194.2, 389.0)               825   
1  (195.0, 192.99999999999994, 387.99999999999994)                44   
2    (377.5000132450329, 302.0000165562909, 679.5)              1481   

      ratio  Total  
0  0.056569  14584  
1  0.049107    896  
2  0.059178  25026  


In [148]:
# Target ratio and count for each distinct Stage1 X-axis judge value
value_counts_ratio_count(train_data, 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2', 'target')


HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2별 target 비율 및 갯수

          AbNormal    Normal  AbNormal  Normal  Total
variable                                             
OK        0.043921  0.956079       496   10797  11293


In [149]:
# Encode the judge value as a flag: 1 when it is 'OK', 0 otherwise (including missing values)
judge_col = 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2'
for frame in (train_data, test_data):
    frame['HEAD_NORMAL_stage1_ok_Fill2'] = frame[judge_col].eq('OK').astype('int64')

# Remove the original Judge Value_Fill2 column now that the flag replaces it
for frame in (train_data, test_data):
    frame.drop(columns=[judge_col], inplace=True)

# Check the result
value_counts_ratio_count(train_data, 'HEAD_NORMAL_stage1_ok_Fill2', 'target')


HEAD_NORMAL_stage1_ok_Fill2별 target 비율 및 갯수

          AbNormal    Normal  AbNormal  Normal  Total
variable                                             
0         0.063465  0.936535      1854   27359  29213
1         0.043921  0.956079       496   10797  11293


In [150]:
# AbNormal count and ratio for each (OK flag, stage distances) combination
summary_df = summarize_grouped_data(train_data, ['HEAD_NORMAL_stage1_ok_Fill2','DISTANCE_STAGE1_STAGE2', 'DISTANCE_STAGE2_STAGE3', 'DISTANCE_STAGE1_STAGE3'])

Grouped by: HEAD_NORMAL_stage1_ok_Fill2, DISTANCE_STAGE1_STAGE2, DISTANCE_STAGE2_STAGE3, DISTANCE_STAGE1_STAGE3

                                               group  'AdNormal' count  \
0                           (0, 194.8, 194.2, 389.0)               570   
1  (0, 195.0, 192.99999999999994, 387.99999999999...                44   
2   (0, 377.5000132450329, 302.0000165562909, 679.5)              1240   
3                           (1, 194.8, 194.2, 389.0)               255   
4   (1, 377.5000132450329, 302.0000165562909, 679.5)               241   

      ratio  Total  
0  0.063730   8944  
1  0.049107    896  
2  0.064007  19373  
3  0.045213   5640  
4  0.042632   5653  


In [151]:
# Raw head coordinate columns to remove from both frames
columns_to_drop = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2',
]

# errors='ignore' skips any label that a frame does not contain
train_data = train_data.drop(columns=columns_to_drop, errors='ignore')
test_data = test_data.drop(columns=columns_to_drop, errors='ignore')

In [152]:
# Select the columns whose name contains '_Fill2'
Process_Desc_col = train_data.filter(like='_Fill2').columns

# Print the selected column names
print("<Fill2 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<Fill2 공정 관련 변수>
CURE_STANDBY_POSITION_Fill2
Head Purge Position Z Collect Result_Fill2
Machine Tact time Collect Result_Fill2
PalletID Collect Result_Fill2
Production Qty Collect Result_Fill2
Receip No Collect Result_Fill2
WorkMode Collect Result_Fill2
CURE_DISTANCE_Fill2
CURE_Time_Fill2
HEAD_NORMAL_stage1_ok_Fill2


In [153]:
# Target ratio and count for each distinct head purge Z position
value_counts_ratio_count(train_data, 'Head Purge Position Z Collect Result_Fill2', 'target')


Head Purge Position Z Collect Result_Fill2별 target 비율 및 갯수

          AbNormal    Normal  AbNormal  Normal  Total
variable                                             
85.000    0.058709  0.941291      2083   33397  35480
114.612   0.053124  0.946876       267    4759   5026


In [154]:
# AbNormal count and ratio for each (OK flag, head purge Z position) combination
summary_df = summarize_grouped_data(train_data, ['HEAD_NORMAL_stage1_ok_Fill2', 'Head Purge Position Z Collect Result_Fill2'])

Grouped by: HEAD_NORMAL_stage1_ok_Fill2, Head Purge Position Z Collect Result_Fill2

          group  'AdNormal' count     ratio  Total
0     (0, 85.0)              1598  0.064933  24610
1  (0, 114.612)               256  0.055616   4603
2     (1, 85.0)               485  0.044618  10870
3  (1, 114.612)                11  0.026005    423


In [155]:
# AbNormal count and ratio for each (Dispenser_num, head purge Z position) combination
summary_df = summarize_grouped_data(train_data, ['Dispenser_num', 'Head Purge Position Z Collect Result_Fill2'])

Grouped by: Dispenser_num, Head Purge Position Z Collect Result_Fill2

          group  'AdNormal' count     ratio  Total
0     (0, 85.0)                27  1.000000     27
1  (0, 114.612)                 7  1.000000      7
2     (1, 85.0)              1348  0.060205  22390
3  (1, 114.612)               118  0.045021   2621
4     (2, 85.0)               708  0.054199  13063
5  (2, 114.612)               142  0.059216   2398


In [156]:
# Select the columns whose name contains '_Fill2'
Process_Desc_col = train_data.filter(like='_Fill2').columns

# Print the selected column names
print("<Fill2 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<Fill2 공정 관련 변수>
CURE_STANDBY_POSITION_Fill2
Head Purge Position Z Collect Result_Fill2
Machine Tact time Collect Result_Fill2
PalletID Collect Result_Fill2
Production Qty Collect Result_Fill2
Receip No Collect Result_Fill2
WorkMode Collect Result_Fill2
CURE_DISTANCE_Fill2
CURE_Time_Fill2
HEAD_NORMAL_stage1_ok_Fill2


In [157]:
# Target ratio and count for each distinct Fill2 WorkMode value
value_counts_ratio_count(train_data, 'WorkMode Collect Result_Fill2', 'target')


WorkMode Collect Result_Fill2별 target 비율 및 갯수

          AbNormal    Normal  AbNormal  Normal  Total
variable                                             
0.0       0.073326  0.926674      1206   15241  16447


In [159]:
# AbNormal count and ratio for each (Fill1 WorkMode, Fill2 WorkMode) combination
summary_df = summarize_grouped_data(train_data, ['WorkMode Collect Result_Fill1', 'WorkMode Collect Result_Fill2'])

Grouped by: WorkMode Collect Result_Fill1, WorkMode Collect Result_Fill2

        group  'AdNormal' count     ratio  Total
0  (7.0, 0.0)              1206  0.073326  16447


In [160]:
# AbNormal count and ratio for each (Dam, Fill1, Fill2) WorkMode combination
summary_df = summarize_grouped_data(train_data, ['WorkMode Collect Result_Dam', 'WorkMode Collect Result_Fill1', 'WorkMode Collect Result_Fill2'])

Grouped by: WorkMode Collect Result_Dam, WorkMode Collect Result_Fill1, WorkMode Collect Result_Fill2

             group  'AdNormal' count     ratio  Total
0  (7.0, 7.0, 0.0)              1206  0.073326  16447


---