# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [386]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 데이터 읽어오기


In [387]:
# Seed constant (not referenced in the cells visible in this part of the notebook).
RANDOM_STATE = 110

# Load the EDA-stage train and test tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("train_data_forEDA.csv", "test_data_forEDA.csv")
)

---

반복적으로 사용하는 도구를 함수로 정의

In [388]:
def plot_box(dataframe, column_name):
    """
    Draw a horizontal box plot for a single column of a DataFrame.

    Parameters:
    dataframe (pd.DataFrame): table that holds the data
    column_name (str): column to plot; also used as the x label and in the title
    """
    values = dataframe[column_name]

    plt.figure(figsize=(10, 6))
    plt.boxplot(values, vert=False)
    plt.title(f'Box Plot of {column_name}')
    plt.xlabel(column_name)
    plt.show()

In [389]:
import pandas as pd

def value_counts_ratio_count(df, col_name, target_name):
    """
    Print, for every distinct value of a column, the share and the count of
    each target class together with the total number of rows.

    Parameters:
    df (pd.DataFrame): table to summarise
    col_name (str): column whose distinct values form the rows of the report
    target_name (str): name of the target column
    """
    target_by_value = df.groupby(col_name)[target_name]

    # Per-value class proportions and raw class counts (missing classes -> 0).
    ratio_table = target_by_value.value_counts(normalize=True).unstack().fillna(0)
    count_table = target_by_value.value_counts().unstack().fillna(0)

    # Number of rows per distinct value.
    totals = df[col_name].value_counts().rename('Total_Count')

    # Proportions, counts and totals side by side.
    combined = ratio_table.join(count_table, lsuffix='_ratio', rsuffix='_count')
    combined = combined.join(totals, on=col_name)
    combined.index.name = 'variable'

    def strip_suffix(label):
        # Keep only the text before the first underscore for display, so
        # 'X_ratio' and 'X_count' are both shown as 'X'.
        return label.split('_')[0]

    print(f"\n{col_name}별 {target_name} 비율 및 갯수\n")
    print(combined.rename(columns=strip_suffix))

In [390]:
import pandas as pd

def summarize_grouped_data(df, group_by_columns, target_name='target', target_value='AbNormal'):
    """
    Summarise how often the target equals a given value within each group.

    The table is printed and also returned.

    Parameters:
    df (pd.DataFrame): table to summarise
    group_by_columns (str or list): column name(s) passed to ``df.groupby``
    target_name (str): name of the target column (default 'target')
    target_value: target value to count in each group (default 'AbNormal')

    Returns:
    pd.DataFrame: one row per group with the columns
        'group'            - the group key as yielded by ``groupby``
        "'AdNormal' count" - rows in the group whose target equals target_value
                             (label kept as-is so existing lookups keep working)
        'ratio'            - that count divided by the group size
        'Total'            - number of rows in the group
    """
    results = []

    for name, group in df.groupby(group_by_columns):
        group_count = group.shape[0]

        # Rows with a missing target compare unequal and are therefore not counted.
        match_count = (group[target_name] == target_value).sum()

        results.append([name, match_count, match_count / group_count, group_count])

    results_df = pd.DataFrame(results, columns=['group', "'AdNormal' count", 'ratio', 'Total'])

    # A bare string key must not be joined character by character, and
    # non-string labels must be converted before joining.
    if isinstance(group_by_columns, str):
        key_labels = [group_by_columns]
    else:
        key_labels = [str(label) for label in group_by_columns]

    print(f"Grouped by: {', '.join(key_labels)}")
    print()
    print(results_df)

    return results_df

In [391]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_abnormal_ratio(dataframe, column_name, target_name, target_value, bins=20):
    """
    Bin a numeric column and plot, per bin, the share of rows whose target
    equals ``target_value``. Each bar is labelled "<ratio> (<row count>)".

    The input DataFrame is not modified.

    Parameters:
    dataframe (pd.DataFrame): table to analyse
    column_name (str): numeric column to bin
    target_name (str): name of the target column
    target_value (str): target value whose share is plotted
    bins (int): number of bins passed to ``pd.cut`` (default 20)
    """
    bin_col = f'{column_name}_bins'

    # Bin into a standalone Series instead of a temporary column on the
    # caller's DataFrame, so nothing has to be cleaned up afterwards.
    binned = pd.cut(dataframe[column_name], bins=bins)
    is_target = dataframe[target_name] == target_value

    # observed=False keeps empty bins, so row i of `ratios` is the i-th bin.
    ratios = pd.DataFrame({
        'ratio': is_target.groupby(binned, observed=False).mean(),
        'count': dataframe[target_name].groupby(binned, observed=False).count(),
    }).rename_axis(bin_col).reset_index()

    plt.figure(figsize=(20, 10))
    barplot = sns.barplot(x=bin_col, y='ratio', data=ratios, color='skyblue')
    plt.xlabel(f'{column_name} (binned)')
    plt.ylabel(f'{target_value} Ratio')
    plt.title(f'{target_value} Ratio by {column_name} (binned)', pad=30)  # pad separates title from plot
    plt.xticks(rotation=45)
    plt.ylim(0, 1)

    # Label each bar from its own row. Bars are drawn at positions 0..n-1 in
    # bin order; looking the bin up from the patch centre minus one attached
    # the previous bin's count to every bar.
    for position, (ratio, count) in enumerate(zip(ratios['ratio'], ratios['count'])):
        if pd.isna(ratio):
            continue  # empty bin: there is no bar to label
        barplot.annotate(f'{ratio:.2f} ({count})',
                         (position, ratio),
                         ha='center', va='center',
                         xytext=(0, 9),
                         textcoords='offset points')

    plt.show()


---

## Dam

In [392]:
# Find the columns whose name contains '?' in each table.
train_Process_Desc_col = train_data.filter(like='?').columns
test_Process_Desc_col = test_data.filter(like='?').columns

# Show what was found.
for header, names in (
    ("<? column in train_data>", train_Process_Desc_col),
    ("<? column in test_data>", test_Process_Desc_col),
):
    print(header)
    for col in names:
        print(col)

# Old name -> new name, with every '?' replaced by 'Θ'.
train_new_columns = dict(zip(
    train_Process_Desc_col,
    map(lambda name: name.replace('?', 'Θ'), train_Process_Desc_col),
))
test_new_columns = dict(zip(
    test_Process_Desc_col,
    map(lambda name: name.replace('?', 'Θ'), test_Process_Desc_col),
))

# Apply the renames in place.
train_data.rename(columns=train_new_columns, inplace=True)
test_data.rename(columns=test_new_columns, inplace=True)

# Re-select by 'Θ' to confirm the result in both tables.
train_Process_Desc_col = train_data.filter(like='Θ').columns
test_Process_Desc_col = test_data.filter(like='Θ').columns

print("<Θ in train_data>")
for header, names in (
    ("train_data:", train_Process_Desc_col),
    ("test_data:", test_Process_Desc_col),
):
    print(header)
    for col in names:
        print(col)

<? column in train_data>
CURE END POSITION ? Collect Result_Dam
CURE START POSITION ? Collect Result_Dam
<? column in test_data>
<Θ in train_data>
train_data:
CURE END POSITION Θ Collect Result_Dam
CURE START POSITION Θ Collect Result_Dam
test_data:
CURE END POSITION Θ Collect Result_Dam
CURE START POSITION Θ Collect Result_Dam


In [393]:
# Select every column whose name contains '_Dam' (Dam-process variables).
Process_Desc_col = train_data.filter(like='_Dam').columns

# Print the selected column names, one per line.
print("<Dam 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<Dam 공정 관련 변수>
CURE END POSITION X Collect Result_Dam
CURE END POSITION Z Collect Result_Dam
CURE END POSITION Θ Collect Result_Dam
CURE SPEED Collect Result_Dam
CURE START POSITION X Collect Result_Dam
CURE START POSITION Θ Collect Result_Dam
DISCHARGED SPEED OF RESIN Collect Result_Dam
DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam
DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam
DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam
Dispense Volume(Stage1) Collect Result_Dam
Dispense Volume(Stage2) Collect Result_Dam
Dispense Volume(Stage3) Collect Result_Dam
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam
HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam
HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam
HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam
HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam
HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam
HEAD NORMAL COORDINATE Z AXIS(

In [394]:
# Abnormal count/ratio for each (dispenser, cure start X, cure end X) combination.
summary_df = summarize_grouped_data(train_data, ['Dispenser_num','CURE START POSITION X Collect Result_Dam', 'CURE END POSITION X Collect Result_Dam'])

Grouped by: Dispenser_num, CURE START POSITION X Collect Result_Dam, CURE END POSITION X Collect Result_Dam

            group  'AdNormal' count     ratio  Total
0  (0, 280, 1000)                15  1.000000     15
1  (0, 1030, 240)                19  1.000000     19
2  (1, 1030, 240)              1466  0.058614  25011
3  (2, 280, 1000)               850  0.054977  15461


'CURE START POSITION Z Collect Result_Dam'은 33.5 값 하나만 존재하므로,  
이전 단계에서 고윳값이 1개인 변수를 제거할 때 함께 제거됨 (아래 거리 계산에서는 상수 33.5를 사용)

In [395]:
# Dam-process columns ('_Dam' in the name) that also mention 'CURE'.
Process_Desc_col = train_data.filter(like='_Dam').columns
filtered_columns = list(filter(lambda name: 'CURE' in name, Process_Desc_col))

print("\n Dam 공정 관련 변수 중 CURE 포함 변수>")
for col in filtered_columns:
    print(col)


 Dam 공정 관련 변수 중 CURE 포함 변수>
CURE END POSITION X Collect Result_Dam
CURE END POSITION Z Collect Result_Dam
CURE END POSITION Θ Collect Result_Dam
CURE SPEED Collect Result_Dam
CURE START POSITION X Collect Result_Dam
CURE START POSITION Θ Collect Result_Dam


In [396]:
import numpy as np

# Column names for the cure start/end positions.
start_x_col = 'CURE START POSITION X Collect Result_Dam'
# Not a column name: the start Z value is taken as the constant 33.5
# (the notebook note above says that column held only this value and was removed).
start_z_col = 33.5
end_x_col = 'CURE END POSITION X Collect Result_Dam'
end_z_col = 'CURE END POSITION Z Collect Result_Dam'

# Straight-line distance between the cure start and end positions in the X-Z
# plane. Each table is computed from its OWN columns; the test feature was
# previously derived from train_data rows.
for dataset in (train_data, test_data):
    dataset['CURE_DISTANCE_Dam'] = np.sqrt(
        (dataset[end_x_col] - dataset[start_x_col]) ** 2 +
        (dataset[end_z_col] - start_z_col) ** 2
    )

# Inspect the target distribution per distinct distance.
value_counts_ratio_count(train_data, 'CURE_DISTANCE_Dam', 'target')


CURE_DISTANCE_Dam별 target 비율 및 갯수

            AbNormal    Normal  AbNormal  Normal  Total
variable                                               
720.306185  0.055893  0.944107       865   14611  15476
790.607994  0.059329  0.940671      1485   23545  25030


Dispenser 종류에 따라 좌표위치가 동일하게 따라감

In [397]:
# Abnormal count/ratio for each (dispenser, cure start Θ, cure end Θ) combination.
summary_df = summarize_grouped_data(train_data, ['Dispenser_num','CURE START POSITION Θ Collect Result_Dam', 'CURE END POSITION Θ Collect Result_Dam'])

Grouped by: Dispenser_num, CURE START POSITION Θ Collect Result_Dam, CURE END POSITION Θ Collect Result_Dam

           group  'AdNormal' count     ratio  Total
0  (0, -90, -90)                19  1.000000     19
1    (0, 90, 90)                15  1.000000     15
2  (1, -90, -90)              1466  0.058614  25011
3    (2, 90, 90)               850  0.054977  15461


Θ 각도도 마찬가지로 Dispenser 종류를 따라감

In [398]:
# Re-list the Dam-process CURE columns; CURE_DISTANCE_Dam should now be included.
Process_Desc_col = train_data.filter(like='_Dam').columns
filtered_columns = list(filter(lambda name: 'CURE' in name, Process_Desc_col))

print("\n Dam 공정 관련 변수 중 CURE 포함 변수>")
for col in filtered_columns:
    print(col)


 Dam 공정 관련 변수 중 CURE 포함 변수>
CURE END POSITION X Collect Result_Dam
CURE END POSITION Z Collect Result_Dam
CURE END POSITION Θ Collect Result_Dam
CURE SPEED Collect Result_Dam
CURE START POSITION X Collect Result_Dam
CURE START POSITION Θ Collect Result_Dam
CURE_DISTANCE_Dam


In [399]:
# Abnormal count/ratio for each (dispenser, cure speed) combination.
summary_df = summarize_grouped_data(train_data, ['Dispenser_num','CURE SPEED Collect Result_Dam'])

Grouped by: Dispenser_num, CURE SPEED Collect Result_Dam

       group  'AdNormal' count     ratio  Total
0    (0, 70)                27  1.000000     27
1    (0, 85)                 4  1.000000      4
2   (0, 100)                 2  1.000000      2
3   (0, 105)                 1  1.000000      1
4    (1, 70)              1328  0.060943  21791
5    (1, 85)                51  0.042642   1196
6    (1, 95)                 1  0.012048     83
7   (1, 100)                52  0.049430   1052
8   (1, 105)                34  0.038245    889
9    (2, 70)               697  0.055269  12611
10   (2, 85)                72  0.071076   1013
11   (2, 95)                 3  0.039474     76
12  (2, 100)                39  0.038576   1011
13  (2, 105)                39  0.052000    750


In [400]:
# Target class ratios and counts for each distinct cure speed.
value_counts_ratio_count(train_data, 'CURE SPEED Collect Result_Dam', 'target')


CURE SPEED Collect Result_Dam별 target 비율 및 갯수

          AbNormal    Normal  AbNormal  Normal  Total
variable                                             
70        0.059601  0.940399      2052   32377  34429
85        0.057388  0.942612       127    2086   2213
95        0.025157  0.974843         4     155    159
100       0.045036  0.954964        93    1972   2065
105       0.045122  0.954878        74    1566   1640


In [401]:
# List the Dam-process CURE columns once more before dropping the raw coordinates.
Process_Desc_col = train_data.filter(like='_Dam').columns
filtered_columns = list(filter(lambda name: 'CURE' in name, Process_Desc_col))

print("\n Dam 공정 관련 변수 중 CURE 포함 변수>")
for col in filtered_columns:
    print(col)


 Dam 공정 관련 변수 중 CURE 포함 변수>
CURE END POSITION X Collect Result_Dam
CURE END POSITION Z Collect Result_Dam
CURE END POSITION Θ Collect Result_Dam
CURE SPEED Collect Result_Dam
CURE START POSITION X Collect Result_Dam
CURE START POSITION Θ Collect Result_Dam
CURE_DISTANCE_Dam


In [402]:
# Remove the raw cure start/end position columns from both tables
# (train first, then test), modifying each table in place.
for dataset in (train_data, test_data):
    dataset.drop(
        columns=[
            'CURE END POSITION X Collect Result_Dam',
            'CURE END POSITION Z Collect Result_Dam',
            'CURE END POSITION Θ Collect Result_Dam',
            'CURE START POSITION X Collect Result_Dam',
            'CURE START POSITION Θ Collect Result_Dam',
        ],
        inplace=True,
    )

In [403]:
# Confirm which Dam-process CURE columns remain after the drop.
Process_Desc_col = train_data.filter(like='_Dam').columns
filtered_columns = list(filter(lambda name: 'CURE' in name, Process_Desc_col))

print("\n Dam 공정 관련 변수 중 CURE 포함 변수>")
for col in filtered_columns:
    print(col)


 Dam 공정 관련 변수 중 CURE 포함 변수>
CURE SPEED Collect Result_Dam
CURE_DISTANCE_Dam


In [404]:
# Cure time = cure distance / cure speed, computed per table from its own columns.
for dataset in (train_data, test_data):
    dataset['CURE_Time_Dam'] = (
        dataset['CURE_DISTANCE_Dam'] / dataset['CURE SPEED Collect Result_Dam']
    )

In [405]:
# Target class ratios and counts for each distinct cure time.
value_counts_ratio_count(train_data, 'CURE_Time_Dam', 'target')


CURE_Time_Dam별 target 비율 및 갯수

           AbNormal    Normal  AbNormal  Normal  Total
variable                                              
6.860059   0.053262  0.946738        40     711    751
7.203062   0.039526  0.960474        40     972   1012
7.529600   0.038245  0.961755        34     855    889
7.582170   0.039474  0.960526         3      73     76
7.906080   0.050332  0.949668        53    1000   1053
8.322189   0.012048  0.987952         1      82     83
8.474190   0.073819  0.926181        75     941   1016
9.301271   0.043442  0.956558        52    1145   1197
10.290088  0.056018  0.943982       707   11914  12621
11.294400  0.061675  0.938325      1345   20463  21808


In [406]:
# Abnormal count/ratio for each (dispenser, cure time) combination.
summary_df = summarize_grouped_data(train_data, ['Dispenser_num','CURE_Time_Dam'])

Grouped by: Dispenser_num, CURE_Time_Dam

                      group  'AdNormal' count     ratio  Total
0    (0, 6.860058903775193)                 1  1.000000      1
1    (0, 7.203061848963952)                 1  1.000000      1
2    (0, 7.906079938882479)                 1  1.000000      1
3    (0, 8.474190410545827)                 3  1.000000      3
4    (0, 9.301270516332329)                 1  1.000000      1
5   (0, 10.290088355662789)                10  1.000000     10
6   (0, 11.294399912689256)                17  1.000000     17
7    (1, 7.529599941792838)                34  0.038245    889
8    (1, 7.906079938882479)                52  0.049430   1052
9    (1, 8.322189409349978)                 1  0.012048     83
10   (1, 9.301270516332329)                51  0.042642   1196
11  (1, 11.294399912689256)              1328  0.060943  21791
12   (2, 6.860058903775193)                39  0.052000    750
13   (2, 7.203061848963952)                39  0.038576   1011
14   (2, 7.58

In [407]:
# Re-list the Dam-process CURE columns; CURE_Time_Dam should now be included.
Process_Desc_col = train_data.filter(like='_Dam').columns
filtered_columns = list(filter(lambda name: 'CURE' in name, Process_Desc_col))

print("\n Dam 공정 관련 변수 중 CURE 포함 변수>")
for col in filtered_columns:
    print(col)


 Dam 공정 관련 변수 중 CURE 포함 변수>
CURE SPEED Collect Result_Dam
CURE_DISTANCE_Dam
CURE_Time_Dam


In [408]:
# Remove the distance and speed columns that CURE_Time_Dam was derived from,
# in place, from the train table and then the test table.
for dataset in (train_data, test_data):
    dataset.drop(
        columns=[
            'CURE_DISTANCE_Dam',
            'CURE SPEED Collect Result_Dam',
        ],
        inplace=True,
    )

In [409]:
# Confirm which Dam-process CURE columns remain after the second drop.
Process_Desc_col = train_data.filter(like='_Dam').columns
filtered_columns = list(filter(lambda name: 'CURE' in name, Process_Desc_col))

print("\n Dam 공정 관련 변수 중 CURE 포함 변수>")
for col in filtered_columns:
    print(col)


 Dam 공정 관련 변수 중 CURE 포함 변수>
CURE_Time_Dam


In [410]:
# Dam-process columns ('_Dam' in the name) that also mention 'HEAD NORMAL'.
Process_Desc_col = train_data.filter(like='_Dam').columns
filtered_columns = [col for col in Process_Desc_col if 'HEAD NORMAL' in col]

# The heading names the filter actually applied; it used to say CURE,
# carried over from the CURE cells.
print("\n Dam 공정 관련 변수 중 HEAD NORMAL 포함 변수>")
for col in filtered_columns:
    print(col)


 Dam 공정 관련 변수 중 CURE 포함 변수>
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam
HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam
HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam
HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam
HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam
HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam
HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam
HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam
HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam


In [411]:
# Abnormal count/ratio for each distinct Stage1 head position (X, Y, Z).
summary_df = summarize_grouped_data(train_data, ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'
                                                 , 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'
                                                 , 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam'])

Grouped by: HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam, HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam, HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam

                       group  'AdNormal' count     ratio  Total
0     (161.2, 1271.8, 281.3)                 2  0.036364     55
1   (161.2, 1271.8, 281.324)                 8  0.023952    334
2   (161.2, 1271.8, 281.424)                 0  0.000000     29
3    (161.2, 1271.8, 281.43)                15  0.036765    408
4   (161.7, 1271.8, 280.894)                19  0.038855    489
5    (162.4, 1269.0, 282.15)               157  0.088351   1777
6     (162.4, 1269.0, 284.6)                 0  0.000000      1
7     (162.4, 1269.0, 284.8)                 0  0.000000      1
8    (162.4, 1271.3, 282.15)                53  0.050189   1056
9     (162.4, 1271.8, 274.0)                21  0.055263    380
10    (162.4, 1271.8, 274.2)                32  0.041078    779
11   (162.4, 1271.8, 274.33)                22

In [412]:
# Abnormal count/ratio for each distinct Stage2 head position (X, Y, Z).
summary_df = summarize_grouped_data(train_data, ['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'
                                                 , 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'
                                                 , 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'])

Grouped by: HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam, HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam, HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam

                       group  'AdNormal' count     ratio  Total
0      (462.2, 377.0, 284.8)                 2  0.047619     42
1    (462.2, 377.1, 281.413)                 8  0.020050    399
2    (462.2, 377.1, 281.513)                 1  0.027778     36
3    (462.2, 377.1, 281.517)                12  0.029412    408
4      (462.5, 377.0, 284.8)                66  0.050267   1313
5    (462.7, 377.1, 281.095)                15  0.033632    446
6     (462.75, 377.0, 284.8)                30  0.069124    434
7      (463.0, 377.0, 284.6)                71  0.089646    792
8      (463.0, 377.0, 284.8)               593  0.099180   5979
9     (463.5, 377.3, 282.15)               126  0.052544   2398
10    (463.6, 377.1, 274.51)                 5  0.019531    256
11    (463.6, 377.1, 282.15)                47

In [413]:
# Abnormal count/ratio for each distinct Stage3 head position (X, Y, Z).
summary_df = summarize_grouped_data(train_data, ['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'
                                                 , 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam'
                                                 , 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam'])

Grouped by: HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam, HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam, HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam

                       group  'AdNormal' count     ratio  Total
0      (159.5, 377.5, 284.8)                68  0.050259   1353
1    (159.8, 377.6, 281.413)                 8  0.020050    399
2    (159.8, 377.6, 281.513)                 1  0.027778     36
3    (159.8, 377.6, 281.517)                12  0.029412    408
4      (160.0, 377.5, 284.6)                71  0.089646    792
5      (160.0, 377.5, 284.8)               143  0.119266   1199
6    (160.3, 377.6, 281.095)                15  0.033632    446
7     (160.5, 377.3, 282.15)               126  0.052544   2398
8      (160.5, 377.5, 284.8)               480  0.092025   5216
9     (160.8, 377.3, 282.15)                93  0.040789   2280
10    (160.8, 377.6, 282.15)                45  0.036261   1241
11    (161.2, 378.0, 274.51)                 5

In [414]:
import numpy as np

# Head-normal coordinate columns, ordered X, Y, Z, for each of the three stages.
stage1_cols = [f'HEAD NORMAL COORDINATE {axis} AXIS(Stage1) Collect Result_Dam' for axis in 'XYZ']
stage2_cols = [f'HEAD NORMAL COORDINATE {axis} AXIS(Stage2) Collect Result_Dam' for axis in 'XYZ']
stage3_cols = [f'HEAD NORMAL COORDINATE {axis} AXIS(Stage3) Collect Result_Dam' for axis in 'XYZ']


def calculate_distances(data):
    """
    Add the pairwise Euclidean distances between the three stage head positions.

    Writes the columns 'DISTANCE_STAGE1_STAGE2_Dam', 'DISTANCE_STAGE2_STAGE3_Dam'
    and 'DISTANCE_STAGE1_STAGE3_Dam' onto ``data`` itself and returns it.

    Parameters:
    data (pd.DataFrame): table containing the nine stage coordinate columns

    Returns:
    pd.DataFrame: the same object that was passed in, with the new columns
    """
    def euclidean(from_cols, to_cols):
        # Squared differences are accumulated in X, Y, Z order.
        squared = sum(
            (data[end] - data[start]) ** 2
            for start, end in zip(from_cols, to_cols)
        )
        return np.sqrt(squared)

    data['DISTANCE_STAGE1_STAGE2_Dam'] = euclidean(stage1_cols, stage2_cols)
    data['DISTANCE_STAGE2_STAGE3_Dam'] = euclidean(stage2_cols, stage3_cols)
    data['DISTANCE_STAGE1_STAGE3_Dam'] = euclidean(stage1_cols, stage3_cols)

    return data

# Add the three stage-distance columns to the training table.
train_data = calculate_distances(train_data)

# Add the same columns to the test table.
test_data = calculate_distances(test_data)

In [415]:
# Abnormal count/ratio for each distinct triple of stage-to-stage distances.
summary_df = summarize_grouped_data(train_data, ['DISTANCE_STAGE1_STAGE2_Dam'
                                                 , 'DISTANCE_STAGE2_STAGE3_Dam'
                                                 , 'DISTANCE_STAGE1_STAGE3_Dam'])

Grouped by: DISTANCE_STAGE1_STAGE2_Dam, DISTANCE_STAGE2_STAGE3_Dam, DISTANCE_STAGE1_STAGE3_Dam

                                                          group  \
0                 (85.50146197580483, 303.0004125409733, 388.5)   
1                (86.00145347608958, 302.50041322285824, 388.5)   
2                 (86.00145347608958, 303.0004125409733, 389.0)   
3    (86.20145010381205, 302.9013370719911, 389.10020560261853)   
4                 (86.20469824783332, 302.9013370719911, 389.1)   
5                 (86.2514492631863, 302.25041356464675, 388.5)   
6   (86.40144674714652, 302.70041294983395, 389.09999999999997)   
7    (86.40144674714652, 302.7013379554177, 389.10020560261853)   
8    (86.40144674714658, 302.7013379554177, 389.10020560261853)   
9                (86.49999999999994, 303.0, 389.49999999999994)   
10                                         (86.5, 303.0, 389.5)   
11    (86.50144507463439, 302.700412949834, 389.19999999999993)   
12               (86.501445074634

In [416]:
# Print target class ratios and counts per distinct Stage1-Stage2 distance.
# The calls for the other two distances are left disabled.
value_counts_ratio_count(train_data, 'DISTANCE_STAGE1_STAGE2_Dam', 'target')
# value_counts_ratio_count(train_data, 'DISTANCE_STAGE2_STAGE3_Dam', 'target')
# value_counts_ratio_count(train_data, 'DISTANCE_STAGE1_STAGE3_Dam', 'target')


DISTANCE_STAGE1_STAGE2_Dam별 target 비율 및 갯수

            AbNormal    Normal  AbNormal  Normal  Total
variable                                               
85.501462   0.153846  0.846154       4.0    22.0     26
86.001453   0.100146  0.899854     549.0  4933.0   5482
86.201450   0.044643  0.955357       5.0   107.0    112
86.204698   0.050584  0.949416      52.0   976.0   1028
86.251449   0.069124  0.930876      30.0   404.0    434
86.401447   0.052632  0.947368       3.0    54.0     57
86.401447   0.051173  0.948827      48.0   890.0    938
86.500000   0.042803  0.957197      91.0  2035.0   2126
86.500000   0.052544  0.947456     126.0  2272.0   2398
86.501445   0.028226  0.971774      14.0   482.0    496
86.501445   0.060259  0.939741     242.0  3774.0   4016
86.700231   0.012987  0.987013       2.0   152.0    154
86.701442   0.033888  0.966112      55.0  1568.0   1623
86.701442   0.046480  0.953520     169.0  3467.0   3636
86.701442   0.036831  0.963169      73.0  1909.0   1982
86.

In [417]:
# Abnormal count/ratio for each (dispenser, Stage1-Stage2 distance) combination.
summary_df = summarize_grouped_data(train_data, ['Dispenser_num', 'DISTANCE_STAGE1_STAGE2_Dam'])

Grouped by: Dispenser_num, DISTANCE_STAGE1_STAGE2_Dam

                      group  'AdNormal' count     ratio  Total
0    (0, 86.20469824783332)                 1  1.000000      1
1    (0, 86.40144674714658)                 1  1.000000      1
2    (0, 86.49999999999994)                 3  1.000000      3
3    (0, 86.50144507463445)                 4  1.000000      4
4    (0, 86.70144174118438)                 1  1.000000      1
5    (0, 86.70144174118442)                 5  1.000000      5
6    (0, 86.70144174118448)                 1  1.000000      1
7    (0, 87.10143512020915)                 3  1.000000      3
8   (0, 302.70006607201134)                 1  1.000000      1
9     (0, 302.901997352279)                 2  1.000000      2
10   (0, 303.1019960343383)                 3  1.000000      3
11  (0, 322.41541526422094)                 1  1.000000      1
12    (0, 322.929620815434)                 1  1.000000      1
13   (0, 323.0718341174297)                 6  1.000000      6


In [418]:
# Path length Stage1 -> Stage2 -> Stage3, computed per table from its own columns.
for dataset in (train_data, test_data):
    dataset['DISTANCE_STAGE(1-2)+STAGE(2-3)_Dam'] = (
        dataset['DISTANCE_STAGE1_STAGE2_Dam'] + dataset['DISTANCE_STAGE2_STAGE3_Dam']
    )

In [419]:
# Compare the two-leg path length with the direct Stage1-Stage3 distance per group.
summary_df = summarize_grouped_data(train_data, ['DISTANCE_STAGE(1-2)+STAGE(2-3)_Dam'
                                                 , 'DISTANCE_STAGE1_STAGE3_Dam'])

Grouped by: DISTANCE_STAGE(1-2)+STAGE(2-3)_Dam, DISTANCE_STAGE1_STAGE3_Dam

                                       group  'AdNormal' count     ratio  \
0                (388.50185898163556, 388.5)                 0  0.000000   
1                (388.50186282783307, 388.5)                30  0.069124   
2                 (388.5018666989478, 388.5)               450  0.094142   
3                 (388.5018745167781, 388.5)                 4  0.153846   
4    (388.7002986298364, 388.70000000000005)                 0  0.000000   
5                 (388.9002980902605, 388.9)                24  0.115942   
6                (388.90268278934377, 388.9)               104  0.094288   
7                (388.90268725273387, 388.9)                96  0.085791   
8                 (388.9047693675001, 388.9)                28  0.068796   
9                 (388.9047773021729, 388.9)                 1  0.016667   
10               (389.00186601706287, 389.0)                99  0.141026   
11   (389.10

In [420]:
# Direct Stage1-Stage3 distance minus the two-leg path length, per table.
for dataset in (train_data, test_data):
    dataset['diff_DISTANCE_stage_Dam'] = (
        dataset['DISTANCE_STAGE1_STAGE3_Dam'] - dataset['DISTANCE_STAGE(1-2)+STAGE(2-3)_Dam']
    )

In [421]:
# Target class ratios and counts for each distinct distance difference.
value_counts_ratio_count(train_data, 'diff_DISTANCE_stage_Dam', 'target')


diff_DISTANCE_stage_Dam별 target 비율 및 갯수

            AbNormal    Normal  AbNormal  Normal  Total
variable                                               
-75.582935  0.067194  0.932806      17.0   236.0    253
-75.399381  0.052083  0.947917       5.0    91.0     96
-75.325696  0.073593  0.926407      17.0   214.0    231
-75.325696  0.035581  0.964419      19.0   515.0    534
-75.255466  0.000000  1.000000       0.0     3.0      3
-75.243033  0.048695  0.951305     304.0  5939.0   6243
-75.224411  0.072165  0.927835      14.0   180.0    194
-75.190006  0.000000  1.000000       0.0    29.0     29
-75.183780  0.068807  0.931193      15.0   203.0    218
-75.112275  0.000000  1.000000       0.0    34.0     34
-75.106050  0.085174  0.914826      27.0   290.0    317
-75.067250  0.115385  0.884615       3.0    23.0     26
-75.034721  0.000000  1.000000       0.0     1.0      1
-74.996010  0.000000  1.000000       0.0     3.0      3
-74.976044  0.034483  0.965517       1.0    28.0     29
-0.014

In [422]:
# Print target class ratios and counts per distinct Stage1-Stage2 distance
# (same call as in the earlier cell). The other two distances stay disabled.
value_counts_ratio_count(train_data, 'DISTANCE_STAGE1_STAGE2_Dam', 'target')
# value_counts_ratio_count(train_data, 'DISTANCE_STAGE2_STAGE3_Dam', 'target')
# value_counts_ratio_count(train_data, 'DISTANCE_STAGE1_STAGE3_Dam', 'target')


DISTANCE_STAGE1_STAGE2_Dam별 target 비율 및 갯수

            AbNormal    Normal  AbNormal  Normal  Total
variable                                               
85.501462   0.153846  0.846154       4.0    22.0     26
86.001453   0.100146  0.899854     549.0  4933.0   5482
86.201450   0.044643  0.955357       5.0   107.0    112
86.204698   0.050584  0.949416      52.0   976.0   1028
86.251449   0.069124  0.930876      30.0   404.0    434
86.401447   0.052632  0.947368       3.0    54.0     57
86.401447   0.051173  0.948827      48.0   890.0    938
86.500000   0.042803  0.957197      91.0  2035.0   2126
86.500000   0.052544  0.947456     126.0  2272.0   2398
86.501445   0.028226  0.971774      14.0   482.0    496
86.501445   0.060259  0.939741     242.0  3774.0   4016
86.700231   0.012987  0.987013       2.0   152.0    154
86.701442   0.033888  0.966112      55.0  1568.0   1623
86.701442   0.046480  0.953520     169.0  3467.0   3636
86.701442   0.036831  0.963169      73.0  1909.0   1982
86.

In [423]:
# Display the Stage1-Stage2 distance column row by row.
train_data['DISTANCE_STAGE1_STAGE2_Dam']

0         86.204698
1         86.500000
2        323.071834
3        303.101996
4         86.501445
5         86.001453
6         86.701442
7        323.071834
8         86.501445
9         86.501445
10       323.071834
11       323.071834
12       302.901997
13       303.101996
14        86.001453
15        86.001453
16        86.001453
17        86.701442
18        86.501445
19        86.701442
20        86.701442
21       323.071834
22        86.001453
23        86.701442
24        86.701442
25        86.501445
26       302.901997
27        86.500000
28        86.500000
29        86.701442
30       323.071834
31        86.251449
32       323.071834
33       323.071834
34       323.071834
35       323.071834
36        86.500000
37        86.251449
38       302.900594
39       302.901997
40        86.500000
41        86.001453
42       302.901997
43       302.900594
44        86.401447
45        86.500000
46        86.501445
47       303.103233
48        86.401447
49       323.071834


In [424]:
import numpy as np

# Columns holding the three pairwise stage distances (the triangle's sides).
stage1_stage2_col = 'DISTANCE_STAGE1_STAGE2_Dam'
stage2_stage3_col = 'DISTANCE_STAGE2_STAGE3_Dam'
stage1_stage3_col = 'DISTANCE_STAGE1_STAGE3_Dam'


def calculate_triangle_features(data):
    """
    Add the area and height of the triangle spanned by the three stage positions.

    The side lengths are read from the three stage-distance columns. The area
    uses Heron's formula; the height is taken over the Stage1-Stage3 side.
    Writes 'DISTANCE_TRIANGLE_area_Dam' and 'DISTANCE_TRIANGLE_height_Dam'
    onto ``data`` itself and returns it.

    Parameters:
    data (pd.DataFrame): table containing the three stage-distance columns

    Returns:
    pd.DataFrame: the same object that was passed in, with the new columns
    """
    a = data[stage1_stage2_col]
    b = data[stage2_stage3_col]
    c = data[stage1_stage3_col]

    # Heron's formula: area = sqrt(s (s-a) (s-b) (s-c)) with s the half perimeter.
    s = (a + b + c) / 2
    heron_product = s * (s - a) * (s - b) * (s - c)

    # For (nearly) collinear points rounding can push the product marginally
    # below zero; clamp it so the area is 0 instead of NaN.
    area = np.sqrt(np.maximum(heron_product, 0))

    # Height over base c. A zero-length base still divides by zero and
    # yields NaN/inf, as before.
    height = (2 * area) / c

    data['DISTANCE_TRIANGLE_area_Dam'] = area
    data['DISTANCE_TRIANGLE_height_Dam'] = height

    return data

# Apply to train_data (adds the triangle area/height columns)
train_data = calculate_triangle_features(train_data)

# Apply to test_data
test_data = calculate_triangle_features(test_data)

In [425]:
# Print the target ratio and counts for each value of the triangle-area feature.
value_counts_ratio_count(train_data, 'DISTANCE_TRIANGLE_area_Dam', 'target')


DISTANCE_TRIANGLE_area_Dam별 target 비율 및 갯수

           AbNormal    Normal  AbNormal  Normal  Total
variable                                              
0.000      0.047966  0.952034     217.0  4307.0   4524
38.870     0.000000  1.000000       0.0     2.0      2
38.890     0.115942  0.884058      24.0   183.0    207
38.950     0.012987  0.987013       2.0   152.0    154
97.125     0.000000  1.000000       0.0     2.0      2
97.125     0.153846  0.846154       4.0    22.0     26
97.125     0.069124  0.930876      30.0   404.0    434
97.125     0.094142  0.905858     450.0  4330.0   4780
97.250     0.141026  0.858974      99.0   603.0    702
97.275     0.666667  0.333333       2.0     1.0      3
97.275     0.046480  0.953520     169.0  3467.0   3636
97.300     0.045139  0.954861      65.0  1375.0   1440
97.300     0.028226  0.971774      14.0   482.0    496
97.350     0.030303  0.969697       2.0    64.0     66
97.375     0.047619  0.952381       2.0    40.0     42
97.375     0.036261 

In [426]:
import pandas as pd

# Configure pandas display options (all set to None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Summarize the data grouped by dispenser number and triangle area
summary_df = summarize_grouped_data(train_data, ['Dispenser_num', 'DISTANCE_TRIANGLE_area_Dam'])

# Print the result
print(summary_df)

Grouped by: Dispenser_num, DISTANCE_TRIANGLE_area_Dam

                      group  'AdNormal' count     ratio  Total
0                  (0, 0.0)                 3  1.000000      3
1   (0, 38.889999995880395)                 1  1.000000      1
2    (0, 97.27499999905812)                 5  1.000000      5
3     (0, 97.2999999983805)                 4  1.000000      4
4    (0, 97.34999999933613)                 1  1.000000      1
5    (0, 97.37499999918606)                 1  1.000000      1
6    (0, 97.45000000164453)                 3  1.000000      3
7    (0, 114.5549999994864)                 1  1.000000      1
8   (0, 175.09499999841518)                 1  1.000000      1
9    (0, 214.1700000001986)                 2  1.000000      2
10  (0, 214.28000000011698)                 3  1.000000      3
11  (0, 21820.265000000018)                 3  1.000000      3
12   (0, 21820.26500000003)                 3  1.000000      3
13  (0, 21825.870000000014)                 1  1.000000      1


In [427]:
# Print the target ratio and counts for each value of the triangle-height feature.
value_counts_ratio_count(train_data, 'DISTANCE_TRIANGLE_height_Dam', 'target')


DISTANCE_TRIANGLE_height_Dam별 target 비율 및 갯수

            AbNormal    Normal  AbNormal  Normal  Total
variable                                               
0.000000    0.047966  0.952034     217.0  4307.0   4524
0.200000    0.115942  0.884058      24.0   183.0    207
0.200000    0.000000  1.000000       0.0     2.0      2
0.200000    0.012987  0.987013       2.0   152.0    154
0.500000    0.045139  0.954861      65.0  1375.0   1440
0.500000    0.666667  0.333333       2.0     1.0      3
0.500000    0.046480  0.953520     169.0  3467.0   3636
0.500000    0.047619  0.952381       2.0    40.0     42
0.500000    0.036261  0.963739      45.0  1196.0   1241
0.500000    0.030303  0.969697       2.0    64.0     66
0.500000    0.068765  0.931235     177.0  2397.0   2574
0.500000    0.000000  1.000000       0.0     2.0      2
0.500000    0.153846  0.846154       4.0    22.0     26
0.500000    0.141026  0.858974      99.0   603.0    702
0.500000    0.069124  0.930876      30.0   404.0    434
0

In [428]:
import pandas as pd

# Configure pandas display options (all set to None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Summarize the data grouped by dispenser number and triangle height
summary_df = summarize_grouped_data(train_data, ['Dispenser_num', 'DISTANCE_TRIANGLE_height_Dam'])

# Print the result
print(summary_df)

Grouped by: Dispenser_num, DISTANCE_TRIANGLE_height_Dam

                       group  'AdNormal' count     ratio  Total
0                   (0, 0.0)                 3  1.000000      3
1   (0, 0.19999999997881407)                 1  1.000000      1
2    (0, 0.4999999999916778)                 4  1.000000      4
3   (0, 0.49999999999515876)                 5  1.000000      5
4    (0, 0.4999999999958207)                 1  1.000000      1
5    (0, 0.4999999999965903)                 1  1.000000      1
6    (0, 0.5000000000084378)                 3  1.000000      3
7    (0, 0.5888200435261629)                 1  1.000000      1
8    (0, 0.8999999999918539)                 1  1.000000      1
9    (0, 1.1000000000006005)                 3  1.000000      3
10     (0, 1.10000000000102)                 2  1.000000      2
11   (0, 112.10000000000008)                 1  1.000000      1
12   (0, 112.10000000000011)                 3  1.000000      3
13   (0, 112.10000000000014)                 3 

In [429]:
# Select the column names that contain '_Dam'
Process_Desc_col = train_data.filter(like='_Dam').columns

# Print the selected column names
print("<Dam 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<Dam 공정 관련 변수>
DISCHARGED SPEED OF RESIN Collect Result_Dam
DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam
DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam
DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam
Dispense Volume(Stage1) Collect Result_Dam
Dispense Volume(Stage2) Collect Result_Dam
Dispense Volume(Stage3) Collect Result_Dam
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam
HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam
HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam
HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam
HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam
HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam
HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam
HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam
HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam
Head Clean Position Z Collect Result_Dam
Head Purge Position Z Collect Result_Dam
Head Z

In [430]:
# Select the column names that contain both 'Stage1 Circle' and '_Dam'
Process_Desc_col = train_data.filter(like='_Dam').columns
filtered_columns = [col for col in Process_Desc_col if 'Stage1 Circle' in col]

print("\n Dam 공정 관련 변수 중 Stage1 Circle 포함 변수>")
for col in filtered_columns:
    print(col)


 Dam 공정 관련 변수 중 Stage1 Circle 포함 변수>
Stage1 Circle1 Distance Speed Collect Result_Dam
Stage1 Circle2 Distance Speed Collect Result_Dam
Stage1 Circle3 Distance Speed Collect Result_Dam
Stage1 Circle4 Distance Speed Collect Result_Dam


In [431]:
import pandas as pd

# Configure pandas display options (all set to None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Summarize the data grouped by the four Stage1 Circle speed columns
summary_df = summarize_grouped_data(train_data, [
    'Stage1 Circle1 Distance Speed Collect Result_Dam'
    , 'Stage1 Circle2 Distance Speed Collect Result_Dam'
    , 'Stage1 Circle3 Distance Speed Collect Result_Dam'
    , 'Stage1 Circle4 Distance Speed Collect Result_Dam'
    ])

Grouped by: Stage1 Circle1 Distance Speed Collect Result_Dam, Stage1 Circle2 Distance Speed Collect Result_Dam, Stage1 Circle3 Distance Speed Collect Result_Dam, Stage1 Circle4 Distance Speed Collect Result_Dam

                      group  'AdNormal' count     ratio  Total
0  (4000, 4000, 4000, 4000)               256  0.041707   6138
1  (5000, 5000, 5000, 5000)               305  0.062232   4901
2  (5800, 5800, 5800, 5800)               269  0.050803   5295
3  (6000, 6000, 6000, 6000)                65  0.048435   1342
4  (6200, 6200, 6200, 6200)                15  0.052817    284
5  (6500, 6500, 6500, 6500)               595  0.047452  12539
6  (9000, 9000, 9000, 9000)               845  0.084441  10007


In [432]:
import pandas as pd

# Configure pandas display options (all set to None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Summarize the data grouped by the four Stage1 Line speed columns
summary_df = summarize_grouped_data(train_data, [
    'Stage1 Line1 Distance Speed Collect Result_Dam'
    , 'Stage1 Line2 Distance Speed Collect Result_Dam'
    , 'Stage1 Line3 Distance Speed Collect Result_Dam'
    , 'Stage1 Line4 Distance Speed Collect Result_Dam'
    ])

Grouped by: Stage1 Line1 Distance Speed Collect Result_Dam, Stage1 Line2 Distance Speed Collect Result_Dam, Stage1 Line3 Distance Speed Collect Result_Dam, Stage1 Line4 Distance Speed Collect Result_Dam

                        group  'AdNormal' count     ratio  Total
0    (4000, 4000, 4000, 4000)               256  0.041707   6138
1    (5000, 5000, 5000, 5000)               305  0.062232   4901
2    (5800, 5800, 5600, 5800)               227  0.056764   3999
3    (5800, 5800, 5800, 5800)                42  0.032407   1296
4    (6000, 6000, 6000, 6000)                65  0.048435   1342
5    (6000, 6500, 6500, 6500)                 5  0.024752    202
6    (6200, 6200, 6200, 6200)                15  0.052817    284
7    (6500, 6500, 6000, 6500)                 4  0.013605    294
8    (6500, 6500, 6500, 6500)               586  0.048663  12042
9    (7000, 9000, 7000, 9000)                12  0.113208    106
10   (9000, 9000, 9000, 9000)               833  0.084133   9901
11  (13000, 6500

In [433]:
import pandas as pd

# Configure pandas display options (all set to None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Summarize the data grouped by all eight Stage1 Circle and Line speed columns
summary_df = summarize_grouped_data(train_data, [
    'Stage1 Circle1 Distance Speed Collect Result_Dam'
    , 'Stage1 Circle2 Distance Speed Collect Result_Dam'
    , 'Stage1 Circle3 Distance Speed Collect Result_Dam'
    , 'Stage1 Circle4 Distance Speed Collect Result_Dam'
    , 'Stage1 Line1 Distance Speed Collect Result_Dam'
    , 'Stage1 Line2 Distance Speed Collect Result_Dam'
    , 'Stage1 Line3 Distance Speed Collect Result_Dam'
    , 'Stage1 Line4 Distance Speed Collect Result_Dam'
    ])

Grouped by: Stage1 Circle1 Distance Speed Collect Result_Dam, Stage1 Circle2 Distance Speed Collect Result_Dam, Stage1 Circle3 Distance Speed Collect Result_Dam, Stage1 Circle4 Distance Speed Collect Result_Dam, Stage1 Line1 Distance Speed Collect Result_Dam, Stage1 Line2 Distance Speed Collect Result_Dam, Stage1 Line3 Distance Speed Collect Result_Dam, Stage1 Line4 Distance Speed Collect Result_Dam

                                                group  'AdNormal' count  \
0    (4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000)               256   
1    (5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000)               305   
2    (5800, 5800, 5800, 5800, 5800, 5800, 5600, 5800)               227   
3    (5800, 5800, 5800, 5800, 5800, 5800, 5800, 5800)                42   
4    (6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000)                65   
5    (6200, 6200, 6200, 6200, 6200, 6200, 6200, 6200)                15   
6    (6500, 6500, 6500, 6500, 6000, 6500, 6500, 6500)                 5

In [434]:
# Select the column names that contain both 'Stage2 Line' and '_Dam'
Process_Desc_col = train_data.filter(like='_Dam').columns
filtered_columns = [col for col in Process_Desc_col if 'Stage2 Line' in col]

# The header names the substring actually filtered on ('Stage2 Line');
# it previously said 'Stage1 Line', which did not match the listed columns.
print("\n Dam 공정 관련 변수 중 Stage2 Line 포함 변수>")
for col in filtered_columns:
    print(col)


 Dam 공정 관련 변수 중 Stage1 Line 포함 변수>
Stage2 Line1 Distance Speed Collect Result_Dam
Stage2 Line2 Distance Speed Collect Result_Dam
Stage2 Line3 Distance Speed Collect Result_Dam
Stage2 Line4 Distance Speed Collect Result_Dam


In [435]:
# Select the column names that contain both 'Stage2 Circle' and '_Dam'
Process_Desc_col = train_data.filter(like='_Dam').columns
filtered_columns = [col for col in Process_Desc_col if 'Stage2 Circle' in col]

# The header names the substring actually filtered on ('Stage2 Circle');
# it previously said 'Stage1 Circle', which did not match the listed columns.
print("\n Dam 공정 관련 변수 중 Stage2 Circle 포함 변수>")
for col in filtered_columns:
    print(col)


 Dam 공정 관련 변수 중 Stage1 Circle 포함 변수>
Stage2 Circle1 Distance Speed Collect Result_Dam
Stage2 Circle2 Distance Speed Collect Result_Dam
Stage2 Circle3 Distance Speed Collect Result_Dam
Stage2 Circle4 Distance Speed Collect Result_Dam


In [436]:
import pandas as pd

# Configure pandas display options (all set to None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Summarize the data grouped by all eight Stage2 Circle and Line speed columns
summary_df = summarize_grouped_data(train_data, [
    'Stage2 Circle1 Distance Speed Collect Result_Dam'
    , 'Stage2 Circle2 Distance Speed Collect Result_Dam'
    , 'Stage2 Circle3 Distance Speed Collect Result_Dam'
    , 'Stage2 Circle4 Distance Speed Collect Result_Dam'
    , 'Stage2 Line1 Distance Speed Collect Result_Dam'
    , 'Stage2 Line2 Distance Speed Collect Result_Dam'
    , 'Stage2 Line3 Distance Speed Collect Result_Dam'
    , 'Stage2 Line4 Distance Speed Collect Result_Dam'
    ])

Grouped by: Stage2 Circle1 Distance Speed Collect Result_Dam, Stage2 Circle2 Distance Speed Collect Result_Dam, Stage2 Circle3 Distance Speed Collect Result_Dam, Stage2 Circle4 Distance Speed Collect Result_Dam, Stage2 Line1 Distance Speed Collect Result_Dam, Stage2 Line2 Distance Speed Collect Result_Dam, Stage2 Line3 Distance Speed Collect Result_Dam, Stage2 Line4 Distance Speed Collect Result_Dam

                                                       group  \
0           (4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000)   
1           (5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000)   
2           (5300, 5300, 5300, 5300, 5300, 5300, 5300, 5300)   
3           (5500, 5500, 5500, 5500, 5500, 5500, 5500, 5500)   
4           (5500, 5500, 5500, 5500, 5500, 5800, 5500, 5800)   
5           (6000, 6000, 6000, 6000, 5500, 6000, 5000, 6000)   
6           (6500, 6500, 6500, 6500, 5000, 6500, 5000, 6500)   
7           (6500, 6500, 6500, 6500, 5500, 6500, 5500, 6500)   
8           (6500, 6

In [437]:
import pandas as pd

# Configure pandas display options (all set to None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Summarize the data grouped by all eight Stage3 Circle and Line speed columns
summary_df = summarize_grouped_data(train_data, [
    'Stage3 Circle1 Distance Speed Collect Result_Dam'
    , 'Stage3 Circle2 Distance Speed Collect Result_Dam'
    , 'Stage3 Circle3 Distance Speed Collect Result_Dam'
    , 'Stage3 Circle4 Distance Speed Collect Result_Dam'
    , 'Stage3 Line1 Distance Speed Collect Result_Dam'
    , 'Stage3 Line2 Distance Speed Collect Result_Dam'
    , 'Stage3 Line3 Distance Speed Collect Result_Dam'
    , 'Stage3 Line4 Distance Speed Collect Result_Dam'
    ])

Grouped by: Stage3 Circle1 Distance Speed Collect Result_Dam, Stage3 Circle2 Distance Speed Collect Result_Dam, Stage3 Circle3 Distance Speed Collect Result_Dam, Stage3 Circle4 Distance Speed Collect Result_Dam, Stage3 Line1 Distance Speed Collect Result_Dam, Stage3 Line2 Distance Speed Collect Result_Dam, Stage3 Line3 Distance Speed Collect Result_Dam, Stage3 Line4 Distance Speed Collect Result_Dam

                                              group  'AdNormal' count  \
0  (4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000)               256   
1  (5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000)               305   
2  (5800, 5800, 5800, 5800, 5800, 5800, 5800, 5800)               269   
3  (6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000)                70   
4  (6200, 6200, 6200, 6200, 6200, 6200, 6200, 6200)                15   
5  (6500, 6500, 6500, 6500, 6000, 6500, 6500, 6500)                 5   
6  (6500, 6500, 6500, 6500, 6500, 6500, 6000, 6500)                 4   
7  (6500, 65

In [438]:
import pandas as pd

# Helper listing the column labels that contain '_Dam'
def filter_dam_columns(data):
    """Return the column labels of ``data`` whose name contains ``'_Dam'``."""
    dam_frame = data.filter(like='_Dam')
    return dam_frame.columns

# Build one aggregated derived feature per stage
def create_stage_variables(data, columns):
    """Replace each stage's columns with a single row-wise sum column.

    For each of 'Stage1', 'Stage2' and 'Stage3', every entry of ``columns``
    whose name contains that label is summed row-wise into
    ``<stage>_Total_Distance_Speed_Collect_Result_Dam`` and the source
    columns are then dropped from ``data``.

    NOTE(review): matching is by substring, so any column in ``columns`` that
    contains the stage label is included in the sum, not only the
    'Distance Speed' ones. Confirm this is the intended feature definition.

    Parameters:
    data (pd.DataFrame): frame to modify in place.
    columns (iterable of str): candidate column names to search.

    Returns:
    pd.DataFrame: the same ``data`` object with the aggregate columns added
        and the source columns removed.
    """
    stages = ['Stage1', 'Stage2', 'Stage3']
    for stage in stages:
        new_col_name = f'{stage}_Total_Distance_Speed_Collect_Result_Dam'
        # The aggregate's own name contains the stage label. Leave it out of
        # the sources, otherwise re-running the cell would sum the aggregate
        # into itself and then drop it.
        stage_cols = [col for col in columns if stage in col and col != new_col_name]
        if not stage_cols and new_col_name in data.columns:
            # Already aggregated on an earlier run; keep the existing feature.
            continue
        data[new_col_name] = data[stage_cols].sum(axis=1)
        # Drop the source columns
        data.drop(columns=stage_cols, inplace=True)
    return data

# Apply to train_data
train_dam_columns = filter_dam_columns(train_data)
train_data = create_stage_variables(train_data, train_dam_columns)

# Apply to test_data
test_dam_columns = filter_dam_columns(test_data)
test_data = create_stage_variables(test_data, test_dam_columns)

  del sys.path[0]


In [439]:
# Show the five most frequent values of the Stage1 aggregate feature.
train_data['Stage1_Total_Distance_Speed_Collect_Result_Dam'].value_counts().head()

73221.570    4248
73221.670    2917
33739.140    2009
53730.963    1499
41227.640    1431
Name: Stage1_Total_Distance_Speed_Collect_Result_Dam, dtype: int64

In [440]:
# Select the column names that contain '_Dam'
Process_Desc_col = train_data.filter(like='_Dam').columns

# Print the selected column names
print("<Dam 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<Dam 공정 관련 변수>
DISCHARGED SPEED OF RESIN Collect Result_Dam
Head Clean Position Z Collect Result_Dam
Head Purge Position Z Collect Result_Dam
Head Zero Position Y Collect Result_Dam
Head Zero Position Z Collect Result_Dam
Machine Tact time Collect Result_Dam
PalletID Collect Result_Dam
Production Qty Collect Result_Dam
Receip No Collect Result_Dam
THICKNESS 1 Collect Result_Dam
THICKNESS 2 Collect Result_Dam
THICKNESS 3 Collect Result_Dam
WorkMode Collect Result_Dam
CURE_Time_Dam
DISTANCE_STAGE1_STAGE2_Dam
DISTANCE_STAGE2_STAGE3_Dam
DISTANCE_STAGE1_STAGE3_Dam
DISTANCE_STAGE(1-2)+STAGE(2-3)_Dam
diff_DISTANCE_stage_Dam
DISTANCE_TRIANGLE_area_Dam
DISTANCE_TRIANGLE_height_Dam
Stage1_Total_Distance_Speed_Collect_Result_Dam
Stage2_Total_Distance_Speed_Collect_Result_Dam
Stage3_Total_Distance_Speed_Collect_Result_Dam


In [441]:
# Select the column names that contain both 'THICKNESS' and '_Dam'
Process_Desc_col = train_data.filter(like='_Dam').columns
filtered_columns = [col for col in Process_Desc_col if 'THICKNESS' in col]

print("\n Dam 공정 관련 변수 중 THICKNESS 포함 변수>")
for col in filtered_columns:
    print(col)


 Dam 공정 관련 변수 중 THICKNESS 포함 변수>
THICKNESS 1 Collect Result_Dam
THICKNESS 2 Collect Result_Dam
THICKNESS 3 Collect Result_Dam


In [442]:
import pandas as pd

# Configure pandas display options (all set to None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Summarize the data grouped by the three THICKNESS columns
summary_df = summarize_grouped_data(train_data, [
    'THICKNESS 1 Collect Result_Dam'
    , 'THICKNESS 2 Collect Result_Dam'
    , 'THICKNESS 3 Collect Result_Dam'
    ])

Grouped by: THICKNESS 1 Collect Result_Dam, THICKNESS 2 Collect Result_Dam, THICKNESS 3 Collect Result_Dam

                      group  'AdNormal' count     ratio  Total
0   (-0.054, -0.219, 0.007)                37  0.041855    884
1  (-0.019, -0.021, -0.118)                74  0.044876   1649
2  (-0.015, -0.036, -0.026)                72  0.074689    964
3           (0.0, 0.0, 0.0)              2091  0.059036  35419
4    (0.012, -0.022, 0.003)                 2  0.016000    125
5    (0.014, -0.058, 0.012)                62  0.051796   1197
6     (0.014, 0.007, 0.012)                 6  0.040000    150
7     (0.037, 0.005, 0.024)                 6  0.050847    118


In [443]:
# Build the combined thickness derived feature
def create_total_thickness_dam(data):
    """Replace the three THICKNESS columns with their element-wise sum.

    Adds ``Total_THICKNESS_Collect_Result_Dam`` to ``data``, drops the three
    source columns in place, and returns the same frame.
    """
    thickness_cols = [
        'THICKNESS 1 Collect Result_Dam',
        'THICKNESS 2 Collect Result_Dam',
        'THICKNESS 3 Collect Result_Dam',
    ]
    first, second, third = (data[name] for name in thickness_cols)
    data['Total_THICKNESS_Collect_Result_Dam'] = first + second + third
    # Remove the source columns now that the total exists
    data.drop(columns=thickness_cols, inplace=True)
    return data

# Apply the combined-thickness feature to both train and test data
train_data = create_total_thickness_dam(train_data)
test_data = create_total_thickness_dam(test_data)

In [444]:
# Print the target ratio and counts for each value of the combined-thickness feature.
value_counts_ratio_count(train_data, 'Total_THICKNESS_Collect_Result_Dam', 'target')


Total_THICKNESS_Collect_Result_Dam별 target 비율 및 갯수

          AbNormal    Normal  AbNormal  Normal  Total
variable                                             
-0.266    0.041855  0.958145        37     847    884
-0.158    0.044876  0.955124        74    1575   1649
-0.077    0.074689  0.925311        72     892    964
-0.032    0.051796  0.948204        62    1135   1197
-0.007    0.016000  0.984000         2     123    125
 0.000    0.059036  0.940964      2091   33328  35419
 0.033    0.040000  0.960000         6     144    150
 0.066    0.050847  0.949153         6     112    118


In [445]:
# Select the column names that contain '_Dam'
Process_Desc_col = train_data.filter(like='_Dam').columns

# Print the selected column names
print("<Dam 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<Dam 공정 관련 변수>
DISCHARGED SPEED OF RESIN Collect Result_Dam
Head Clean Position Z Collect Result_Dam
Head Purge Position Z Collect Result_Dam
Head Zero Position Y Collect Result_Dam
Head Zero Position Z Collect Result_Dam
Machine Tact time Collect Result_Dam
PalletID Collect Result_Dam
Production Qty Collect Result_Dam
Receip No Collect Result_Dam
WorkMode Collect Result_Dam
CURE_Time_Dam
DISTANCE_STAGE1_STAGE2_Dam
DISTANCE_STAGE2_STAGE3_Dam
DISTANCE_STAGE1_STAGE3_Dam
DISTANCE_STAGE(1-2)+STAGE(2-3)_Dam
diff_DISTANCE_stage_Dam
DISTANCE_TRIANGLE_area_Dam
DISTANCE_TRIANGLE_height_Dam
Stage1_Total_Distance_Speed_Collect_Result_Dam
Stage2_Total_Distance_Speed_Collect_Result_Dam
Stage3_Total_Distance_Speed_Collect_Result_Dam
Total_THICKNESS_Collect_Result_Dam


In [446]:
# Select the column names that contain both 'Head' and '_Dam'
Process_Desc_col = train_data.filter(like='_Dam').columns
filtered_columns = [col for col in Process_Desc_col if 'Head' in col]

print("\n Dam 공정 관련 변수 중 Head 포함 변수>")
for col in filtered_columns:
    print(col)


 Dam 공정 관련 변수 중 Head 포함 변수>
Head Clean Position Z Collect Result_Dam
Head Purge Position Z Collect Result_Dam
Head Zero Position Y Collect Result_Dam
Head Zero Position Z Collect Result_Dam


In [447]:
import pandas as pd

# Configure pandas display options (all set to None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Summarize the data grouped by the four Head position columns
summary_df = summarize_grouped_data(train_data, [
    'Head Clean Position Z Collect Result_Dam'
    , 'Head Purge Position Z Collect Result_Dam'
    , 'Head Zero Position Y Collect Result_Dam'
    , 'Head Zero Position Z Collect Result_Dam'
    ])

Grouped by: Head Clean Position Z Collect Result_Dam, Head Purge Position Z Collect Result_Dam, Head Zero Position Y Collect Result_Dam, Head Zero Position Z Collect Result_Dam

                             group  'AdNormal' count     ratio  Total
0  (118.85, 130.85, 300.0, 265.02)                 0  0.000000     33
1    (124.0, 130.85, 300.0, 265.0)                88  0.049383   1782
2   (124.0, 130.85, 300.0, 265.02)               315  0.046938   6711
3     (124.5, 124.5, 303.5, 265.0)                28  0.084592    331
4   (130.85, 130.85, 300.0, 265.0)               716  0.048261  14836
5  (130.85, 130.85, 300.0, 265.02)               424  0.049865   8503
6   (130.85, 130.85, 303.5, 265.0)                43  0.671875     64
7    (130.85, 133.5, 300.0, 265.0)                 2  0.133333     15
8     (133.5, 133.5, 300.0, 265.0)               323  0.083484   3869
9     (133.5, 133.5, 303.5, 265.0)               411  0.094223   4362


In [448]:
# Select the column names that contain '_Dam'
Process_Desc_col = train_data.filter(like='_Dam').columns

# Print the selected column names
print("<Dam 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<Dam 공정 관련 변수>
DISCHARGED SPEED OF RESIN Collect Result_Dam
Head Clean Position Z Collect Result_Dam
Head Purge Position Z Collect Result_Dam
Head Zero Position Y Collect Result_Dam
Head Zero Position Z Collect Result_Dam
Machine Tact time Collect Result_Dam
PalletID Collect Result_Dam
Production Qty Collect Result_Dam
Receip No Collect Result_Dam
WorkMode Collect Result_Dam
CURE_Time_Dam
DISTANCE_STAGE1_STAGE2_Dam
DISTANCE_STAGE2_STAGE3_Dam
DISTANCE_STAGE1_STAGE3_Dam
DISTANCE_STAGE(1-2)+STAGE(2-3)_Dam
diff_DISTANCE_stage_Dam
DISTANCE_TRIANGLE_area_Dam
DISTANCE_TRIANGLE_height_Dam
Stage1_Total_Distance_Speed_Collect_Result_Dam
Stage2_Total_Distance_Speed_Collect_Result_Dam
Stage3_Total_Distance_Speed_Collect_Result_Dam
Total_THICKNESS_Collect_Result_Dam


In [449]:
# Names of the columns to drop (the raw stage-distance columns)
columns_to_drop = [
    'DISTANCE_STAGE1_STAGE2_Dam'
    , 'DISTANCE_STAGE2_STAGE3_Dam'
    , 'DISTANCE_STAGE1_STAGE3_Dam'
    , 'DISTANCE_STAGE(1-2)+STAGE(2-3)_Dam'
    , 'diff_DISTANCE_stage_Dam'
]

# Drop the columns from train_data
train_data = train_data.drop(columns=columns_to_drop)

# Drop the columns from test_data
test_data = test_data.drop(columns=columns_to_drop)

In [450]:
# Select the column names that contain '_Dam'
Process_Desc_col = train_data.filter(like='_Dam').columns

# Print the selected column names
print("<Dam 공정 관련 변수>")
for col in Process_Desc_col:
    print(col)

<Dam 공정 관련 변수>
DISCHARGED SPEED OF RESIN Collect Result_Dam
Head Clean Position Z Collect Result_Dam
Head Purge Position Z Collect Result_Dam
Head Zero Position Y Collect Result_Dam
Head Zero Position Z Collect Result_Dam
Machine Tact time Collect Result_Dam
PalletID Collect Result_Dam
Production Qty Collect Result_Dam
Receip No Collect Result_Dam
WorkMode Collect Result_Dam
CURE_Time_Dam
DISTANCE_TRIANGLE_area_Dam
DISTANCE_TRIANGLE_height_Dam
Stage1_Total_Distance_Speed_Collect_Result_Dam
Stage2_Total_Distance_Speed_Collect_Result_Dam
Stage3_Total_Distance_Speed_Collect_Result_Dam
Total_THICKNESS_Collect_Result_Dam


## AutoClave

## Fill1

## Fill2

---