# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 데이터 읽어오기


In [2]:
# Seed constant; it is not referenced in the visible cells — presumably reused
# later for reproducible sampling/modelling (TODO confirm).
RANDOM_STATE = 110

# Load the train and test tables prepared for EDA. Paths are relative to the
# current working directory.
train_data = pd.read_csv("train_data_forEDA.csv")
test_data = pd.read_csv("test_data_forEDA.csv")

---

반복적으로 쓰는 툴 함수화

In [3]:
def plot_box(dataframe, column_name):
    """
    Draw a horizontal box plot for one column of a DataFrame.

    Missing values are dropped before plotting: matplotlib derives the box
    statistics from percentiles, so a single NaN in the input would otherwise
    produce an empty box.

    Parameters:
    dataframe (pd.DataFrame): Source data.
    column_name (str): Name of the column to plot.
    """
    # Hand matplotlib a plain array so the (now gappy) pandas index is irrelevant.
    values = dataframe[column_name].dropna().to_numpy()

    plt.figure(figsize=(10, 6))
    plt.boxplot(values, vert=False)
    plt.xlabel(column_name)
    plt.title(f'Box Plot of {column_name}')
    plt.show()

In [4]:
import pandas as pd

def value_counts_ratio_count(df, col_name, target_name):
    """
    Print, for every distinct value of a column, the share and the count of
    each target class plus the total number of rows holding that value.

    The printed table repeats the target class labels twice: the first group
    of columns holds the ratios, the second group the counts, and the last
    column ('Total') the row total.

    Parameters:
    df (pd.DataFrame): Source data.
    col_name (str): Column whose distinct values become the table rows.
    target_name (str): Target column whose classes become the table columns.

    Returns:
    None. The table is only printed.
    """
    per_value_target = df.groupby(col_name)[target_name]

    # Share of each target class within every value of col_name.
    ratios = per_value_target.value_counts(normalize=True).unstack().fillna(0)

    # Number of rows of each target class within every value of col_name.
    counts = per_value_target.value_counts().unstack().fillna(0)

    # Total number of rows per value of col_name.
    total_counts = df[col_name].value_counts().rename('Total_Count')

    # Ratios first, then counts; the suffixes keep the joined names unique.
    result = ratios.join(counts, lsuffix='_ratio', rsuffix='_count')
    result = result.join(total_counts, on=col_name)

    # Display headers are assigned positionally from the original class
    # labels. Cutting the suffixed names at the first '_' would truncate any
    # class label that itself contains an underscore.
    result.index.name = 'variable'
    result.columns = [*ratios.columns, *counts.columns, 'Total']

    print(f"\n{col_name}별 {target_name} 비율 및 갯수\n")
    print(result)

In [5]:
import pandas as pd

def summarize_grouped_data(df, group_by_columns, target_name='target', target_value='AbNormal'):
    """
    Summarise, per group, how many rows carry a given target class.

    The summary is printed and also returned.

    Parameters:
    df (pd.DataFrame): Source data.
    group_by_columns (str or list): Column label, or list of column labels,
        to group by. It is passed to ``DataFrame.groupby`` unchanged.
    target_name (str): Column holding the class label (default 'target').
    target_value: Class whose rows are counted (default 'AbNormal').

    Returns:
    pd.DataFrame: One row per group with the columns 'group' (the group key),
    "'AdNormal' count" (rows equal to target_value), 'ratio' (that count
    divided by the group size) and 'Total' (group size).
    """
    results = []
    for name, group in df.groupby(group_by_columns):
        group_count = group.shape[0]

        # Rows of this group whose target equals the requested class.
        matching_count = (group[target_name] == target_value).sum()

        results.append([name, matching_count, matching_count / group_count, group_count])

    # NOTE: the "'AdNormal' count" label (sic) is kept unchanged because
    # callers may select the column by this exact name.
    results_df = pd.DataFrame(results, columns=['group', "'AdNormal' count", 'ratio', 'Total'])

    # A single key given as a plain string must not be split into characters,
    # and non-string column labels must not break the join.
    if isinstance(group_by_columns, (list, tuple)):
        key_labels = group_by_columns
    else:
        key_labels = [group_by_columns]
    print(f"Grouped by: {', '.join(str(label) for label in key_labels)}")
    print()
    print(results_df)

    return results_df

In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_abnormal_ratio(dataframe, column_name, target_name, target_value, bins=20):
    """
    Bin a numeric column and plot, per bin, the share of rows whose target
    equals a given class. Every non-empty bar is annotated with
    "<ratio> (<row count>)".

    The input DataFrame is left untouched: binning is done on a detached
    Series rather than on a temporary column of the caller's frame.

    Parameters:
    dataframe (pd.DataFrame): Source data.
    column_name (str): Numeric column to bin.
    target_name (str): Target column name.
    target_value (str): Target class whose share is plotted.
    bins (int): Number of bins, forwarded to ``pd.cut`` (default 20).
    """
    bin_column = f'{column_name}_bins'

    # Bin outside the frame so an existing '<column>_bins' column is never
    # overwritten and nothing is left behind if plotting fails.
    binned = pd.cut(dataframe[column_name], bins=bins)

    # observed=False keeps empty bins in the result, so row i of `ratios`
    # always corresponds to category i, i.e. to x position i on the axis.
    ratio_per_bin = (dataframe[target_name] == target_value).groupby(binned, observed=False).mean()
    count_per_bin = dataframe[target_name].groupby(binned, observed=False).count()
    ratios = (
        pd.DataFrame({'ratio': ratio_per_bin, 'count': count_per_bin})
        .rename_axis(bin_column)
        .reset_index()
    )

    plt.figure(figsize=(20, 10))
    barplot = sns.barplot(x=bin_column, y='ratio', data=ratios, color='skyblue')
    plt.xlabel(f'{column_name} (binned)')
    plt.ylabel(f'{target_value} Ratio')
    plt.title(f'{target_value} Ratio by {column_name} (binned)', pad=30)  # extra gap between title and axes
    plt.xticks(rotation=45)
    plt.ylim(0, 1)

    # Annotate by row position rather than by reverse-engineering the bin from
    # the bar geometry; the old lookup was shifted by one, so the first bar
    # showed the count of the last bin.
    for position, (ratio, count) in enumerate(zip(ratios['ratio'], ratios['count'])):
        if pd.isna(ratio):
            # Empty bin: there is no bar to label.
            continue
        barplot.annotate(f'{ratio:.2f} ({count})',
                         (position, ratio),
                         ha='center', va='center',
                         xytext=(0, 9),
                         textcoords='offset points')

    plt.show()


---

## Dam

## AutoClave

## Fill1

In [16]:
# List the Fill1-process columns whose name contains 'HEAD NORMAL'.
Process_Desc_col = train_data.filter(like='_Fill1').columns
filtered_columns = [col for col in Process_Desc_col if 'HEAD NORMAL' in col]

# The heading names the process and keyword actually filtered on; the previous
# text still referred to the Dam process and the CURE keyword.
print("\n Fill1 공정 관련 변수 중 HEAD NORMAL 포함 변수>")
for col in filtered_columns:
    print(col)


 Dam 공정 관련 변수 중 CURE 포함 변수>
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1
HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1
HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1
HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1
HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1
HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1
HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1
HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1
HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1


In [17]:
# AbNormal count/ratio for every distinct Stage1 head position (X, Y, Z).
summary_df = summarize_grouped_data(
    train_data,
    [
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
    ],
)

Grouped by: HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1, HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1, HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1

                       group  'AdNormal' count     ratio  Total
0   (156.1, 1323.3, 244.535)                 1  0.015385     65
1     (157.0, 430.3, 244.52)                 7  0.104478     67
2     (458.7, 429.8, 244.45)                 1  0.017857     56
3   (458.7, 1322.8, 244.415)                 4  0.133333     30
4    (681.2, 1332.8, 225.85)                 1  1.000000      1
5   (837.4, 1323.5, 244.275)                26  0.048059    541
6   (837.4, 1323.5, 244.505)                55  0.044319   1241
7     (837.5, 1323.2, 244.6)                 0  0.000000      2
8     (837.7, 1323.2, 244.2)                58  0.120083    483
9     (837.7, 1323.2, 244.3)               113  0.043952   2571
10    (837.7, 1323.2, 244.4)                77  0.087600    879
11    (837.7, 1323.5, 244.3)            

In [18]:
# AbNormal count/ratio for every distinct Stage2 head position (X, Y, Z).
summary_df = summarize_grouped_data(
    train_data,
    [
        'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
    ],
)

Grouped by: HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1, HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1, HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1

                       group  'AdNormal' count     ratio  Total
0     (457.6, 1322.5, 244.3)                22  0.114583    192
1     (457.7, 1322.5, 244.3)                 4  0.041237     97
2   (457.8, 1322.5, 244.275)                26  0.048059    541
3     (457.8, 1322.5, 244.3)                38  0.054913    692
4   (457.8, 1322.5, 244.505)                30  0.068807    436
..                       ...               ...       ...    ...
56    (460.0, 1323.2, 244.6)                 0  0.000000      2
57     (460.5, 430.5, 244.4)                 0  0.000000      3
58    (682.5, 1325.8, 244.2)                 1  1.000000      1
59  (838.1, 1323.5, 244.415)                 4  0.133333     30
60    (838.4, 430.0, 244.45)                 1  0.017857     56

[61 rows x 4 columns]


In [19]:
# AbNormal count/ratio for every distinct Stage3 head position (X, Y, Z).
summary_df = summarize_grouped_data(
    train_data,
    [
        'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',
    ],
)

Grouped by: HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1, HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1, HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1

                       group  'AdNormal' count     ratio  Total
0   (156.1, 1323.1, 244.415)                17  0.036638    464
1   (156.1, 1323.1, 244.505)                 8  0.023460    341
2   (156.1, 1323.3, 244.415)                18  0.073770    244
3   (156.1, 1323.3, 244.535)                60  0.077320    776
4   (156.1, 1324.3, 244.535)                36  0.047936    751
5   (156.1, 1324.7, 244.535)                15  0.065502    229
6   (156.3, 1323.1, 244.275)                26  0.048059    541
7   (156.3, 1323.1, 244.505)                55  0.044319   1241
8    (156.5, 431.1, 244.375)                22  0.043053    511
9    (156.5, 431.1, 244.555)                11  0.065089    169
10   (156.5, 431.1, 244.618)                18  0.030405    592
11   (156.5, 431.1, 244.728)            

In [25]:
import numpy as np

# Column names of the head's X/Y/Z coordinates at each Fill1 stage.
stage1_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
]

stage2_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
]

stage3_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',
]

# Euclidean distance between the head positions of each pair of stages, added
# to train_data in the order 1-2, 2-3, 1-3. The squared per-axis differences
# are summed in X, Y, Z order.
for distance_col, from_cols, to_cols in (
    ('DISTANCE_STAGE1_STAGE2_Fill1', stage1_cols, stage2_cols),
    ('DISTANCE_STAGE2_STAGE3_Fill1', stage2_cols, stage3_cols),
    ('DISTANCE_STAGE1_STAGE3_Fill1', stage1_cols, stage3_cols),
):
    dx, dy, dz = (
        train_data[to_col] - train_data[from_col]
        for from_col, to_col in zip(from_cols, to_cols)
    )
    train_data[distance_col] = np.sqrt(dx ** 2 + dy ** 2 + dz ** 2)

In [26]:
# Print the target ratio/count table for the Stage1-Stage2 distance feature.
value_counts_ratio_count(train_data, 'DISTANCE_STAGE1_STAGE2_Fill1', 'target')
# NOTE(review): the two disabled calls below use '_Dam' column names, while the
# distance columns created in this section end in '_Fill1' — rename before
# re-enabling them.
# value_counts_ratio_count(train_data, 'DISTANCE_STAGE2_STAGE3_Dam', 'target')
# value_counts_ratio_count(train_data, 'DISTANCE_STAGE1_STAGE3_Dam', 'target')


DISTANCE_STAGE1_STAGE2_Fill1별 target 비율 및 갯수

            AbNormal    Normal  AbNormal  Normal  Total
variable                                               
19.682797   1.000000  0.000000       1.0     0.0      1
301.700414  0.104478  0.895522       7.0    60.0     67
302.600413  0.015385  0.984615       1.0    64.0     65
377.400119  0.000000  1.000000       0.0     3.0      3
377.500000  0.000000  1.000000       0.0     2.0      2
377.900119  0.093290  0.906710     406.0  3946.0   4352
378.200119  0.075099  0.924901      19.0   234.0    253
378.400119  0.091803  0.908197      56.0   554.0    610
378.900000  0.081522  0.918478      15.0   169.0    184
378.900053  0.130471  0.869529     155.0  1033.0   1188
378.900119  0.138047  0.861953      82.0   512.0    594
378.900330  0.000000  1.000000       0.0     4.0      4
378.900647  0.046166  0.953834     121.0  2500.0   2621
378.901320  0.044837  0.955163     142.0  3025.0   3167
379.101319  0.053859  0.946141      67.0  1177.0   1244
3

In [27]:
# AbNormal count/ratio per (dispenser number, Stage1-Stage2 distance) pair.
summary_df = summarize_grouped_data(
    train_data,
    ['Dispenser_num', 'DISTANCE_STAGE1_STAGE2_Fill1'],
)

Grouped by: Dispenser_num, DISTANCE_STAGE1_STAGE2_Fill1

                      group  'AdNormal' count     ratio  Total
0   (0, 19.682797057329015)                 1  1.000000      1
1    (0, 378.9006466080521)                 3  1.000000      3
2    (0, 378.9013196070977)                 5  1.000000      5
3    (0, 379.1013189109212)                 2  1.000000      2
4   (0, 379.40064575590804)                 2  1.000000      2
5    (0, 379.4013178680327)                 1  1.000000      1
6    (0, 379.5001185770566)                 2  1.000000      2
7   (0, 379.60011854581916)                 1  1.000000      1
8   (0, 379.70005267315935)                 4  1.000000      4
9                (0, 379.9)                 3  1.000000      3
10   (0, 379.9004738086016)                 1  1.000000      1
11   (0, 380.0001184210342)                 3  1.000000      3
12   (0, 380.1001183898789)                 2  1.000000      2
13  (0, 380.10131544102813)                 2  1.000000      

In [29]:
# Length of the path Stage1 -> Stage2 -> Stage3 (sum of the two leg distances).
train_data['DISTANCE_STAGE(1-2)+STAGE(2-3)_Fill1'] = (
    train_data['DISTANCE_STAGE1_STAGE2_Fill1']
    + train_data['DISTANCE_STAGE2_STAGE3_Fill1']
)

In [30]:
# AbNormal count/ratio per (two-leg path length, direct Stage1-Stage3 distance) pair.
summary_df = summarize_grouped_data(
    train_data,
    [
        'DISTANCE_STAGE(1-2)+STAGE(2-3)_Fill1',
        'DISTANCE_STAGE1_STAGE3_Fill1',
    ],
)

Grouped by: DISTANCE_STAGE(1-2)+STAGE(2-3)_Fill1, DISTANCE_STAGE1_STAGE3_Fill1

                                       group  'AdNormal' count     ratio  \
0   (39.33025236329678, 0.09999999999990905)                 1  1.000000   
1                             (679.5, 679.5)                 0  0.000000   
2     (680.2001190791027, 680.2000661570095)                 1  0.090909   
3     (680.2001355409365, 680.2001176124568)                19  0.075099   
4     (680.2001356189633, 680.2001176124568)                 3  0.111111   
5      (680.400118764827, 680.4000661375629)                 2  0.400000   
6     (680.4001189217572, 680.4000661375629)                56  0.091803   
7     (680.4001190791026, 680.4000661375629)               402  0.093185   
8     (680.4001192368651, 680.4000661375629)                 0  0.000000   
9                             (680.7, 680.7)                15  0.081522   
10    (680.7000527843722, 680.7000293815184)                 8  0.363636   
11    (6

In [31]:
# Direct Stage1-Stage3 distance minus the two-leg path length. By the triangle
# inequality this is never positive, up to floating-point rounding.
train_data['차이'] = (
    train_data['DISTANCE_STAGE1_STAGE3_Fill1']
    - train_data['DISTANCE_STAGE(1-2)+STAGE(2-3)_Fill1']
)

In [32]:
# Print the target ratio/count table for each distinct value of the '차이'
# (difference) column.
value_counts_ratio_count(train_data, '차이', 'target')


차이별 target 비율 및 갯수

             AbNormal    Normal  AbNormal  Normal  Total
variable                                                
-759.399704  0.017857  0.982143       1.0    55.0     56
-758.800262  0.133333  0.866667       4.0    26.0     30
-39.230252   1.000000  0.000000       1.0     0.0      1
-0.001799    0.044837  0.955163     142.0  3025.0   3167
-0.001798    0.097619  0.902381      41.0   379.0    420
-0.001797    0.031553  0.968447      26.0   798.0    824
-0.001797    0.047176  0.952824     147.0  2969.0   3116
-0.001796    0.000000  1.000000       0.0     4.0      4
-0.001796    0.014925  0.985075       1.0    66.0     67
-0.001796    0.000000  1.000000       0.0     1.0      1
-0.001796    0.059200  0.940800      37.0   588.0    625
-0.001796    0.041237  0.958763       4.0    93.0     97
-0.001796    0.114583  0.885417      22.0   170.0    192
-0.001795    0.100000  0.900000       4.0    36.0     40
-0.001795    0.026810  0.973190      20.0   726.0    746
-0.001795 

## Fill2

---