# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [336]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf

### 데이터 읽어오기


In [337]:
# Seed shared by every split and RNG in this notebook.
RANDOM_STATE = 110

# Load the train and test CSVs (train first, then test).
train_data, test_data = map(
    pd.read_csv,
    ("../../data/train_data_0817.csv", "../../data/test_data_0817.csv"),
)

In [338]:
# Print the full per-column summary (non-null counts and dtypes) of the training frame.
train_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   CURE SPEED Collect Result_Dam                   40506 non-null  int64  
 1   DISCHARGED SPEED OF RESIN Collect Result_Dam    40506 non-null  int64  
 2   Head Clean Position Z Collect Result_Dam        40506 non-null  float64
 3   Head Purge Position Z Collect Result_Dam        40506 non-null  float64
 4   Head Zero Position Y Collect Result_Dam         40506 non-null  float64
 5   Stage2_Circle_Distance_Speed_Dam                40506 non-null  int64  
 6   WorkMode Collect Result                         40506 non-null  float64
 7   Chamber Temp. Collect Result_AutoClave          40506 non-null  int64  
 8   DISCHARGED SPEED OF RESIN Collect Result_Fill1  40506 non-null  float64
 9   Head Purge Position Z Collect Result_Fi

In [339]:
# Print the column summary of the test frame.
# NOTE(review): the recorded output lists Stage2_Circle_Distance_Speed_Dam as
# float64 here but int64 in the training frame — confirm this is intended.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Data columns (total 39 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Set ID                                          17361 non-null  object 
 1   CURE SPEED Collect Result_Dam                   17361 non-null  int64  
 2   DISCHARGED SPEED OF RESIN Collect Result_Dam    17361 non-null  int64  
 3   Head Clean Position Z Collect Result_Dam        17361 non-null  float64
 4   Head Purge Position Z Collect Result_Dam        17361 non-null  float64
 5   Head Zero Position Y Collect Result_Dam         17361 non-null  float64
 6   Stage2_Circle_Distance_Speed_Dam                17361 non-null  float64
 7   WorkMode Collect Result                         17361 non-null  float64
 8   Chamber Temp. Collect Result_AutoClave          17361 non-null  int64  
 9   DISCHARGED SPEED OF RESIN Collect Resul

In [340]:
# Columns shared by the Dam, Fill1 and Fill2 feature sets.
var_dam_fill = [
    'Equipment_same_num',
    'PalletID_Collect_Result_encoded',
    'Production_Qty_Collect_Result',
    'WorkMode Collect Result'
]

In [341]:
# Columns shared by every process stage.
# Base list, used when checking correlations.
var_all_corr = ['model_receip_encoded', 'workorder_receip_encoded']

# Training frames: the label followed by the shared columns.
var_all_train = ['target', *var_all_corr]

# Test frames: the row identifier, the label, then the shared columns.
var_all_test = ['Set ID', *var_all_train]

In [342]:
# Collect every training column whose name contains '_Dam'.
dam_variables = list(filter(lambda name: '_Dam' in name, train_data.columns))

# Training subset: shared stage columns, label/shared columns, then Dam columns.
final_columns_train = [*var_dam_fill, *var_all_train, *dam_variables]
train_data_dam = train_data[final_columns_train]

# Test subset: same layout, with the test-side shared columns.
final_columns_test = [*var_dam_fill, *var_all_test, *dam_variables]
test_data_dam = test_data[final_columns_test]

In [343]:
# Collect every training column whose name contains '_Fill1'.
fill1_variables = list(filter(lambda name: '_Fill1' in name, train_data.columns))

# Training subset: shared stage columns, label/shared columns, then Fill1 columns.
final_columns_train = [*var_dam_fill, *var_all_train, *fill1_variables]
train_data_fill1 = train_data[final_columns_train]

# Test subset: same layout, with the test-side shared columns.
final_columns_test = [*var_dam_fill, *var_all_test, *fill1_variables]
test_data_fill1 = test_data[final_columns_test]

In [344]:
# Collect every training column whose name contains '_Fill2'.
fill2_variables = list(filter(lambda name: '_Fill2' in name, train_data.columns))

# Training subset: shared stage columns, label/shared columns, then Fill2 columns.
final_columns_train = [*var_dam_fill, *var_all_train, *fill2_variables]
train_data_fill2 = train_data[final_columns_train]

# Test subset: same layout, with the test-side shared columns.
final_columns_test = [*var_dam_fill, *var_all_test, *fill2_variables]
test_data_fill2 = test_data[final_columns_test]

In [345]:
# Collect every training column whose name contains '_AutoClave'.
autoclave_variables = list(
    filter(lambda name: '_AutoClave' in name, train_data.columns)
)

# Training subset: label/shared columns followed by the AutoClave columns
# (the Dam/Fill shared columns are not included for this stage).
final_columns_train = [*var_all_train, *autoclave_variables]
train_data_autoclave = train_data[final_columns_train]

# Test subset: same layout, with the test-side shared columns.
final_columns_test = [*var_all_test, *autoclave_variables]
test_data_autoclave = test_data[final_columns_test]

---

## 3. 모델 학습

In [346]:
# Fix the NumPy and TensorFlow random seeds to RANDOM_STATE.
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

---

In [347]:
# Remove the 'target' column from each per-stage test frame.
test_data_dam, test_data_fill1, test_data_fill2, test_data_autoclave = (
    frame.drop(columns=['target'])
    for frame in (
        test_data_dam,
        test_data_fill1,
        test_data_fill2,
        test_data_autoclave,
    )
)

In [348]:
# Separate the Dam label column from its feature columns.
Y_dam = train_data_dam['target']
X_dam = train_data_dam.drop(labels='target', axis='columns')

In [349]:
# Inspect the Dam feature frame (column count, non-null counts, dtypes).
X_dam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 20 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Equipment_same_num                            40506 non-null  int64  
 1   PalletID_Collect_Result_encoded               40506 non-null  float64
 2   Production_Qty_Collect_Result                 40506 non-null  int64  
 3   WorkMode Collect Result                       40506 non-null  float64
 4   model_receip_encoded                          40506 non-null  float64
 5   workorder_receip_encoded                      40506 non-null  float64
 6   CURE SPEED Collect Result_Dam                 40506 non-null  int64  
 7   DISCHARGED SPEED OF RESIN Collect Result_Dam  40506 non-null  int64  
 8   Head Clean Position Z Collect Result_Dam      40506 non-null  float64
 9   Head Purge Position Z Collect Result_Dam      40506 non-null 

In [350]:
# Separate the AutoClave label column from its feature columns.
Y_autoclave = train_data_autoclave['target']
X_autoclave = train_data_autoclave.drop(labels='target', axis='columns')

In [351]:
# Inspect the AutoClave feature frame (column count, non-null counts, dtypes).
X_autoclave.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 7 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   model_receip_encoded                    40506 non-null  float64
 1   workorder_receip_encoded                40506 non-null  float64
 2   Chamber Temp. Collect Result_AutoClave  40506 non-null  int64  
 3   1st_pressure_time_AutoClave             40506 non-null  float64
 4   2nd_pressure_time_AutoClave             40506 non-null  float64
 5   3rd_pressure_time_AutoClave             40506 non-null  float64
 6   time_ratio_AutoClave                    40506 non-null  float64
dtypes: float64(6), int64(1)
memory usage: 2.2 MB


In [352]:
# Separate the Fill1 label column from its feature columns.
Y_fill1 = train_data_fill1['target']
X_fill1 = train_data_fill1.drop(labels='target', axis='columns')

In [353]:
# Inspect the Fill1 feature frame (column count, non-null counts, dtypes).
X_fill1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 12 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Equipment_same_num                              40506 non-null  int64  
 1   PalletID_Collect_Result_encoded                 40506 non-null  float64
 2   Production_Qty_Collect_Result                   40506 non-null  int64  
 3   WorkMode Collect Result                         40506 non-null  float64
 4   model_receip_encoded                            40506 non-null  float64
 5   workorder_receip_encoded                        40506 non-null  float64
 6   DISCHARGED SPEED OF RESIN Collect Result_Fill1  40506 non-null  float64
 7   Head Purge Position Z Collect Result_Fill1      40506 non-null  int64  
 8   HEAD NORMAL DISTANCE_STAGE1_STAGE3_Fill1        40506 non-null  float64
 9   HEAD NORMAL DISTANCE_TRIANGLE_height_Fi

In [354]:
# Separate the Fill2 label column from its feature columns.
Y_fill2 = train_data_fill2['target']
X_fill2 = train_data_fill2.drop(labels='target', axis='columns')

In [355]:
# Inspect the Fill2 feature frame (column count, non-null counts, dtypes).
X_fill2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 12 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Equipment_same_num                          40506 non-null  int64  
 1   PalletID_Collect_Result_encoded             40506 non-null  float64
 2   Production_Qty_Collect_Result               40506 non-null  int64  
 3   WorkMode Collect Result                     40506 non-null  float64
 4   model_receip_encoded                        40506 non-null  float64
 5   workorder_receip_encoded                    40506 non-null  float64
 6   CURE SPEED Collect Result_Fill2             40506 non-null  int64  
 7   Head Purge Position Z Collect Result_Fill2  40506 non-null  float64
 8   HEAD NORMAL DISTANCE_STAGE1_STAGE2_Fill2    40506 non-null  float64
 9   time_ratio_Fill2                            40506 non-null  float64
 10  cure_end_p

In [356]:
import os
import random

import keras
import numpy as np
import tensorflow as tf
from sklearn.metrics import f1_score
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

# Seed every random number generator used in this notebook with RANDOM_STATE.
# `os` is imported above because the PYTHONHASHSEED assignment below would
# otherwise raise NameError.
tf.random.set_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)
# NOTE: assigning PYTHONHASHSEED at runtime only affects interpreters started
# after this point; hash randomisation of the running kernel is fixed at
# startup.
os.environ['PYTHONHASHSEED'] = str(RANDOM_STATE)

def f1_score_metric(y_true, y_pred):
    """Return the F1 score of predicted versus true class indices.

    Both arguments are reduced to class indices by an argmax over their last
    axis, so each is expected to hold one score per class. The indices are
    handed to scikit-learn's ``f1_score`` through ``tf.py_function`` and the
    result is returned as a ``tf.double`` tensor.
    """
    true_labels = K.argmax(y_true, axis=-1)
    predicted_labels = K.argmax(y_pred, axis=-1)

    # f1_score is plain Python, so it is wrapped to be callable on tensors.
    return tf.py_function(f1_score, (true_labels, predicted_labels), tf.double)

# Build the Dam sub-model: three ReLU hidden layers and a 2-unit sigmoid head.
# The input width is read from X_dam instead of being hard-coded, so the model
# stays in sync with the column selection above.
model_dam = Sequential([
    Dense(35, input_dim=X_dam.shape[1], activation='relu'),
    Dense(15, activation='relu'),
    Dense(8, activation='relu'),
    Dense(2, activation='sigmoid'),
])

# Accuracy is the only metric tracked while fitting.
# NOTE(review): the head is two independent sigmoid units trained with
# binary cross-entropy; confirm this is preferred over a softmax head.
model_dam.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_dam.summary()

Model: "sequential_40"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_151 (Dense)           (None, 35)                735       
                                                                 
 dense_152 (Dense)           (None, 15)                540       
                                                                 
 dense_153 (Dense)           (None, 8)                 128       
                                                                 
 dense_154 (Dense)           (None, 2)                 18        
                                                                 
Total params: 1,421
Trainable params: 1,421
Non-trainable params: 0
_________________________________________________________________


In [357]:
# Build the AutoClave sub-model: three ReLU hidden layers and a 2-unit sigmoid
# head. The input width is read from X_autoclave instead of being hard-coded,
# so the model stays in sync with the column selection above.
model_autoclave = Sequential([
    Dense(20, input_dim=X_autoclave.shape[1], activation='relu'),
    Dense(11, activation='relu'),
    Dense(6, activation='relu'),
    Dense(2, activation='sigmoid'),
])

# Accuracy is the only metric tracked while fitting.
model_autoclave.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_autoclave.summary()

Model: "sequential_41"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_155 (Dense)           (None, 20)                160       
                                                                 
 dense_156 (Dense)           (None, 11)                231       
                                                                 
 dense_157 (Dense)           (None, 6)                 72        
                                                                 
 dense_158 (Dense)           (None, 2)                 14        
                                                                 
Total params: 477
Trainable params: 477
Non-trainable params: 0
_________________________________________________________________


In [358]:
# Build the Fill1 sub-model: three ReLU hidden layers and a 2-unit sigmoid
# head. The input width is read from X_fill1 instead of being hard-coded, so
# the model stays in sync with the column selection above.
model_fill1 = Sequential([
    Dense(30, input_dim=X_fill1.shape[1], activation='relu'),
    Dense(15, activation='relu'),
    Dense(8, activation='relu'),
    Dense(2, activation='sigmoid'),
])

# Accuracy is the only metric tracked while fitting.
model_fill1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_fill1.summary()

Model: "sequential_42"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_159 (Dense)           (None, 30)                390       
                                                                 
 dense_160 (Dense)           (None, 15)                465       
                                                                 
 dense_161 (Dense)           (None, 8)                 128       
                                                                 
 dense_162 (Dense)           (None, 2)                 18        
                                                                 
Total params: 1,001
Trainable params: 1,001
Non-trainable params: 0
_________________________________________________________________


In [359]:
# Build the Fill2 sub-model: three ReLU hidden layers and a 2-unit sigmoid
# head. The input width is read from X_fill2 instead of being hard-coded, so
# the model stays in sync with the column selection above.
model_fill2 = Sequential([
    Dense(28, input_dim=X_fill2.shape[1], activation='relu'),
    Dense(12, activation='relu'),
    Dense(7, activation='relu'),
    Dense(2, activation='sigmoid'),
])

# Accuracy is the only metric tracked while fitting.
model_fill2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_fill2.summary()

Model: "sequential_43"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_163 (Dense)           (None, 28)                364       
                                                                 
 dense_164 (Dense)           (None, 12)                348       
                                                                 
 dense_165 (Dense)           (None, 7)                 91        
                                                                 
 dense_166 (Dense)           (None, 2)                 16        
                                                                 
Total params: 819
Trainable params: 819
Non-trainable params: 0
_________________________________________________________________


In [360]:
# One-hot encode each target Series (one column per class value).
# `columns=` only applies to DataFrame input, so it is dropped here, and
# integer dummies are requested directly instead of booleans.
Y_dam = pd.get_dummies(Y_dam, dtype=int)
Y_autoclave = pd.get_dummies(Y_autoclave, dtype=int)
Y_fill1 = pd.get_dummies(Y_fill1, dtype=int)
Y_fill2 = pd.get_dummies(Y_fill2, dtype=int)

In [361]:
# Make sure the one-hot targets are integer 0/1 columns.
# An explicit cast replaces `replace({True: 1, False: 0})`, which depends on
# pandas implicitly downcasting the replaced values.
Y_dam = Y_dam.astype(int)
Y_autoclave = Y_autoclave.astype(int)
Y_fill1 = Y_fill1.astype(int)
Y_fill2 = Y_fill2.astype(int)

In [362]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the Dam rows for validation, shuffled and stratified on the
# one-hot target.
x_train_dam, x_valid_dam, y_train_dam, y_valid_dam = train_test_split(
    X_dam,
    Y_dam,
    test_size=0.2,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=Y_dam,
)

In [363]:
from sklearn.metrics import f1_score

# Fit the Dam model for 10 epochs, validating on the held-out split.
history1 = model_dam.fit(
    x_train_dam,
    y_train_dam,
    validation_data=(x_valid_dam, y_valid_dam),
    epochs=10,
)

# Score the validation rows and take the highest-scoring class per row.
y_pred_dam = model_dam.predict(x_valid_dam)
y_pred_dam_classes = np.argmax(y_pred_dam, axis=1)

# Recover class indices from the one-hot validation labels.
y_valid_dam_classes = np.argmax(y_valid_dam.to_numpy(), axis=1)

# Weighted-average F1 over both classes.
# NOTE(review): the recorded outputs of all four sub-models show the same
# weighted F1 value; check per-class metrics to rule out majority-class-only
# predictions.
f1 = f1_score(y_valid_dam_classes, y_pred_dam_classes, average='weighted')
print(f"F1 Score: {f1}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
F1 Score: 0.9138508799886738


In [364]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Hold out 20% of the AutoClave rows for validation, shuffled and stratified
# on the one-hot target.
x_train_autoclave, x_valid_autoclave, y_train_autoclave, y_valid_autoclave = train_test_split(
    X_autoclave,
    Y_autoclave,
    test_size=0.2,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=Y_autoclave,
)

# Fit the AutoClave model for 10 epochs, validating on the held-out split.
history2 = model_autoclave.fit(
    x_train_autoclave,
    y_train_autoclave,
    validation_data=(x_valid_autoclave, y_valid_autoclave),
    epochs=10,
)

# Score the validation rows and take the highest-scoring class per row.
y_pred_autoclave = model_autoclave.predict(x_valid_autoclave)
y_pred_autoclave_classes = np.argmax(y_pred_autoclave, axis=1)

# Recover class indices from the one-hot validation labels.
y_valid_autoclave_classes = np.argmax(y_valid_autoclave.to_numpy(), axis=1)

# Weighted-average F1 over both classes.
f1 = f1_score(y_valid_autoclave_classes, y_pred_autoclave_classes, average='weighted')
print(f"F1 Score: {f1}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
F1 Score: 0.9138508799886738


In [365]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Hold out 20% of the Fill1 rows for validation, shuffled and stratified on
# the one-hot target. The split uses RANDOM_STATE (not a separate literal
# seed) so it is seeded the same way as the Dam and AutoClave splits.
x_train_fill1, x_valid_fill1, y_train_fill1, y_valid_fill1 = train_test_split(
    X_fill1, Y_fill1, test_size=0.2, shuffle=True, stratify=Y_fill1, random_state=RANDOM_STATE
)

# Fit the Fill1 model for 10 epochs, validating on the held-out split.
history3 = model_fill1.fit(
    x_train_fill1, y_train_fill1, epochs=10, validation_data=(x_valid_fill1, y_valid_fill1)
)

# Score the validation rows and take the highest-scoring class per row.
y_pred_fill1 = model_fill1.predict(x_valid_fill1)
y_pred_fill1_classes = y_pred_fill1.argmax(axis=1)

# Recover class indices from the one-hot validation labels.
y_valid_fill1_classes = y_valid_fill1.to_numpy().argmax(axis=1)

# Weighted-average F1 over both classes.
f1 = f1_score(y_valid_fill1_classes, y_pred_fill1_classes, average='weighted')
print(f"F1 Score: {f1}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
F1 Score: 0.9138508799886738


In [366]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Hold out 20% of the Fill2 rows for validation, shuffled and stratified on
# the one-hot target. The split uses RANDOM_STATE (not a separate literal
# seed) so it is seeded the same way as the Dam and AutoClave splits.
x_train_fill2, x_valid_fill2, y_train_fill2, y_valid_fill2 = train_test_split(
    X_fill2, Y_fill2, test_size=0.2, shuffle=True, stratify=Y_fill2, random_state=RANDOM_STATE
)

# Fit the Fill2 model for 10 epochs, validating on the held-out split.
history4 = model_fill2.fit(
    x_train_fill2, y_train_fill2, epochs=10, validation_data=(x_valid_fill2, y_valid_fill2)
)

# Score the validation rows and take the highest-scoring class per row.
y_pred_fill2 = model_fill2.predict(x_valid_fill2)
y_pred_fill2_classes = y_pred_fill2.argmax(axis=1)

# Recover class indices from the one-hot validation labels.
y_valid_fill2_classes = y_valid_fill2.to_numpy().argmax(axis=1)

# Weighted-average F1 over both classes.
f1 = f1_score(y_valid_fill2_classes, y_pred_fill2_classes, average='weighted')
print(f"F1 Score: {f1}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
F1 Score: 0.9138508799886738


In [367]:
# Turn each trained sub-model into a frozen feature extractor by removing its
# output layer and disabling training.
models = [model_dam, model_autoclave, model_fill1, model_fill2]
for model in models:
    # A model already frozen by an earlier run of this cell is skipped;
    # otherwise re-running the cell would strip one more layer each time.
    if not model.trainable:
        continue
    model.pop()  # remove the last (output) layer
    model.trainable = False

In [368]:
# Show the Dam model's layers and trainable / non-trainable parameter counts.
model_dam.summary()

Model: "sequential_40"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_151 (Dense)           (None, 35)                735       
                                                                 
 dense_152 (Dense)           (None, 15)                540       
                                                                 
 dense_153 (Dense)           (None, 8)                 128       
                                                                 
Total params: 1,403
Trainable params: 0
Non-trainable params: 1,403
_________________________________________________________________


In [216]:
# NOTE(review): disabled draft kept for reference. Adding the sub-models to one
# Sequential would feed model_dam's 8-unit output into model_autoclave, which
# was built for 7 input features, so the sub-models cannot be chained like this.
# from tensorflow.keras import layers, models

# combined_model = Sequential()

# # Feed each model's output into the next model as its input
# combined_model.add(model_dam)
# combined_model.add(model_autoclave)
# combined_model.add(model_fill1)
# combined_model.add(model_fill2)

# # Additional hidden layers
# combined_model.add(layers.Dense(40, activation='relu', input_dim=31))
# combined_model.add(layers.Dense(20, activation='relu'))
# combined_model.add(layers.Dense(10, activation='relu'))

# # Output layer
# combined_model.add(layers.Dense(2, activation='sigmoid'))

# # Compile the model (loss, optimizer, etc.)
# combined_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Print the model summary
# combined_model.summary()

In [375]:
# Display the Dam feature frame.
X_dam

Unnamed: 0,Equipment_same_num,PalletID_Collect_Result_encoded,Production_Qty_Collect_Result,WorkMode Collect Result,model_receip_encoded,workorder_receip_encoded,CURE SPEED Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,Head Clean Position Z Collect Result_Dam,Head Purge Position Z Collect Result_Dam,Head Zero Position Y Collect Result_Dam,Stage2_Circle_Distance_Speed_Dam,CURE_DISTANCE_Dam,HEAD NORMAL DISTANCE_STAGE1_STAGE3_Dam,HEAD NORMAL DISTANCE_TRIANGLE_height_Dam,volume_time_multip_avg_Dam,average_thickness_Dam,time_ratio_Dam,stage1_line13_distance_speed_Dam_encoded,stage2_line24_distance_speed_Dam_encoded
0,1,0.052923,127,0.0,0.048602,0.157921,100,16,124.00,130.85,300.0,5300,790.607994,389.100000,0.900000,11.787333,0.000000,0.118,0.052492,0.050803
1,1,0.052923,185,1.0,0.048602,0.015245,70,10,130.85,130.85,300.0,9000,790.607994,389.500000,0.000000,21.713333,0.000000,0.104,0.041709,0.072739
2,1,0.051118,73,0.0,0.056583,0.009754,85,16,124.00,130.85,300.0,5300,720.306185,389.300000,112.100000,18.413000,-0.002333,0.148,0.032417,0.050803
3,1,0.042398,268,1.0,0.056583,0.057147,70,10,130.85,130.85,300.0,5000,720.306185,389.600000,1.100000,22.782000,0.000000,0.112,0.041709,0.042197
4,1,0.057412,121,1.0,0.056583,0.123969,70,10,133.50,133.50,300.0,9000,790.607994,389.500000,0.500000,7.489667,0.000000,0.130,0.084132,0.072739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,1,0.051495,318,1.0,0.056583,0.018213,70,10,130.85,130.85,300.0,9000,790.607994,389.500000,0.000000,15.770000,0.000000,0.093,0.062232,0.072739
40502,1,0.048487,197,0.0,0.056583,0.015710,100,16,124.00,130.85,300.0,5300,720.306185,389.600051,112.344595,14.138000,-0.052667,0.151,0.052492,0.050803
40503,1,0.051495,27,0.0,0.056583,0.029194,100,16,124.00,130.85,300.0,5500,790.607994,389.100206,0.589128,11.010667,0.000000,0.137,0.048439,0.037518
40504,1,0.068764,117,1.0,0.056583,0.034803,70,10,130.85,130.85,300.0,9000,720.306185,388.900000,0.600000,7.539333,0.000000,0.093,0.084132,0.072739


In [371]:
# Run the Dam model over every training row.
# NOTE(review): once the output layer has been popped, this returns the last
# hidden layer's activations rather than class scores; the recorded 2-value
# output suggests it was captured with the output layer still attached —
# confirm the intended execution order.
y_prob_dam = model_dam.predict(X_dam)



In [374]:
# Print the model output for the row at index 1 (the second row).
print(y_prob_dam[1])

[0.05839222 0.9415987 ]
