In [2]:
import pandas as pd
from tqdm import tqdm

# 데이터 로드에 tqdm 적용
tqdm.pandas()

# 데이터셋 로드
file_path = 'data/continuous_factory_process.csv'
df = pd.read_csv(file_path)

# "Setpoint" 컬럼 제거
df = df.drop(columns=[col for col in df.columns if 'Setpoint' in col])

# 데이터셋 크기 확인
print(f"데이터 크기: {df.shape}")
print(f"남은 컬럼 수: {len(df.columns)}")

# 첫 5개 행 확인
print(df.head())


데이터 크기: (9861, 86)
남은 컬럼 수: 86
            time_stamp  AmbientConditions.AmbientHumidity.U.Actual  \
0  2019-03-06 10:52:34                                       17.24   
1  2019-03-06 10:52:35                                       17.24   
2  2019-03-06 10:52:37                                       17.24   
3  2019-03-06 10:52:38                                       17.24   
4  2019-03-06 10:52:39                                       17.24   

   AmbientConditions.AmbientTemperature.U.Actual  \
0                                          23.53   
1                                          23.53   
2                                          23.53   
3                                          23.53   
4                                          23.53   

   Machine1.RawMaterial.Property1  Machine1.RawMaterial.Property2  \
0                           11.54                             200   
1                           11.54                             200   
2                           

In [3]:
from tqdm import tqdm

# tqdm 적용
tqdm.pandas()

# 시간 데이터를 시, 분, 초로 분리하고 새로운 feature "minsec" 생성
df['hour'] = df['time_stamp'].progress_apply(lambda x: int(x[11:13]))
df['minute'] = df['time_stamp'].progress_apply(lambda x: int(x[14:16]))
df['second'] = df['time_stamp'].progress_apply(lambda x: int(x[17:19]))
df['minsec'] = df['minute'] * 60 + df['second']

# time_stamp 컬럼 제거
df = df.drop(columns=['time_stamp'])

# 결과 확인
print(df.head())
print(f"시간 관련 feature 생성 완료. 남은 컬럼 수: {len(df.columns)}")


100%|██████████| 9861/9861 [00:00<00:00, 910252.03it/s]
100%|██████████| 9861/9861 [00:00<00:00, 1062966.63it/s]
100%|██████████| 9861/9861 [00:00<00:00, 924804.50it/s]

   AmbientConditions.AmbientHumidity.U.Actual  \
0                                       17.24   
1                                       17.24   
2                                       17.24   
3                                       17.24   
4                                       17.24   

   AmbientConditions.AmbientTemperature.U.Actual  \
0                                          23.53   
1                                          23.53   
2                                          23.53   
3                                          23.53   
4                                          23.53   

   Machine1.RawMaterial.Property1  Machine1.RawMaterial.Property2  \
0                           11.54                             200   
1                           11.54                             200   
2                           11.54                             200   
3                           11.54                             200   
4                           11.54              




In [4]:
# Stage 1 데이터 분리 (Machine 1, 2, 3 + Stage1.Output)
stage1_data = df[df.columns.drop(list(df.filter(regex='Machine4|Machine5|Stage2')))]

# Stage 2 데이터 분리 (Machine 4, 5 + Stage2.Output)
stage2_data = df[df.columns.drop(list(df.filter(regex='Machine1|Machine2|Machine3|Stage1')))]

# 결과 확인
print(f"Stage 1 데이터 크기: {stage1_data.shape}")
print(stage1_data.head())

print(f"Stage 2 데이터 크기: {stage2_data.shape}")
print(stage2_data.head())


Stage 1 데이터 크기: (9861, 60)
   AmbientConditions.AmbientHumidity.U.Actual  \
0                                       17.24   
1                                       17.24   
2                                       17.24   
3                                       17.24   
4                                       17.24   

   AmbientConditions.AmbientTemperature.U.Actual  \
0                                          23.53   
1                                          23.53   
2                                          23.53   
3                                          23.53   
4                                          23.53   

   Machine1.RawMaterial.Property1  Machine1.RawMaterial.Property2  \
0                           11.54                             200   
1                           11.54                             200   
2                           11.54                             200   
3                           11.54                             200   
4                   

In [5]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from tqdm import tqdm

# tqdm 적용
tqdm.pandas()

# Feature와 Target 설정 (Stage 1)
X_stage1 = stage1_data.drop(columns=[col for col in stage1_data.columns if 'Stage1.Output' in col])
y_stage1 = stage1_data[[col for col in stage1_data.columns if 'Stage1.Output' in col]]

# RandomForest 모델을 사용한 Feature Importance 계산
model_rf = RandomForestRegressor(n_estimators=100, random_state=0)
model_rf.fit(X_stage1, y_stage1)

# Feature 중요도 계산
feature_importances = model_rf.feature_importances_
important_features_idx = np.argsort(feature_importances)[::-1][:20]  # 상위 20개 feature 선택
important_features = X_stage1.columns[important_features_idx]

# 상위 20개의 feature로 데이터셋 재구성
X_stage1_selected = X_stage1[important_features]

# 결과 확인
print(f"선택된 상위 20개 feature: {important_features}")
print(X_stage1_selected.head())


선택된 상위 20개 feature: Index(['Machine3.MaterialTemperature.U.Actual', 'Machine3.MotorRPM.C.Actual',
       'minsec', 'Machine1.MaterialTemperature.U.Actual',
       'Machine1.MotorRPM.C.Actual', 'Machine1.MotorAmperage.U.Actual',
       'AmbientConditions.AmbientTemperature.U.Actual',
       'Machine1.Zone2Temperature.C.Actual',
       'Machine3.MaterialPressure.U.Actual', 'Machine3.MotorAmperage.U.Actual',
       'FirstStage.CombinerOperation.Temperature1.U.Actual',
       'Machine1.MaterialPressure.U.Actual', 'second',
       'FirstStage.CombinerOperation.Temperature2.U.Actual',
       'Machine1.RawMaterialFeederParameter.U.Actual',
       'Machine2.RawMaterialFeederParameter.U.Actual',
       'Machine2.MaterialPressure.U.Actual',
       'Machine3.RawMaterialFeederParameter.U.Actual',
       'Machine2.MotorAmperage.U.Actual',
       'Machine2.ExitZoneTemperature.C.Actual'],
      dtype='object')
   Machine3.MaterialTemperature.U.Actual  Machine3.MotorRPM.C.Actual  minsec  \
0          

In [6]:
# Feature와 Target 설정 (Stage 2)
X_stage2 = stage2_data.drop(columns=[col for col in stage2_data.columns if 'Stage2.Output' in col])
y_stage2 = stage2_data[[col for col in stage2_data.columns if 'Stage2.Output' in col]]

# RandomForest 모델을 사용한 Feature Importance 계산
model_rf_stage2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_rf_stage2.fit(X_stage2, y_stage2)

# Feature 중요도 계산
feature_importances_stage2 = model_rf_stage2.feature_importances_
important_features_idx_stage2 = np.argsort(feature_importances_stage2)[::-1][:20]  # 상위 20개 feature 선택
important_features_stage2 = X_stage2.columns[important_features_idx_stage2]

# 상위 20개의 feature로 데이터셋 재구성
X_stage2_selected = X_stage2[important_features_stage2]

# 결과 확인
print(f"선택된 상위 20개 feature (Stage 2): {important_features_stage2}")
print(X_stage2_selected.head())


선택된 상위 20개 feature (Stage 2): Index(['minsec', 'Machine5.Temperature3.C.Actual',
       'AmbientConditions.AmbientHumidity.U.Actual',
       'Machine4.Temperature3.C.Actual', 'Machine5.ExitTemperature.U.Actual',
       'second', 'Machine5.Temperature6.C.Actual',
       'FirstStage.CombinerOperation.Temperature2.U.Actual',
       'FirstStage.CombinerOperation.Temperature1.U.Actual',
       'Machine5.Temperature4.C.Actual', 'Machine4.ExitTemperature.U.Actual',
       'hour', 'Machine4.Temperature5.C.Actual',
       'FirstStage.CombinerOperation.Temperature3.C.Actual',
       'Machine4.Temperature2.C.Actual', 'Machine4.Temperature4.C.Actual',
       'Machine4.Pressure.C.Actual', 'minute',
       'Machine4.Temperature1.C.Actual', 'Machine5.Temperature1.C.Actual'],
      dtype='object')
   minsec  Machine5.Temperature3.C.Actual  \
0    3154                           263.9   
1    3155                           263.9   
2    3157                           263.9   
3    3158                  

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from tqdm import tqdm

# Stage 1 데이터 분리
X_train_stage1, X_test_stage1, y_train_stage1, y_test_stage1 = train_test_split(X_stage1_selected, y_stage1, test_size=0.2, random_state=42)

# RandomForestRegressor 모델 학습 (Stage 1)
model_rf_stage1 = RandomForestRegressor(n_estimators=500, random_state=0, n_jobs=-1)

# tqdm으로 학습 진행 상황 확인
for _ in tqdm(range(1)):
    model_rf_stage1.fit(X_train_stage1, y_train_stage1)

# Stage 1 예측
y_pred_stage1 = model_rf_stage1.predict(X_test_stage1)

# R2 Score 계산 (Stage 1)
stage1_r2 = r2_score(y_test_stage1, y_pred_stage1)
print(f"Stage 1 R2 Score: {stage1_r2}")


100%|██████████| 1/1 [00:09<00:00,  9.72s/it]


Stage 1 R2 Score: 0.6873003456106048


In [8]:
from sklearn.ensemble import ExtraTreesRegressor

# Stage 2 데이터 분리
X_train_stage2, X_test_stage2, y_train_stage2, y_test_stage2 = train_test_split(X_stage2_selected, y_stage2, test_size=0.2, random_state=42)

# ExtraTreesRegressor 모델 학습 (Stage 2)
model_et_stage2 = ExtraTreesRegressor(n_estimators=500, random_state=0, n_jobs=-1)

# tqdm으로 학습 진행 상황 확인
for _ in tqdm(range(1)):
    model_et_stage2.fit(X_train_stage2, y_train_stage2)

# Stage 2 예측
y_pred_stage2 = model_et_stage2.predict(X_test_stage2)

# R2 Score 계산 (Stage 2)
stage2_r2 = r2_score(y_test_stage2, y_pred_stage2)
print(f"Stage 2 R2 Score: {stage2_r2}")


100%|██████████| 1/1 [00:02<00:00,  2.89s/it]


Stage 2 R2 Score: 0.743666013807614


In [9]:
# 두 Stage의 R2 Score 평균 계산
final_score = (stage1_r2 + stage2_r2) / 2
print(f"최종 평가 점수 (R2 평균): {final_score}")


최종 평가 점수 (R2 평균): 0.7154831797091095


In [15]:
# submission_data.csv 파일 로드
submission_file_path = 'data/submission_data.csv'
submission_df = pd.read_csv(submission_file_path)

# 시간 데이터 처리 (시, 분, 초, minsec 생성)
submission_df['hour'] = submission_df['time_stamp'].str.slice(start=11, stop=13).astype(int)
submission_df['minute'] = submission_df['time_stamp'].str.slice(start=14, stop=16).astype(int)
submission_df['second'] = submission_df['time_stamp'].str.slice(start=17, stop=19).astype(int)
submission_df['minsec'] = submission_df['minute'] * 60 + submission_df['second']

# "Setpoint" 컬럼 제거
submission_df = submission_df.drop(columns=[col for col in submission_df.columns if 'Setpoint' in col])

# Stage 1 데이터 분리 (Machine 1, 2, 3 + Stage1.Output 관련 feature들)
submission_stage1 = submission_df[submission_df.columns.drop(list(submission_df.filter(regex='Machine4|Machine5|Stage2')))]

# Stage 2 데이터 분리 (Machine 4, 5 + Stage2.Output 관련 feature들)
submission_stage2 = submission_df[submission_df.columns.drop(list(submission_df.filter(regex='Machine1|Machine2|Machine3|Stage1')))]

# Stage 1 예측값 생성
submission_stage1_predictions = model_rf_stage1.predict(submission_stage1[important_features])

# Stage 2 예측값 생성
submission_stage2_predictions = model_et_stage2.predict(submission_stage2[important_features_stage2])

# 예측값 저장
np.save("submission1.npy", submission_stage1_predictions)
np.save("submission2.npy", submission_stage2_predictions)

print("Submission files 'submission1.npy' and 'submission2.npy' have been created.")


Submission files 'submission1.npy' and 'submission2.npy' have been created.
