In [1]:
import pandas as pd

# 데이터셋 로드
df = pd.read_csv('./data/continuous_factory_process.csv')

# 데이터셋 크기 확인
print(f"데이터셋 크기: {df.shape}")

# 컬럼 목록 출력
print("컬럼 목록:")
print(df.columns)

# 데이터의 기본 통계
print("데이터 기본 통계:")
print(df.describe())


데이터셋 크기: (9861, 116)
컬럼 목록:
Index(['time_stamp', 'AmbientConditions.AmbientHumidity.U.Actual',
       'AmbientConditions.AmbientTemperature.U.Actual',
       'Machine1.RawMaterial.Property1', 'Machine1.RawMaterial.Property2',
       'Machine1.RawMaterial.Property3', 'Machine1.RawMaterial.Property4',
       'Machine1.RawMaterialFeederParameter.U.Actual',
       'Machine1.Zone1Temperature.C.Actual',
       'Machine1.Zone2Temperature.C.Actual',
       ...
       'Stage2.Output.Measurement10.U.Actual',
       'Stage2.Output.Measurement10.U.Setpoint',
       'Stage2.Output.Measurement11.U.Actual',
       'Stage2.Output.Measurement11.U.Setpoint',
       'Stage2.Output.Measurement12.U.Actual',
       'Stage2.Output.Measurement12.U.Setpoint',
       'Stage2.Output.Measurement13.U.Actual',
       'Stage2.Output.Measurement13.U.Setpoint',
       'Stage2.Output.Measurement14.U.Actual',
       'Stage2.Output.Measurement14.U.Setpoint'],
      dtype='object', length=116)
데이터 기본 통계:
       AmbientCon

In [3]:
# 'Setpoint'가 포함된 컬럼 제거
df_cleaned = df.loc[:, ~df.columns.str.contains('Setpoint')]

# 전처리 후 데이터셋의 크기 확인
df_cleaned.shape


(9861, 86)

In [5]:
import pandas as pd

# Stage1의 출력값 분리
y_stage1 = df_cleaned[[col for col in df_cleaned.columns if 'Stage1.Output' in col]]

# Stage1의 입력값 분리
X_stage1 = df_cleaned.drop(columns=[col for col in df_cleaned.columns if 'Stage1.Output' in col])

# 상관계수 계산 (X_stage1의 각 컬럼과 Stage1 출력값 간의 상관계수)
correlation = X_stage1.corrwith(y_stage1.squeeze())

# 상관계수 절대값이 높은 순으로 정렬
correlation_sorted = correlation.abs().sort_values(ascending=False)

# 상관계수가 높은 상위 20개의 특성 선택
selected_columns = correlation_sorted.head(20).index

# 선택된 특성들을 추출한 새로운 데이터프레임 생성
X_stage1_selected_df = X_stage1[selected_columns]

# 출력값을 데이터프레임으로 병합
X_stage1_selected_df = pd.concat([X_stage1_selected_df, y_stage1], axis=1)

# 선택된 컬럼 목록과 데이터프레임의 크기 확인
print("선택된 컬럼 목록:")
print(selected_columns)

print("새로운 데이터셋 크기:")
print(X_stage1_selected_df.shape)

# 데이터 미리보기
print(X_stage1_selected_df.head())


선택된 컬럼 목록:
Index(['AmbientConditions.AmbientHumidity.U.Actual',
       'AmbientConditions.AmbientTemperature.U.Actual',
       'FirstStage.CombinerOperation.Temperature1.U.Actual',
       'FirstStage.CombinerOperation.Temperature2.U.Actual',
       'FirstStage.CombinerOperation.Temperature3.C.Actual',
       'Machine1.ExitZoneTemperature.C.Actual',
       'Machine1.MaterialPressure.U.Actual',
       'Machine1.MaterialTemperature.U.Actual',
       'Machine1.MotorAmperage.U.Actual', 'Machine1.MotorRPM.C.Actual',
       'Machine1.RawMaterial.Property1', 'Machine1.RawMaterial.Property2',
       'Machine1.RawMaterial.Property3', 'Machine1.RawMaterial.Property4',
       'Machine1.RawMaterialFeederParameter.U.Actual',
       'Machine1.Zone1Temperature.C.Actual',
       'Machine1.Zone2Temperature.C.Actual',
       'Machine2.ExitZoneTemperature.C.Actual',
       'Machine2.MaterialPressure.U.Actual',
       'Machine2.MaterialTemperature.U.Actual'],
      dtype='object')
새로운 데이터셋 크기:
(9861, 35)
 

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# 입력 데이터와 출력 데이터 분리
X = X_stage1_selected_df.drop(columns=[col for col in X_stage1_selected_df.columns if 'Stage1.Output' in col])
y = X_stage1_selected_df[[col for col in X_stage1_selected_df.columns if 'Stage1.Output' in col]]

# 훈련 세트와 검증 세트로 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# RandomForestRegressor 모델 초기화 및 훈련
model_stage1 = RandomForestRegressor(n_estimators=100, random_state=42)
model_stage1.fit(X_train, y_train)

# 검증 세트로 예측
y_pred = model_stage1.predict(X_val)

# R2 Score 계산
r2_stage1 = r2_score(y_val, y_pred)
print(f"Stage1 모델의 R2 Score: {r2_stage1}")


Stage1 모델의 R2 Score: 0.6803670279116282


In [8]:
# n_jobs=-1 을 n_jobs=1로 변경하여 병렬 처리 비활성화
rf_random = RandomizedSearchCV(estimator=model_stage1, 
                               param_distributions=param_distributions, 
                               n_iter=50, 
                               cv=3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs=1)  # 병렬 처리 비활성화

# 모델 훈련
rf_random.fit(X_train, y_train)

# 최적의 파라미터 출력
print(f"최적의 파라미터: {rf_random.best_params_}")

# 최적의 모델로 예측
y_pred_optimized = rf_random.best_estimator_.predict(X_val)

# R2 Score 계산
r2_stage1_optimized = r2_score(y_val, y_pred_optimized)
print(f"튜닝된 Stage1 모델의 R2 Score: {r2_stage1_optimized}")


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators

81 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
81 fits failed with the following error:
Traceback (most recent call last):
  File "/home/sdf1ai810/miniconda3/envs/module01/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/sdf1ai810/miniconda3/envs/module01/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/home/sdf1ai810/miniconda3/envs/module01/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/sdf1ai810/miniconda3/envs/module01/lib/python3.12/site-packag

최적의 파라미터: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
튜닝된 Stage1 모델의 R2 Score: 0.68363033895734


In [9]:
# Stage2 출력값 (y_stage2) 분리
y_stage2 = df_cleaned[[col for col in df_cleaned.columns if 'Stage2.Output' in col]]

# Stage2 입력값 (X_stage2) 분리
X_stage2 = df_cleaned.drop(columns=[col for col in df_cleaned.columns if 'Stage2.Output' in col])

# 상관계수 계산 (X_stage2의 각 컬럼과 Stage2 출력값 간의 상관계수)
correlation_stage2 = X_stage2.corrwith(y_stage2.squeeze())

# 상관계수 절대값이 높은 순으로 정렬
correlation_stage2_sorted = correlation_stage2.abs().sort_values(ascending=False)

# 상관계수가 높은 상위 20개의 특성 선택
selected_columns_stage2 = correlation_stage2_sorted.head(20).index

# 선택된 특성들을 추출한 새로운 데이터프레임 생성
X_stage2_selected_df = X_stage2[selected_columns_stage2]

# 출력값을 데이터프레임으로 병합
X_stage2_selected_df = pd.concat([X_stage2_selected_df, y_stage2], axis=1)

# 선택된 컬럼 목록과 데이터프레임의 크기 확인
print("선택된 컬럼 목록:")
print(selected_columns_stage2)

print("새로운 데이터셋 크기:")
print(X_stage2_selected_df.shape)

# 데이터 미리보기
print(X_stage2_selected_df.head())


선택된 컬럼 목록:
Index(['AmbientConditions.AmbientHumidity.U.Actual',
       'AmbientConditions.AmbientTemperature.U.Actual',
       'FirstStage.CombinerOperation.Temperature1.U.Actual',
       'FirstStage.CombinerOperation.Temperature2.U.Actual',
       'FirstStage.CombinerOperation.Temperature3.C.Actual',
       'Machine1.ExitZoneTemperature.C.Actual',
       'Machine1.MaterialPressure.U.Actual',
       'Machine1.MaterialTemperature.U.Actual',
       'Machine1.MotorAmperage.U.Actual', 'Machine1.MotorRPM.C.Actual',
       'Machine1.RawMaterial.Property1', 'Machine1.RawMaterial.Property2',
       'Machine1.RawMaterial.Property3', 'Machine1.RawMaterial.Property4',
       'Machine1.RawMaterialFeederParameter.U.Actual',
       'Machine1.Zone1Temperature.C.Actual',
       'Machine1.Zone2Temperature.C.Actual',
       'Machine2.ExitZoneTemperature.C.Actual',
       'Machine2.MaterialPressure.U.Actual',
       'Machine2.MaterialTemperature.U.Actual'],
      dtype='object')
새로운 데이터셋 크기:
(9861, 35)
 

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# 입력 데이터와 출력 데이터 분리
X2 = X_stage2_selected_df.drop(columns=[col for col in X_stage2_selected_df.columns if 'Stage2.Output' in col])
y2 = X_stage2_selected_df[[col for col in X_stage2_selected_df.columns if 'Stage2.Output' in col]]

# 훈련 세트와 검증 세트로 데이터 분할
X2_train, X2_val, y2_train, y2_val = train_test_split(X2, y2, test_size=0.2, random_state=42)

# RandomForestRegressor 모델 초기화 및 훈련
model_stage2 = RandomForestRegressor(n_estimators=100, random_state=42)
model_stage2.fit(X2_train, y2_train)

# 검증 세트로 예측
y2_pred = model_stage2.predict(X2_val)

# R2 Score 계산
r2_stage2 = r2_score(y2_val, y2_pred)
print(f"Stage2 모델의 R2 Score: {r2_stage2}")


Stage2 모델의 R2 Score: 0.7588507126316053


In [11]:
import numpy as np

# Stage1 데이터로 예측
stage1_predictions = model_stage1.predict(X_val)  # 혹은 테스트 데이터로 예측

# Stage2 데이터로 예측
stage2_predictions = model_stage2.predict(X2_val)  # 혹은 테스트 데이터로 예측

# 제출 파일 저장
np.save('submission1.npy', stage1_predictions)
np.save('submission2.npy', stage2_predictions)

print("submission1.npy와 submission2.npy 파일이 생성되었습니다.")


submission1.npy와 submission2.npy 파일이 생성되었습니다.


In [12]:
# Stage1 예측값 범위 확인
print(f"Stage1 예측값 최소값: {stage1_predictions.min()}, 최대값: {stage1_predictions.max()}")

# Stage2 예측값 범위 확인
print(f"Stage2 예측값 최소값: {stage2_predictions.min()}, 최대값: {stage2_predictions.max()}")


Stage1 예측값 최소값: -6.871208000000001e-05, 최대값: 34.59639999999999
Stage2 예측값 최소값: -0.00011157732, 최대값: 32.63999999999999
