In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import r2_score
from tqdm import tqdm

# 데이터 전처리 함수
def preprocess_data(df):
    df['hour'] = df['time_stamp'].str.slice(start=11, stop=13).astype(int)
    df['minute'] = df['time_stamp'].str.slice(start=14, stop=16).astype(int)
    df['second'] = df['time_stamp'].str.slice(start=17, stop=19).astype(int)
    df['minsec'] = df['minute'] * 60 + df['second']
    df = df.drop(columns=[col for col in df.columns if 'Setpoint' in col])
    return df

# 시간 데이터를 주기적인 sin, cos 형태로 변환
def add_time_features(df):
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['minute_sin'] = np.sin(2 * np.pi * df['minute'] / 60)
    df['minute_cos'] = np.cos(2 * np.pi * df['minute'] / 60)
    df['second_sin'] = np.sin(2 * np.pi * df['second'] / 60)
    df['second_cos'] = np.cos(2 * np.pi * df['second'] / 60)
    return df

# Stage별 데이터 분리 함수
def split_stage_data(df):
    env_features = ['AmbientConditions.AmbientHumidity.U.Actual', 
                    'AmbientConditions.AmbientTemperature.U.Actual']
    stage1_data = df[df.columns.drop(list(df.filter(regex='Machine4|Machine5|Stage2|time_stamp')))].copy()
    stage1_data = stage1_data[env_features + stage1_data.columns.tolist()]  
    stage2_data = df[df.columns.drop(list(df.filter(regex='Machine1|Machine2|Machine3|Stage1|time_stamp|FirstStage.CombinerOperation')))].copy()
    stage2_data = stage2_data[env_features + stage2_data.columns.tolist()]
    return stage1_data, stage2_data

# 모델 학습 함수 (단일 학습)
def train_model(X, y, model_type='random_forest'):
    if model_type == 'random_forest':
        model = RandomForestRegressor(n_estimators=500, random_state=0, n_jobs=-1)
    elif model_type == 'extra_trees':
        model = ExtraTreesRegressor(n_estimators=500, random_state=0, n_jobs=-1)
    
    model.fit(X, y)
    return model

# R2 Score 계산 및 평가
def evaluate_model(y_true, y_pred):
    return r2_score(y_true, y_pred)

# 최종 서브미션 파일 저장 함수
def save_submission(predictions_stage1, predictions_stage2):
    np.save("submission1.npy", predictions_stage1)
    np.save("submission2.npy", predictions_stage2)
    print("Submission files 'submission1.npy' and 'submission2.npy' have been created.")

# 1. 데이터 로드 및 전처리
df = pd.read_csv('data/continuous_factory_process.csv')
submission_df = pd.read_csv('data/submission_data.csv')

df = preprocess_data(df)
submission_df = preprocess_data(submission_df)

# 시간 관련 데이터 변환
df = add_time_features(df)
submission_df = add_time_features(submission_df)

# 2. Stage 데이터 분리
stage1_data, stage2_data = split_stage_data(df)
submission_stage1_data, submission_stage2_data = split_stage_data(submission_df)

# 3. Feature와 Target 설정 (Stage 1, Stage 2)
X_stage1 = stage1_data.drop(columns=[col for col in stage1_data.columns if 'Stage1.Output' in col])
y_stage1 = stage1_data[[col for col in stage1_data.columns if 'Stage1.Output' in col]]

X_stage2 = stage2_data.drop(columns=[col for col in stage2_data.columns if 'Stage2.Output' in col])
y_stage2 = stage2_data[[col for col in stage2_data.columns if 'Stage2.Output' in col]]

# 서브미션 데이터의 컬럼을 학습 데이터와 동일한 순서로 정렬
X_submission_stage1 = submission_stage1_data[X_stage1.columns]
X_submission_stage2 = submission_stage2_data[X_stage2.columns]

# 4. 모델 학습 (Stage 1: RandomForest, Stage 2: ExtraTrees)
model_rf_stage1 = train_model(X_stage1, y_stage1, model_type='random_forest')
model_et_stage2 = train_model(X_stage2, y_stage2, model_type='extra_trees')

# 5. 예측 수행
submission_stage1_predictions = model_rf_stage1.predict(X_submission_stage1)
submission_stage2_predictions = model_et_stage2.predict(X_submission_stage2)

# 6. 서브미션 파일 저장
save_submission(submission_stage1_predictions, submission_stage2_predictions)


ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9861 entries, 0 to 9860
Data columns (total 96 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   time_stamp                                          9861 non-null   object 
 1   AmbientConditions.AmbientHumidity.U.Actual          9861 non-null   float64
 2   AmbientConditions.AmbientTemperature.U.Actual       9861 non-null   float64
 3   Machine1.RawMaterial.Property1                      9861 non-null   float64
 4   Machine1.RawMaterial.Property2                      9861 non-null   int64  
 5   Machine1.RawMaterial.Property3                      9861 non-null   float64
 6   Machine1.RawMaterial.Property4                      9861 non-null   int64  
 7   Machine1.RawMaterialFeederParameter.U.Actual        9861 non-null   float64
 8   Machine1.Zone1Temperature.C.Actual                  9861 non-null   float64
 9

In [4]:
submission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4227 entries, 0 to 4226
Data columns (total 96 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   time_stamp                                          4227 non-null   object 
 1   AmbientConditions.AmbientHumidity.U.Actual          4227 non-null   float64
 2   AmbientConditions.AmbientTemperature.U.Actual       4227 non-null   float64
 3   Machine1.RawMaterial.Property1                      4227 non-null   float64
 4   Machine1.RawMaterial.Property2                      4227 non-null   int64  
 5   Machine1.RawMaterial.Property3                      4227 non-null   float64
 6   Machine1.RawMaterial.Property4                      4227 non-null   int64  
 7   Machine1.RawMaterialFeederParameter.U.Actual        4227 non-null   float64
 8   Machine1.Zone1Temperature.C.Actual                  4227 non-null   float64
 9

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from tqdm import tqdm

# 데이터 로드
file_path = 'data/continuous_factory_process.csv'
df = pd.read_csv(file_path)

# "Setpoint" 컬럼 제거
df = df.drop(columns=[col for col in df.columns if 'Setpoint' in col])

# 시간 데이터 처리 (시, 분, 초, minsec 생성)
df['hour'] = df['time_stamp'].str.slice(start=11, stop=13).astype(int)
df['minute'] = df['time_stamp'].str.slice(start=14, stop=16).astype(int)
df['second'] = df['time_stamp'].str.slice(start=17, stop=19).astype(int)
df['minsec'] = df['minute'] * 60 + df['second']

# time_stamp 컬럼 제거
df = df.drop(columns=['time_stamp'])

# Stage 1 데이터 분리 (Machine 1, 2, 3 + FirstStage + Ambient + Stage1.Output 관련 feature들)
stage1_data = df[df.columns.drop(list(df.filter(regex='Machine4|Machine5|Stage2')))]

# Stage 2 데이터 분리 (Machine 4, 5 + Ambient + Stage2.Output 관련 feature들, FirstStage 제외)
stage2_data = df[df.columns.drop(list(df.filter(regex='Machine1|Machine2|Machine3|Stage1|FirstStage')))]

# Feature와 Target 설정 (Stage 1)
X_stage1 = stage1_data.drop(columns=[col for col in stage1_data.columns if 'Stage1.Output' in col])
y_stage1 = stage1_data[[col for col in stage1_data.columns if 'Stage1.Output' in col]]

# Feature와 Target 설정 (Stage 2)
X_stage2 = stage2_data.drop(columns=[col for col in stage2_data.columns if 'Stage2.Output' in col])
y_stage2 = stage2_data[[col for col in stage2_data.columns if 'Stage2.Output' in col]]

# 데이터 분리 (학습/검증 세트)
X_train_stage1, X_test_stage1, y_train_stage1, y_test_stage1 = train_test_split(X_stage1, y_stage1, test_size=0.2, random_state=42)
X_train_stage2, X_test_stage2, y_train_stage2, y_test_stage2 = train_test_split(X_stage2, y_stage2, test_size=0.2, random_state=42)

# RandomForestRegressor 모델 학습 (Stage 1)
model_rf_stage1 = RandomForestRegressor(n_estimators=500, random_state=0, n_jobs=-1)
model_rf_stage1.fit(X_train_stage1, y_train_stage1)

# ExtraTreesRegressor 모델 학습 (Stage 2)
model_et_stage2 = ExtraTreesRegressor(n_estimators=500, random_state=0, n_jobs=-1)
model_et_stage2.fit(X_train_stage2, y_train_stage2)

# 예측 (Stage 1)
y_pred_stage1 = model_rf_stage1.predict(X_test_stage1)
stage1_r2 = r2_score(y_test_stage1, y_pred_stage1)
print(f"Stage 1 R2 Score: {stage1_r2}")

# 예측 (Stage 2)
y_pred_stage2 = model_et_stage2.predict(X_test_stage2)
stage2_r2 = r2_score(y_test_stage2, y_pred_stage2)
print(f"Stage 2 R2 Score: {stage2_r2}")

# 최종 평가 점수 계산 (두 Stage R2 Score 평균)
final_score = (stage1_r2 + stage2_r2) / 2
print(f"최종 평가 점수 (R2 평균): {final_score}")

# submission_data.csv 파일 로드
submission_file_path = 'data/submission_data.csv'
submission_df = pd.read_csv(submission_file_path)

# 시간 데이터 처리 (시, 분, 초, minsec 생성)
submission_df['hour'] = submission_df['time_stamp'].str.slice(start=11, stop=13).astype(int)
submission_df['minute'] = submission_df['time_stamp'].str.slice(start=14, stop=16).astype(int)
submission_df['second'] = submission_df['time_stamp'].str.slice(start=17, stop=19).astype(int)
submission_df['minsec'] = submission_df['minute'] * 60 + submission_df['second']

# "Setpoint" 컬럼 제거
submission_df = submission_df.drop(columns=[col for col in submission_df.columns if 'Setpoint' in col])

# Stage 1 데이터 분리 (Machine 1, 2, 3 + FirstStage + Ambient + Stage1.Output 관련 feature들)
submission_stage1 = submission_df[submission_df.columns.drop(list(submission_df.filter(regex='Machine4|Machine5|Stage2')))]

# Stage 2 데이터 분리 (Machine 4, 5 + Ambient + Stage2.Output 관련 feature들, FirstStage 제외)
submission_stage2 = submission_df[submission_df.columns.drop(list(submission_df.filter(regex='Machine1|Machine2|Machine3|Stage1|FirstStage')))]


Stage 1 R2 Score: 0.6958753629322967
Stage 2 R2 Score: 0.7279346834699326
최종 평가 점수 (R2 평균): 0.7119050232011146


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Stage1.Output.Measurement0.U.Actual
- Stage1.Output.Measurement1.U.Actual
- Stage1.Output.Measurement10.U.Actual
- Stage1.Output.Measurement11.U.Actual
- Stage1.Output.Measurement12.U.Actual
- ...


In [7]:
# Stage 1 예측값 생성 (Stage1.Output 및 time_stamp 관련 컬럼 제외)
submission_stage1_input = submission_stage1.drop(columns=[col for col in submission_stage1.columns if 'Stage1.Output' in col or 'time_stamp' in col])
submission_stage1_predictions = model_rf_stage1.predict(submission_stage1_input)

# Stage 2 예측값 생성 (Stage2.Output 및 time_stamp 관련 컬럼 제외)
submission_stage2_input = submission_stage2.drop(columns=[col for col in submission_stage2.columns if 'Stage2.Output' in col or 'time_stamp' in col])
submission_stage2_predictions = model_et_stage2.predict(submission_stage2_input)

# 예측값 저장
np.save("submission1.npy", submission_stage1_predictions)
np.save("submission2.npy", submission_stage2_predictions)

print("Submission files 'submission1.npy' and 'submission2.npy' have been created.")


Submission files 'submission1.npy' and 'submission2.npy' have been created.


In [11]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# 데이터 로드 및 전처리 (기존 동일)
file_path = 'data/continuous_factory_process.csv'
df = pd.read_csv(file_path)

# "Setpoint" 컬럼 제거
df = df.drop(columns=[col for col in df.columns if 'Setpoint' in col])

# 시간 데이터 처리 (시, 분, 초, minsec 생성)
df['hour'] = df['time_stamp'].str.slice(start=11, stop=13).astype(int)
df['minute'] = df['time_stamp'].str.slice(start=14, stop=16).astype(int)
df['second'] = df['time_stamp'].str.slice(start=17, stop=19).astype(int)
df['minsec'] = df['minute'] * 60 + df['second']

# time_stamp 컬럼 제거
df = df.drop(columns=['time_stamp'])

# Stage 1 데이터 분리
stage1_data = df[df.columns.drop(list(df.filter(regex='Machine4|Machine5|Stage2')))]
X_stage1 = stage1_data.drop(columns=[col for col in stage1_data.columns if 'Stage1.Output' in col])
y_stage1 = stage1_data[[col for col in stage1_data.columns if 'Stage1.Output' in col]]

# Stage 2 데이터 분리
stage2_data = df[df.columns.drop(list(df.filter(regex='Machine1|Machine2|Machine3|Stage1|FirstStage')))]
X_stage2 = stage2_data.drop(columns=[col for col in stage2_data.columns if 'Stage2.Output' in col])
y_stage2 = stage2_data[[col for col in stage2_data.columns if 'Stage2.Output' in col]]


Stage 1 R2 Score (CatBoost): 0.6949415654946729
Stage 2 R2 Score (CatBoost): 0.7644309123872067
최종 평가 점수 (CatBoost): 0.7296862389409398
Submission files 'submission1.npy' and 'submission2.npy' have been created.


In [None]:

# 데이터 분리 (학습/검증 세트)
X_train_stage1, X_test_stage1, y_train_stage1, y_test_stage1 = train_test_split(X_stage1, y_stage1, test_size=0.2, random_state=42)
X_train_stage2, X_test_stage2, y_train_stage2, y_test_stage2 = train_test_split(X_stage2, y_stage2, test_size=0.2, random_state=42)

# CatBoost 모델 적용 (Stage 1)
catboost_stage1 = CatBoostRegressor(iterations=1000, depth=10, learning_rate=0.05, loss_function='MultiRMSE', random_state=42, verbose=0)
catboost_stage1.fit(X_train_stage1, y_train_stage1)

# CatBoost 모델 적용 (Stage 2)
catboost_stage2 = CatBoostRegressor(iterations=1000, depth=10, learning_rate=0.05, loss_function='MultiRMSE', random_state=42, verbose=0)
catboost_stage2.fit(X_train_stage2, y_train_stage2)

# Stage 1 예측
y_pred_stage1_cat = catboost_stage1.predict(X_test_stage1)
stage1_r2_cat = r2_score(y_test_stage1, y_pred_stage1_cat)
print(f"Stage 1 R2 Score (CatBoost): {stage1_r2_cat}")

# Stage 2 예측
y_pred_stage2_cat = catboost_stage2.predict(X_test_stage2)
stage2_r2_cat = r2_score(y_test_stage2, y_pred_stage2_cat)
print(f"Stage 2 R2 Score (CatBoost): {stage2_r2_cat}")

# 최종 평가 점수 계산
final_score_catboost = (stage1_r2_cat + stage2_r2_cat) / 2
print(f"최종 평가 점수 (CatBoost): {final_score_catboost}")


In [None]:

# submission_data.csv 파일 로드
submission_file_path = 'data/submission_data.csv'
submission_df = pd.read_csv(submission_file_path)

# 시간 데이터 처리 (시, 분, 초, minsec 생성)
submission_df['hour'] = submission_df['time_stamp'].str.slice(start=11, stop=13).astype(int)
submission_df['minute'] = submission_df['time_stamp'].str.slice(start=14, stop=16).astype(int)
submission_df['second'] = submission_df['time_stamp'].str.slice(start=17, stop=19).astype(int)
submission_df['minsec'] = submission_df['minute'] * 60 + submission_df['second']

# "Setpoint" 컬럼 제거
submission_df = submission_df.drop(columns=[col for col in submission_df.columns if 'Setpoint' in col])

# Stage 1 데이터 분리
submission_stage1 = submission_df[submission_df.columns.drop(list(submission_df.filter(regex='Machine4|Machine5|Stage2')))]

# Stage 2 데이터 분리
submission_stage2 = submission_df[submission_df.columns.drop(list(submission_df.filter(regex='Machine1|Machine2|Machine3|Stage1|FirstStage')))]


In [None]:

# Stage 1 예측값 생성
submission_stage1_input = submission_stage1.drop(columns=[col for col in submission_stage1.columns if 'Stage1.Output' in col or 'time_stamp' in col])
submission_stage1_predictions_cat = catboost_stage1.predict(submission_stage1_input)

# Stage 2 예측값 생성
submission_stage2_input = submission_stage2.drop(columns=[col for col in submission_stage2.columns if 'Stage2.Output' in col or 'time_stamp' in col])
submission_stage2_predictions_cat = catboost_stage2.predict(submission_stage2_input)

# 예측값 저장
np.save("submission1.npy", submission_stage1_predictions_cat)
np.save("submission2.npy", submission_stage2_predictions_cat)

print("Submission files 'submission1.npy' and 'submission2.npy' have been created.")
