In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from tqdm import tqdm

# 데이터 전처리 함수
def preprocess_data(df):
    # 시간 데이터 처리 (시, 분, 초, minsec 생성)
    df['hour'] = df['time_stamp'].str.slice(start=11, stop=13).astype(int)
    df['minute'] = df['time_stamp'].str.slice(start=14, stop=16).astype(int)
    df['second'] = df['time_stamp'].str.slice(start=17, stop=19).astype(int)
    df['minsec'] = df['minute'] * 60 + df['second']

    # "Setpoint" 컬럼 제거
    df = df.drop(columns=[col for col in df.columns if 'Setpoint' in col])
    
    return df

# Stage별 데이터 분리 함수
def split_stage_data(df):
    # Stage 1 데이터 분리 (Machine 1, 2, 3 + Stage1.Output 관련 feature들)
    stage1_data = df[df.columns.drop(list(df.filter(regex='Machine4|Machine5|Stage2')))]
    
    # Stage 2 데이터 분리 (Machine 4, 5 + Stage2.Output 관련 feature들)
    stage2_data = df[df.columns.drop(list(df.filter(regex='Machine1|Machine2|Machine3|Stage1')))]
    
    return stage1_data, stage2_data

# 모델 학습 함수
def train_model(X, y, model_type='random_forest'):
    if model_type == 'random_forest':
        model = RandomForestRegressor(n_estimators=500, random_state=0, n_jobs=-1)
    elif model_type == 'extra_trees':
        model = ExtraTreesRegressor(n_estimators=500, random_state=0, n_jobs=-1)
    
    # tqdm으로 학습 진행 상황 확인
    for _ in tqdm(range(1)):
        model.fit(X, y)
    
    return model

# 예측 함수
def predict_model(model, X):
    return model.predict(X)

# 최종 R2 Score 계산 함수
def evaluate_model(y_true, y_pred):
    return r2_score(y_true, y_pred)

# 최종 서브미션 파일 저장 함수
def save_submission(predictions_stage1, predictions_stage2):
    np.save("submission1.npy", predictions_stage1)
    np.save("submission2.npy", predictions_stage2)
    print("Submission files 'submission1.npy' and 'submission2.npy' have been created.")


In [3]:
# 1. 데이터 로드 및 전처리
df = pd.read_csv('data/continuous_factory_process.csv')
submission_df = pd.read_csv('data/submission_data.csv')

df = preprocess_data(df)
submission_df = preprocess_data(submission_df)


In [9]:
df.columns

Index(['time_stamp', 'AmbientConditions.AmbientHumidity.U.Actual',
       'AmbientConditions.AmbientTemperature.U.Actual',
       'Machine1.RawMaterial.Property1', 'Machine1.RawMaterial.Property2',
       'Machine1.RawMaterial.Property3', 'Machine1.RawMaterial.Property4',
       'Machine1.RawMaterialFeederParameter.U.Actual',
       'Machine1.Zone1Temperature.C.Actual',
       'Machine1.Zone2Temperature.C.Actual', 'Machine1.MotorAmperage.U.Actual',
       'Machine1.MotorRPM.C.Actual', 'Machine1.MaterialPressure.U.Actual',
       'Machine1.MaterialTemperature.U.Actual',
       'Machine1.ExitZoneTemperature.C.Actual',
       'Machine2.RawMaterial.Property1', 'Machine2.RawMaterial.Property2',
       'Machine2.RawMaterial.Property3', 'Machine2.RawMaterial.Property4',
       'Machine2.RawMaterialFeederParameter.U.Actual',
       'Machine2.Zone1Temperature.C.Actual',
       'Machine2.Zone2Temperature.C.Actual', 'Machine2.MotorAmperage.U.Actual',
       'Machine2.MotorRPM.C.Actual', 'Machine2

In [12]:

# 2. Stage 데이터 분리
stage1_data, stage2_data = split_stage_data(df)
submission_stage1_data, submission_stage2_data = split_stage_data(submission_df)


In [13]:
stage1_data.columns

Index(['time_stamp', 'AmbientConditions.AmbientHumidity.U.Actual',
       'AmbientConditions.AmbientTemperature.U.Actual',
       'Machine1.RawMaterial.Property1', 'Machine1.RawMaterial.Property2',
       'Machine1.RawMaterial.Property3', 'Machine1.RawMaterial.Property4',
       'Machine1.RawMaterialFeederParameter.U.Actual',
       'Machine1.Zone1Temperature.C.Actual',
       'Machine1.Zone2Temperature.C.Actual', 'Machine1.MotorAmperage.U.Actual',
       'Machine1.MotorRPM.C.Actual', 'Machine1.MaterialPressure.U.Actual',
       'Machine1.MaterialTemperature.U.Actual',
       'Machine1.ExitZoneTemperature.C.Actual',
       'Machine2.RawMaterial.Property1', 'Machine2.RawMaterial.Property2',
       'Machine2.RawMaterial.Property3', 'Machine2.RawMaterial.Property4',
       'Machine2.RawMaterialFeederParameter.U.Actual',
       'Machine2.Zone1Temperature.C.Actual',
       'Machine2.Zone2Temperature.C.Actual', 'Machine2.MotorAmperage.U.Actual',
       'Machine2.MotorRPM.C.Actual', 'Machine2

In [None]:

# 3. Feature와 Target 설정 (Stage 1, Stage 2)
X_stage1 = stage1_data[important_features]
y_stage1 = stage1_data[[col for col in stage1_data.columns if 'Stage1.Output' in col]]

X_stage2 = stage2_data[important_features_stage2]
y_stage2 = stage2_data[[col for col in stage2_data.columns if 'Stage2.Output' in col]]

X_submission_stage1 = submission_stage1_data[important_features]
X_submission_stage2 = submission_stage2_data[important_features_stage2]

# 4. 모델 학습 (Stage 1: RandomForest, Stage 2: ExtraTrees)
model_rf_stage1 = train_model(X_stage1, y_stage1, model_type='random_forest')
model_et_stage2 = train_model(X_stage2, y_stage2, model_type='extra_trees')

# 5. 예측 수행
submission_stage1_predictions = predict_model(model_rf_stage1, X_submission_stage1)
submission_stage2_predictions = predict_model(model_et_stage2, X_submission_stage2)

# 6. 서브미션 파일 저장
save_submission(submission_stage1_predictions, submission_stage2_predictions)


In [10]:
# Setpoint 컬럼 제거
df_clean = df.drop(columns=[col for col in df.columns if 'Setpoint' in col])

# Machine 1, 2, 3 (Stage 1)과 Machine 4, 5 (Stage 2)의 컬럼 개수 확인
machine_1_columns = [col for col in df_clean.columns if 'Machine1' in col]
machine_2_columns = [col for col in df_clean.columns if 'Machine2' in col]
machine_3_columns = [col for col in df_clean.columns if 'Machine3' in col]
machine_4_columns = [col for col in df_clean.columns if 'Machine4' in col]
machine_5_columns = [col for col in df_clean.columns if 'Machine5' in col]

# 각 Machine별 컬럼 개수 출력
print(f"Machine 1 컬럼 개수: {len(machine_1_columns)}")
print(f"Machine 2 컬럼 개수: {len(machine_2_columns)}")
print(f"Machine 3 컬럼 개수: {len(machine_3_columns)}")
print(f"Machine 4 컬럼 개수: {len(machine_4_columns)}")
print(f"Machine 5 컬럼 개수: {len(machine_5_columns)}")


Machine 1 컬럼 개수: 12
Machine 2 컬럼 개수: 12
Machine 3 컬럼 개수: 12
Machine 4 컬럼 개수: 7
Machine 5 컬럼 개수: 7


In [57]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from tqdm import tqdm

# 데이터 전처리 함수
def preprocess_data(df):
    # 시간 데이터 처리 (시, 분, 초, minsec 생성)
    df['hour'] = df['time_stamp'].str.slice(start=11, stop=13).astype(int)
    df['minute'] = df['time_stamp'].str.slice(start=14, stop=16).astype(int)
    df['second'] = df['time_stamp'].str.slice(start=17, stop=19).astype(int)
    df['minsec'] = df['minute'] * 60 + df['second']

    # "Setpoint" 컬럼 제거
    df = df.drop(columns=[col for col in df.columns if 'Setpoint' in col])
    
    return df

# 시간 데이터를 주기적인 sin, cos 형태로 변환
def add_time_features(df):
    # 각 시간 관련 컬럼을 주기성을 반영하여 변환
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    df['minute_sin'] = np.sin(2 * np.pi * df['minute'] / 60)
    df['minute_cos'] = np.cos(2 * np.pi * df['minute'] / 60)

    df['second_sin'] = np.sin(2 * np.pi * df['second'] / 60)
    df['second_cos'] = np.cos(2 * np.pi * df['second'] / 60)

    return df

# Stage별 데이터 분리 함수
def split_stage_data(df):
    # 공통으로 사용될 환경 조건 컬럼 (Stage 1과 Stage 2에 모두 포함)
    env_features = ['AmbientConditions.AmbientHumidity.U.Actual', 
                    'AmbientConditions.AmbientTemperature.U.Actual']
    
    # Stage 1 데이터 분리 (Machine 1, 2, 3 + Stage1.Output 관련 feature들 + 환경 조건 컬럼 + FirstStage Operation 관련 컬럼)
    stage1_data = df[df.columns.drop(list(df.filter(regex='Machine4|Machine5|Stage2|time_stamp')))].copy()
    stage1_data = stage1_data[env_features + stage1_data.columns.tolist()]  # 환경 조건 추가
    
    # Stage 2 데이터 분리 (Machine 4, 5 + Stage2.Output 관련 feature들 + 환경 조건 컬럼)
    stage2_data = df[df.columns.drop(list(df.filter(regex='Machine1|Machine2|Machine3|Stage1|time_stamp|FirstStage.CombinerOperation')))].copy()
    stage2_data = stage2_data[env_features + stage2_data.columns.tolist()]  # 환경 조건 추가
    
    return stage1_data, stage2_data

# 모델 학습 함수
def train_model(X, y, model_type='random_forest'):
    if model_type == 'random_forest':
        model = RandomForestRegressor(n_estimators=500, random_state=0, n_jobs=-1)
    elif model_type == 'extra_trees':
        model = ExtraTreesRegressor(n_estimators=500, random_state=0, n_jobs=-1)
    
    # tqdm으로 학습 진행 상황 확인
    for _ in tqdm(range(1)):
        model.fit(X, y)
    
    return model

# 예측 함수
def predict_model(model, X):
    return model.predict(X)

# 최종 서브미션 파일 저장 함수
def save_submission(predictions_stage1, predictions_stage2):
    np.save("submission1.npy", predictions_stage1)
    np.save("submission2.npy", predictions_stage2)
    print("Submission files 'submission1.npy' and 'submission2.npy' have been created.")


In [58]:

# 1. 데이터 로드 및 전처리
df = pd.read_csv('data/continuous_factory_process.csv')
submission_df = pd.read_csv('data/submission_data.csv')

df = preprocess_data(df)
submission_df = preprocess_data(submission_df)

# 시간 관련 데이터 변환
df = add_time_features(df)
submission_df = add_time_features(submission_df)

# 2. Stage 데이터 분리
stage1_data, stage2_data = split_stage_data(df)
submission_stage1_data, submission_stage2_data = split_stage_data(submission_df)

# 3. Feature와 Target 설정 (Stage 1, Stage 2)
X_stage1 = stage1_data.drop(columns=[col for col in stage1_data.columns if 'Stage1.Output' in col])
y_stage1 = stage1_data[[col for col in stage1_data.columns if 'Stage1.Output' in col]]

X_stage2 = stage2_data.drop(columns=[col for col in stage2_data.columns if 'Stage2.Output' in col])
y_stage2 = stage2_data[[col for col in stage2_data.columns if 'Stage2.Output' in col]]

# 학습에 사용한 feature 이름과 순서 저장
stage1_features = X_stage1.columns.tolist()
stage2_features = X_stage2.columns.tolist()

# 4. 모델 학습 (Stage 1: RandomForest, Stage 2: ExtraTrees)
model_rf_stage1 = train_model(X_stage1, y_stage1, model_type='random_forest')
model_et_stage2 = train_model(X_stage2, y_stage2, model_type='extra_trees')

# 5. submission 데이터에서 학습 시 사용된 feature 순서를 그대로 맞춰줌
X_submission_stage1 = submission_stage1_data[stage1_features]  # 동일한 feature 순서 강제 적용
X_submission_stage2 = submission_stage2_data[stage2_features]  # 동일한 feature 순서 강제 적용

# 학습 시 사용한 feature와 submission 데이터의 feature 타입 비교 및 타입 변환
for col in X_stage1.columns:
    train_dtype = X_stage1[col].values.dtype
    submission_dtype = X_submission_stage1[col].values.dtype
    if train_dtype != submission_dtype:
        print(f"Converting {col} from {submission_dtype} to {train_dtype}")
        X_submission_stage1[col] = X_submission_stage1[col].astype(train_dtype)

# 동일하게 Stage 2 데이터에도 적용
for col in X_stage2.columns:
    train_dtype = X_stage2[col].values.dtype
    submission_dtype = X_submission_stage2[col].values.dtype
    if train_dtype != submission_dtype:
        print(f"Converting {col} from {submission_dtype} to {train_dtype}")
        X_submission_stage2[col] = X_submission_stage2[col].astype(train_dtype)

# 예측 수행 (학습 시와 동일한 feature 순서로 예측)
submission_stage1_predictions = predict_model(model_rf_stage1, X_submission_stage1)
submission_stage2_predictions = predict_model(model_et_stage2, X_submission_stage2)

# 6. 서브미션 파일 저장
save_submission(submission_stage1_predictions, submission_stage2_predictions)


100%|██████████| 1/1 [00:24<00:00, 24.41s/it]
100%|██████████| 1/1 [01:07<00:00, 67.31s/it]


ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.


In [59]:
# 학습 시 사용한 feature와 submission 데이터의 feature 이름과 순서 비교
print("Stage 1 Feature Names Comparison")
for i, (train_col, submission_col) in enumerate(zip(X_stage1.columns, X_submission_stage1.columns)):
    if train_col != submission_col:
        print(f"Mismatch at index {i}: {train_col} (train) != {submission_col} (submission)")

print("Stage 2 Feature Names Comparison")
for i, (train_col, submission_col) in enumerate(zip(X_stage2.columns, X_submission_stage2.columns)):
    if train_col != submission_col:
        print(f"Mismatch at index {i}: {train_col} (train) != {submission_col} (submission)")


Stage 1 Feature Names Comparison
Mismatch at index 1: AmbientConditions.AmbientTemperature.U.Actual (train) != AmbientConditions.AmbientHumidity.U.Actual (submission)
Mismatch at index 2: AmbientConditions.AmbientHumidity.U.Actual (train) != AmbientConditions.AmbientTemperature.U.Actual (submission)
Mismatch at index 4: Machine1.RawMaterial.Property1 (train) != AmbientConditions.AmbientHumidity.U.Actual (submission)
Mismatch at index 5: Machine1.RawMaterial.Property2 (train) != AmbientConditions.AmbientHumidity.U.Actual (submission)
Mismatch at index 6: Machine1.RawMaterial.Property3 (train) != AmbientConditions.AmbientTemperature.U.Actual (submission)
Mismatch at index 7: Machine1.RawMaterial.Property4 (train) != AmbientConditions.AmbientTemperature.U.Actual (submission)
Mismatch at index 8: Machine1.RawMaterialFeederParameter.U.Actual (train) != Machine1.RawMaterial.Property1 (submission)
Mismatch at index 9: Machine1.Zone1Temperature.C.Actual (train) != Machine1.RawMaterial.Property

In [63]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# 1. 데이터 불러오기
def load_data(file_path):
    return pd.read_csv(file_path)

# 2. 데이터 전처리 함수
def preprocess_data(df):
    # time_stamp 컬럼 제거
    df = df.drop(columns=['time_stamp'])
    df = df.drop(columns=[col for col in df.columns if 'Setpoint' in col])
    # 시간 관련 feature 생성
    df['hour'] = pd.to_datetime(df['time_stamp']).dt.hour
    df['minute'] = pd.to_datetime(df['time_stamp']).dt.minute
    df['second'] = pd.to_datetime(df['time_stamp']).dt.second
    df['minsec'] = df['minute'] * 60 + df['second']
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['minute_sin'] = np.sin(2 * np.pi * df['minute'] / 60)
    df['minute_cos'] = np.cos(2 * np.pi * df['minute'] / 60)
    df['second_sin'] = np.sin(2 * np.pi * df['second'] / 60)
    df['second_cos'] = np.cos(2 * np.pi * df['second'] / 60)
    return df

# 3. Stage 1, Stage 2 데이터 분리
def split_stage_data(df):
    stage1_data = df[df.columns.drop(list(df.filter(regex='Machine4|Machine5|Stage2')))]
    stage2_data = df[df.columns.drop(list(df.filter(regex='Machine1|Machine2|Machine3|Stage1')))]
    return stage1_data, stage2_data

# 4. 모델 학습 함수
def train_model(X, y, model_type="rf"):
    if model_type == "rf":
        model = RandomForestRegressor(n_estimators=100, random_state=42)
    elif model_type == "et":
        model = ExtraTreesRegressor(n_estimators=100, random_state=42)
    
    model.fit(X, y)
    return model

# 5. 예측 함수
def predict_model(model, X):
    return model.predict(X)

# 6. 서브미션 파일 저장 함수
def save_submission(stage1_preds, stage2_preds):
    np.save('submission1.npy', stage1_preds)
    np.save('submission2.npy', stage2_preds)

# 7. 학습과 서브미션 피처의 순서 및 타입 맞추기
def align_submission_features(X_train, X_submission):
    for col in X_train.columns:
        if col in X_submission.columns:
            train_dtype = X_train[col].values.dtype
            submission_dtype = X_submission[col].values.dtype
            if train_dtype != submission_dtype:
                X_submission[col] = X_submission[col].astype(train_dtype)

    X_submission = X_submission[X_train.columns]  # 학습 시 사용한 피처 순서로 재배열
    return X_submission

# 메인 코드 실행
if __name__ == "__main__":
    # 데이터 불러오기
    df = load_data('data/continuous_factory_process.csv')
    submission_df = load_data('data/submission_data.csv')

    # 데이터 전처리
    df = preprocess_data(df)
    submission_df = preprocess_data(submission_df)

    # Stage 1, Stage 2 데이터 분리
    stage1_data, stage2_data = split_stage_data(df)
    submission_stage1_data, submission_stage2_data = split_stage_data(submission_df)

    # 입력 피처(X)와 출력 타겟(y) 설정
    stage1_features = stage1_data.drop(columns=[col for col in stage1_data.columns if 'Stage1.Output' in col])
    stage1_target = stage1_data[[col for col in stage1_data.columns if 'Stage1.Output' in col]]

    stage2_features = stage2_data.drop(columns=[col for col in stage2_data.columns if 'Stage2.Output' in col])
    stage2_target = stage2_data[[col for col in stage2_data.columns if 'Stage2.Output' in col]]

    # 학습/테스트 데이터 분리
    X_stage1, X_stage1_test, y_stage1, y_stage1_test = train_test_split(stage1_features, stage1_target, test_size=0.2, random_state=42)
    X_stage2, X_stage2_test, y_stage2, y_stage2_test = train_test_split(stage2_features, stage2_target, test_size=0.2, random_state=42)

    # 모델 학습
    model_rf_stage1 = train_model(X_stage1, y_stage1, model_type="rf")
    model_et_stage2 = train_model(X_stage2, y_stage2, model_type="et")

    # submission 데이터의 피처 순서 및 타입 맞추기
    X_submission_stage1 = align_submission_features(X_stage1, submission_stage1_data[stage1_features.columns])
    X_submission_stage2 = align_submission_features(X_stage2, submission_stage2_data[stage2_features.columns])

    # 예측 수행 (학습 시와 동일한 피처 순서로 예측)
    submission_stage1_predictions = predict_model(model_rf_stage1, X_submission_stage1)
    submission_stage2_predictions = predict_model(model_et_stage2, X_submission_stage2)

    # 서브미션 파일 저장
    save_submission(submission_stage1_predictions, submission_stage2_predictions)


KeyError: 'time_stamp'

In [65]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import r2_score
from tqdm import tqdm

# 데이터 전처리 함수
def preprocess_data(df):
    # 시간 데이터 처리 (시, 분, 초, minsec 생성)
    df['hour'] = df['time_stamp'].str.slice(start=11, stop=13).astype(int)
    df['minute'] = df['time_stamp'].str.slice(start=14, stop=16).astype(int)
    df['second'] = df['time_stamp'].str.slice(start=17, stop=19).astype(int)
    df['minsec'] = df['minute'] * 60 + df['second']

    # "Setpoint" 컬럼 제거
    df = df.drop(columns=[col for col in df.columns if 'Setpoint' in col])
    
    return df

# 시간 데이터를 주기적인 sin, cos 형태로 변환
def add_time_features(df):
    # 각 시간 관련 컬럼을 주기성을 반영하여 변환
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    df['minute_sin'] = np.sin(2 * np.pi * df['minute'] / 60)
    df['minute_cos'] = np.cos(2 * np.pi * df['minute'] / 60)

    df['second_sin'] = np.sin(2 * np.pi * df['second'] / 60)
    df['second_cos'] = np.cos(2 * np.pi * df['second'] / 60)

    return df

# Stage별 데이터 분리 함수
def split_stage_data(df):
    # 공통으로 사용될 환경 조건 컬럼 (Stage 1과 Stage 2에 모두 포함)
    env_features = ['AmbientConditions.AmbientHumidity.U.Actual', 
                    'AmbientConditions.AmbientTemperature.U.Actual']
    
    # Stage 1 데이터 분리 (Machine 1, 2, 3 + Stage1.Output 관련 feature들 + 환경 조건 컬럼 + FirstStage Operation 관련 컬럼)
    stage1_data = df[df.columns.drop(list(df.filter(regex='Machine4|Machine5|Stage2|time_stamp')))].copy()
    stage1_data = stage1_data[env_features + stage1_data.columns.tolist()]  # 환경 조건 추가
    
    # Stage 2 데이터 분리 (Machine 4, 5 + Stage2.Output 관련 feature들 + 환경 조건 컬럼)
    stage2_data = df[df.columns.drop(list(df.filter(regex='Machine1|Machine2|Machine3|Stage1|time_stamp|FirstStage.CombinerOperation')))].copy()
    stage2_data = stage2_data[env_features + stage2_data.columns.tolist()]  # 환경 조건 추가
    
    return stage1_data, stage2_data

# 모델 학습 함수 (KFold 적용)
def train_model_kfold(X, y, model_type='random_forest', n_splits=5):
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    
    if model_type == 'random_forest':
        model = RandomForestRegressor(random_state=0, n_jobs=-1)
        param_grid = {
            'n_estimators': [200, 500],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5],
        }
    elif model_type == 'extra_trees':
        model = ExtraTreesRegressor(random_state=0, n_jobs=-1)
        param_grid = {
            'n_estimators': [200, 500],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5],
        }

    # GridSearch로 하이퍼파라미터 튜닝
    grid_search = GridSearchCV(model, param_grid, cv=kfold, n_jobs=-1, verbose=2)
    
    best_model = None
    best_score = -float('inf')

    # tqdm으로 진행 상황 확인
    for train_idx, val_idx in tqdm(kfold.split(X), total=n_splits):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        grid_search.fit(X_train, y_train)
        model = grid_search.best_estimator_

        y_pred = model.predict(X_val)
        score = r2_score(y_val, y_pred)

        if score > best_score:
            best_score = score
            best_model = model

    print(f"Best R2 Score: {best_score}")
    return best_model

# R2 Score 계산 및 평가
def evaluate_model_kfold(X, y, model, n_splits=5):
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    r2_scores = []

    for train_idx, val_idx in kfold.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        r2 = r2_score(y_val, y_pred)
        r2_scores.append(r2)

    avg_r2_score = np.mean(r2_scores)
    print(f"Average R2 Score: {avg_r2_score}")
    return avg_r2_score

# 최종 서브미션 파일 저장 함수
def save_submission(predictions_stage1, predictions_stage2):
    np.save("submission1.npy", predictions_stage1)
    np.save("submission2.npy", predictions_stage2)
    print("Submission files 'submission1.npy' and 'submission2.npy' have been created.")


In [66]:

# 1. 데이터 로드 및 전처리
df = pd.read_csv('data/continuous_factory_process.csv')
submission_df = pd.read_csv('data/submission_data.csv')

df = preprocess_data(df)
submission_df = preprocess_data(submission_df)

# 시간 관련 데이터 변환
df = add_time_features(df)
submission_df = add_time_features(submission_df)

# 2. Stage 데이터 분리
stage1_data, stage2_data = split_stage_data(df)
submission_stage1_data, submission_stage2_data = split_stage_data(submission_df)

# 3. Feature와 Target 설정 (Stage 1, Stage 2)
X_stage1 = stage1_data.drop(columns=[col for col in stage1_data.columns if 'Stage1.Output' in col])
y_stage1 = stage1_data[[col for col in stage1_data.columns if 'Stage1.Output' in col]]

X_stage2 = stage2_data.drop(columns=[col for col in stage2_data.columns if 'Stage2.Output' in col])
y_stage2 = stage2_data[[col for col in stage2_data.columns if 'Stage2.Output' in col]]

X_submission_stage1 = submission_stage1_data[X_stage1.columns]
X_submission_stage2 = submission_stage2_data[X_stage2.columns]


In [67]:

# 4. 모델 학습 (Stage 1: RandomForest, Stage 2: ExtraTrees)
model_rf_stage1 = train_model_kfold(X_stage1, y_stage1, model_type='random_forest', n_splits=5)
model_et_stage2 = train_model_kfold(X_stage2, y_stage2, model_type='extra_trees', n_splits=5)


  0%|          | 0/5 [00:00<?, ?it/s]

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=  36.8s
[CV] END max_depth=10, min_samples_split=5, n_estimators=200; total time=  40.8s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=  36.4s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=  45.2s
[CV] END max_depth=10, min_samples_split=5, n_estimators=200; total time=  46.0s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=  46.7s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=  55.6s
[CV] END max_depth=10, min_samples_split=5, n_estimators=200; total time=  19.1s
[CV] END max_depth=10, min_samples_split=5, n_estimators=200; total time=  32.6s
[CV] END max_depth=10, min_samples_split=5, n_estimators=200; total time=  36.4s
[CV] END max_depth=10, min_samples_split=2, n_estimators=500; total time= 1.7min
[CV] END max_depth=10, min_samples_split=2, n_es

: 

In [None]:

# 5. 예측 수행
submission_stage1_predictions = model_rf_stage1.predict(X_submission_stage1)
submission_stage2_predictions = model_et_stage2.predict(X_submission_stage2)

# 6. 서브미션 파일 저장
save_submission(submission_stage1_predictions, submission_stage2_predictions)
