# In [None]:
import os
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier, early_stopping
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

# 1. Time-series feature generation (default window=3)
def add_time_features(df, window=3):
    """Return a sorted copy of *df* with per-subject time-series features added.

    Rows are sorted by ``subject_id`` and ``timestamp``. For every other
    column, ten derived columns named ``{col}_{suffix}`` are appended. The
    values are computed within each ``subject_id`` group for numeric columns
    and are NaN placeholders for non-numeric columns.

    Args:
        df: Frame containing ``subject_id``, ``timestamp`` and sensor columns.
        window: Size of the rolling window and span of the EMA.

    Returns:
        The sorted frame with the derived columns appended.
    """
    def _rolling(x):
        return x.rolling(window, min_periods=1)

    def _pct_change(x):
        # pct_change divides by the previous sample, so a previous value of 0
        # yields +/-inf. Treat those like the undefined first row (0) so the
        # per-subject aggregates computed later are not poisoned by inf.
        pct = x.pct_change()
        return pct.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Single source of truth for suffixes: used both to compute the features
    # for numeric columns and to create the placeholders for the others.
    feature_funcs = {
        'rolling_mean': lambda x: _rolling(x).mean(),
        'rolling_std': lambda x: _rolling(x).std(),
        'rolling_median': lambda x: _rolling(x).median(),
        'rolling_iqr': lambda x: _rolling(x).quantile(0.75) - _rolling(x).quantile(0.25),
        # Mean absolute deviation around the window mean.
        'rolling_mad': lambda x: _rolling(x).apply(lambda y: np.mean(np.abs(y - np.mean(y)))),
        'diff': lambda x: x.diff().fillna(0),
        'expanding_mean': lambda x: x.expanding(min_periods=1).mean(),
        'ema': lambda x: x.ewm(span=window, adjust=False).mean(),
        'pct_change': _pct_change,
        # Running count of sign changes between consecutive samples.
        'zero_cross': lambda x: ((x * x.shift(1)) < 0).astype(int).cumsum(),
    }

    df_sorted = df.sort_values(['subject_id', 'timestamp'])
    # Taken from the input columns, so derived columns are never re-processed.
    sensor_cols = [c for c in df.columns if c not in ['subject_id', 'timestamp']]

    for col in sensor_cols:
        if pd.api.types.is_numeric_dtype(df_sorted[col]):
            grp = df_sorted.groupby('subject_id')[col]
            for suffix, func in feature_funcs.items():
                df_sorted[f'{col}_{suffix}'] = grp.transform(func)
        else:
            for suffix in feature_funcs:
                df_sorted[f'{col}_{suffix}'] = np.nan
    return df_sorted

# 2. Parquet preprocessing and per-subject feature aggregation
def extract_features_with_time(parquet_dir):
    """Build one per-subject feature table from all parquet files in a directory.

    Each ``*.parquet`` file that has both ``subject_id`` and ``timestamp``
    columns is expanded with ``add_time_features``. Every numeric column
    except ``subject_id`` is then aggregated per subject with mean, std, min
    and max, and the result is outer-merged on ``subject_id`` into the
    running table.

    Args:
        parquet_dir: Directory containing the parquet files.

    Returns:
        DataFrame with a ``subject_id`` column and ``{column}_{stat}`` feature
        columns, or an empty DataFrame if no file contributed features.
    """
    stats = ['mean', 'std', 'min', 'max']
    subject_features = pd.DataFrame()
    # os.listdir returns entries in arbitrary order; sorting keeps the merged
    # column order (and therefore the model input order) reproducible.
    for fname in sorted(os.listdir(parquet_dir)):
        if not fname.endswith('.parquet'):
            continue
        df = pd.read_parquet(os.path.join(parquet_dir, fname))
        if 'subject_id' not in df.columns or 'timestamp' not in df.columns:
            continue
        df_time = add_time_features(df)
        numeric_cols = df_time.select_dtypes(include=['number']).columns.tolist()
        sensor_cols = [c for c in numeric_cols if c != 'subject_id']
        if not sensor_cols:
            continue
        agg_df = df_time.groupby('subject_id')[sensor_cols].agg(stats)
        # Flatten the (column, stat) MultiIndex from the result itself so the
        # names cannot drift out of step with the aggregated columns.
        agg_df.columns = [f"{c}_{stat}" for c, stat in agg_df.columns]
        agg_df = agg_df.reset_index()
        if subject_features.empty:
            subject_features = agg_df
        else:
            subject_features = subject_features.merge(agg_df, on='subject_id', how='outer')
    return subject_features

# 3. Path configuration (everything is read from and written to ./data)
submission_save_path = './data/submission_final_boost_timestrong.csv'
submission_csv_path = './data/ch2025_submission_sample.csv'
train_csv_path = './data/ch2025_metrics_train.csv'
parquet_dir = './data/ch2025_data_items'

# 4. Build the per-subject feature table from the parquet files
subject_features = extract_features_with_time(parquet_dir=parquet_dir)

# 5-7. Load train/test tables, attach subject features, derive date features
def _add_date_features(frame):
    """Parse the two date columns of *frame* in place and add derived features.

    For ``sleep_date`` and ``lifelog_date`` this adds day-of-week, month,
    hour and weekend flag (all as category dtype) plus the day count since
    2020-01-01, then ``date_diff`` = lifelog_date - sleep_date in days.
    Returns the same frame for convenience.
    """
    base_date = pd.Timestamp("2020-01-01")
    for date_col in ['sleep_date', 'lifelog_date']:
        frame[date_col] = pd.to_datetime(frame[date_col])
        parsed = frame[date_col].dt
        frame[f'{date_col}_dayofweek'] = parsed.dayofweek.astype('category')
        frame[f'{date_col}_month'] = parsed.month.astype('category')
        frame[f'{date_col}_hour'] = parsed.hour.astype('category')
        frame[f'{date_col}_is_weekend'] = parsed.dayofweek.isin([5, 6]).astype('category')
        frame[f'{date_col}_days_since_base'] = (frame[date_col] - base_date).dt.days
    frame['date_diff'] = (frame['lifelog_date'] - frame['sleep_date']).dt.days
    return frame


# Training data: labels joined with the per-subject features
train_df = pd.read_csv(train_csv_path)
train_merged = _add_date_features(
    train_df.merge(subject_features, on='subject_id', how='left')
)

# Test data: submission template joined with the same features
submission = pd.read_csv(submission_csv_path)
test_merged = _add_date_features(
    submission.merge(subject_features, on='subject_id', how='left')
)

# 8. Split features from labels
target_cols = ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']
feature_cols = [c for c in train_merged.columns if c not in ['subject_id', 'sleep_date', 'lifelog_date'] + target_cols]
# Explicit copies: the frames are modified below, and assigning into a slice
# of the merged frames triggers pandas' SettingWithCopyWarning.
X = train_merged[feature_cols].copy()
X_test = test_merged[feature_cols].copy()

# 9. Missing values: every NaN becomes the sentinel -1
for frame in (X, X_test):
    for col in frame.columns:
        # Categorical columns must declare -1 as a category before it can be
        # used as a fill value. (isinstance check replaces the deprecated
        # pd.api.types.is_categorical_dtype.)
        if isinstance(frame[col].dtype, pd.CategoricalDtype) and -1 not in frame[col].cat.categories:
            frame[col] = frame[col].cat.add_categories([-1])
        frame[col] = frame[col].fillna(-1)

# 10. Training and prediction
# For every target, train a LightGBM and a CatBoost model on each of 5
# stratified folds, average the blended test probabilities over the folds,
# and write the most probable class into final_preds.
final_preds = submission.copy()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Positional indices of the categorical columns; they do not depend on the
# target or the fold, so compute them once.
cat_features = [i for i, col in enumerate(X.columns) if str(X[col].dtype) == 'category']

for target in target_cols:
    print(f"\nTraining target: {target}")
    y = train_merged[target]
    # Sorted unique labels. Both classifiers are expected to order their
    # predict_proba columns the same way (their classes_ attribute).
    classes = np.unique(y)
    test_pred = np.zeros((X_test.shape[0], len(classes)))

    for train_idx, val_idx in skf.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        lgb_model = LGBMClassifier(
            random_state=42, n_estimators=1200, learning_rate=0.045,
            class_weight='balanced', force_col_wise=True
        )
        lgb_model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[early_stopping(30)]
        )
        lgb_probs = lgb_model.predict_proba(X_test)

        cat_model = CatBoostClassifier(
            random_state=42, iterations=1200, learning_rate=0.045,
            depth=6, l2_leaf_reg=4, early_stopping_rounds=30, verbose=0
        )
        cat_model.fit(
            X_train, y_train,
            cat_features=cat_features,
            eval_set=(X_val, y_val)
        )
        cat_probs = cat_model.predict_proba(X_test)

        # Fixed 30/70 blend, averaged over the folds.
        test_pred += (0.3 * lgb_probs + 0.7 * cat_probs) / skf.n_splits

    # argmax gives a column index; map it back to the actual class label so
    # the output is correct even when labels are not exactly 0..k-1.
    final_preds[target] = classes[np.argmax(test_pred, axis=1)]

# 11. Save the submission (written under ./data)
final_cols = ['subject_id', 'sleep_date', 'lifelog_date', *target_cols]
final_submission = final_preds.loc[:, final_cols]
final_submission.to_csv(submission_save_path, index=False)
print("\n최종 제출 파일 저장 완료:", submission_save_path)
