In [1]:
import random
import glob
import re
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tqdm import tqdm
import torch

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("✅ 현재 디바이스:", device)



✅ 현재 디바이스: mps


In [2]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

set_seed(42)

In [3]:
LOOKBACK, PREDICT, BATCH_SIZE, EPOCHS = 28, 7, 16, 50

In [4]:
train = pd.read_csv('./data/train.csv')
#데이터확인
print("데이터확인")
print(train.shape)


데이터확인
(102676, 3)


전처리

In [6]:
# %%
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from typing import Dict, List, Tuple

# =========================
# 설정 (필요시 수정)
# =========================
VENUE_WEIGHTS = {
    '담하': 2.0, '미라시아': 2.0, '라그로타': 1.5,
    '느티나무 셀프BBQ': 1.0, '연회장': 1.0, '카페테리아': 0.8,
    '화담숲주막': 0.7, '포레스트릿': 0.5, '화담숲카페': 0.5
}

GROUP_MAP = {
    "회담숲주막": "group_2",
    "회담숲카페": "group_2",
    "카페테리아": "group_2",
    "포레스트릿": "group_2",
    "미라시아": "group_3",
    "담하": "group_3",
    "느티나무 셀프BBQ": "group_3",
    "연회장": "group_3"
}

HOLIDAYS = pd.to_datetime([
    # 2023
    '2023-01-01', '2023-01-21', '2023-01-22', '2023-01-23',
    '2023-03-01', '2023-05-05', '2023-06-06', '2023-08-15',
    '2023-09-28', '2023-09-29', '2023-09-30', '2023-10-03',
    '2023-10-09', '2023-12-25',
    # 2024
    '2024-01-01', '2024-02-09', '2024-02-10', '2024-02-11',
    '2024-03-01', '2024-05-05', '2024-06-06', '2024-08-15',
    '2024-09-16', '2024-09-17', '2024-09-18', '2024-10-03',
    '2024-10-09', '2024-12-25'
])


# ----------------- 헬퍼 -----------------
def season_from_month(m: int) -> str:
    if m in (12, 1, 2): return '겨울'
    if m in (3, 4, 5):  return '봄'
    if m in (6, 7, 8):  return '여름'
    return '가을'


def rolling_slope(x: np.ndarray) -> float:
    if len(x) < 2: return 0.0
    xs = np.arange(len(x))
    return np.polyfit(xs, x, 1)[0]


def days_to_nearest_holiday(dates: pd.Series, holidays: pd.Series) -> Tuple[pd.Series, pd.Series]:
    # dates: DatetimeIndex/Series
    d = pd.to_datetime(dates).values.astype('datetime64[D]')
    h = np.sort(pd.to_datetime(holidays).values.astype('datetime64[D]'))
    # 위치 찾기
    idx_next = np.searchsorted(h, d, side='left')
    idx_prev = idx_next - 1
    # 다음 공휴일까지
    next_valid = idx_next < len(h)
    d_next = np.full(len(d), 999, dtype=int)
    d_next[next_valid] = (h[idx_next[next_valid]] - d[next_valid]).astype('timedelta64[D]').astype(int)
    # 이전 공휴일까지
    prev_valid = idx_prev >= 0
    d_prev = np.full(len(d), 999, dtype=int)
    d_prev[prev_valid] = (d[prev_valid] - h[idx_prev[prev_valid]]).astype('timedelta64[D]').astype(int)
    return pd.Series(d_prev, index=dates.index), pd.Series(d_next, index=dates.index)


def days_since_last_positive(series: pd.Series) -> pd.Series:
    out, cnt = [], 0
    for v in series.shift(1).fillna(0):
        cnt = 0 if v > 0 else cnt + 1
        out.append(cnt)
    return pd.Series(out, index=series.index)


def base_clean(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df['영업일자'] = pd.to_datetime(df['영업일자'])
    if '영업장명' not in df.columns or '메뉴명' not in df.columns:
        df[['영업장명', '메뉴명']] = df['영업장명_메뉴명'].str.split('_', expand=True)
    df = df.sort_values(['영업장명_메뉴명', '영업일자']).reset_index(drop=True)
    return df


def calendar_features(df: pd.DataFrame, holidays: pd.Series, le_season: LabelEncoder = None, fit=False):
    df = df.copy()
    df['요일'] = df['영업일자'].dt.weekday
    df['월'] = df['영업일자'].dt.month
    df['주차'] = df['영업일자'].dt.isocalendar().week.astype(int)
    df['주말여부'] = df['요일'].isin([5, 6]).astype(int)
    df['공휴일'] = df['영업일자'].isin(holidays).astype(int)

    df['계절'] = df['월'].map(season_from_month)
    if fit:
        le_season = LabelEncoder().fit(df['계절'])
    df['계절코드'] = (le_season.transform(df['계절']) if le_season is not None else df['월'])

    # 공휴일 근접/연휴 플래그
    d_prev, d_next = days_to_nearest_holiday(df.set_index('영업일자').index.to_series(), holidays)
    df['d_prev_hol'] = d_prev.values
    df['d_next_hol'] = d_next.values
    df['내일공휴일'] = df['영업일자'].add(pd.Timedelta(days=1)).isin(holidays).astype(int)
    df['어제공휴일'] = df['영업일자'].add(pd.Timedelta(days=-1)).isin(holidays).astype(int)
    dow = df['영업일자'].dt.weekday
    df['연휴전날'] = ((df['공휴일'] == 0) & (df['내일공휴일'] == 1) & (dow >= 4)).astype(int)
    df['연휴다음날'] = ((df['공휴일'] == 0) & (df['어제공휴일'] == 1) & (dow <= 1)).astype(int)

    # 사이클릭 인코딩
    df['doy'] = df['영업일자'].dt.dayofyear
    df['sin_dow'] = np.sin(2 * np.pi * (df['요일'] / 7))
    df['cos_dow'] = np.cos(2 * np.pi * (df['요일'] / 7))
    df['sin_doy'] = np.sin(2 * np.pi * (df['doy'] / 365))
    df['cos_doy'] = np.cos(2 * np.pi * (df['doy'] / 365))

    return df, le_season


def simple_refund_block(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if '매출수량' in df.columns:
        df['refund_flag'] = (df['매출수량'] < 0).astype(int)
        df.loc[df['매출수량'] < 0, '매출수량'] = 0
        df['refund_7d_cnt'] = df.groupby('영업장명_메뉴명')['refund_flag'] \
            .transform(lambda s: s.shift(1).rolling(7, min_periods=1).sum())
    else:
        df['refund_flag'] = 0
        df['refund_7d_cnt'] = 0
    return df


def store_level_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if '매출수량' in df.columns:
        tmp = (df.groupby(['영업일자', '영업장명'], as_index=False)['매출수량'].sum()
               .rename(columns={'매출수량': '매장_일합'})
               .sort_values(['영업장명', '영업일자']))
        tmp['매장_일합_lag1'] = tmp.groupby('영업장명')['매장_일합'].shift(1)
        tmp['요일'] = tmp['영업일자'].dt.weekday
        tmp['매장_dow_mean'] = (
            tmp.groupby(['영업장명', '요일'])['매장_일합']
            .apply(lambda s: s.shift(1).expanding().mean())
            .reset_index(level=[0, 1], drop=True)
        )
        df = df.merge(tmp[['영업일자', '영업장명', '매장_일합_lag1', '매장_dow_mean']],
                      on=['영업일자', '영업장명'], how='left')
    else:
        df['매장_일합_lag1'] = 0
        df['매장_dow_mean'] = 0
    df[['매장_일합_lag1', '매장_dow_mean']] = df[['매장_일합_lag1', '매장_dow_mean']].fillna(0)
    return df


def series_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if '매출수량' not in df.columns:
        # 매출 없으면 시계열 파생 최소화
        for c in ['lag_1', 'lag_7', 'lag_14', 'lag_21', 'lag_28',
                  'roll_mean_3', 'roll_mean_7', 'roll_mean_14', 'roll_mean_28',
                  'roll_std_7', 'roll_min_7', 'roll_max_7', 'roll_std_14',
                  'roll_slope_7', 'roll_slope_14', 'ewm_0_3', 'ewm_0_1',
                  'dow_mean_hist', 'days_since_pos', '어제_매출', '최근7일_평균']:
            df[c] = 0
        return df

    grp = df.groupby('영업장명_메뉴명')['매출수량']
    df['lag_1'] = grp.shift(1)
    for k in [7, 14, 21, 28]:
        df[f'lag_{k}'] = grp.shift(k)

    s1 = grp.shift(1)
    for w in [3, 7, 14, 28]:
        df[f'roll_mean_{w}'] = s1.rolling(w, min_periods=1).mean()
    df['roll_std_7'] = s1.rolling(7, min_periods=1).std().fillna(0)
    df['roll_min_7'] = s1.rolling(7, min_periods=1).min()
    df['roll_max_7'] = s1.rolling(7, min_periods=1).max()
    df['roll_std_14'] = s1.rolling(14, min_periods=1).std().fillna(0)

    for w in [7, 14]:
        df[f'roll_slope_{w}'] = s1.rolling(w, min_periods=2).apply(rolling_slope, raw=True).fillna(0)

    df['ewm_0_3'] = grp.shift(1).ewm(alpha=0.3, adjust=False).mean()
    df['ewm_0_1'] = grp.shift(1).ewm(alpha=0.1, adjust=False).mean()

    df['dow_mean_hist'] = (
        df.groupby(['영업장명_메뉴명', '요일'])['매출수량']
        .apply(lambda s: s.shift(1).expanding().mean())
        .reset_index(level=[0, 1], drop=True)
    )

    df['days_since_pos'] = df.groupby('영업장명_메뉴명')['매출수량'].transform(days_since_last_positive)

    df['어제_매출'] = df['lag_1'].fillna(0)
    df['최근7일_평균'] = df['roll_mean_7'].fillna(0)

    # 결측 보정
    for c in ['lag_1', 'lag_7', 'lag_14', 'lag_21', 'lag_28',
              'roll_mean_3', 'roll_mean_7', 'roll_mean_14', 'roll_mean_28',
              'roll_std_7', 'roll_min_7', 'roll_max_7', 'roll_std_14',
              'roll_slope_7', 'roll_slope_14', 'ewm_0_3', 'ewm_0_1',
              'dow_mean_hist', 'days_since_pos']:
        df[c] = df[c].fillna(0)
    return df


def static_encodings(df: pd.DataFrame, le_store: LabelEncoder, le_menu: LabelEncoder,
                     venue_weights: Dict[str, float], group_map: Dict[str, str],
                     fit=False):
    df = df.copy()
    if fit:
        le_store.fit(df['영업장명'])
        le_menu.fit(df['메뉴명'])
    df['업장코드'] = le_store.transform(df['영업장명'])
    df['메뉴코드'] = le_menu.transform(df['메뉴명'])
    df['영업장_가중치'] = df['영업장명'].map(venue_weights).fillna(1.0)
    df['업장그룹'] = df['영업장명'].map(group_map).fillna("group_2")
    return df


# =========================
# 메인 전처리 엔트리 (외부 API X)
# =========================
def fit_preprocessor(train_df: pd.DataFrame,
                     holidays: pd.Series = HOLIDAYS,
                     venue_weights: Dict[str, float] = VENUE_WEIGHTS,
                     group_map: Dict[str, str] = GROUP_MAP,
                     use_simple_refund: bool = False):
    """train에 전처리 적용 + 인코더/피처리스트 state 반환"""
    le_store = LabelEncoder()
    le_menu = LabelEncoder()
    # 1) 기본/정렬
    df = base_clean(train_df)
    # 2) 환불 간단 처리(선택)
    if use_simple_refund:
        df = simple_refund_block(df)
    # 3) 캘린더(계절 인코더 fit)
    df, le_season = calendar_features(df, holidays, le_season=None, fit=True)
    # 4) 시계열 파생
    df = series_features(df)
    # 5) 계층 피처
    df = store_level_features(df)
    # 6) 정적 인코딩(fit)
    df = static_encodings(df, le_store, le_menu, venue_weights, group_map, fit=True)

    # feature list
    base_feats = [
        '요일', '주말여부', '공휴일', '계절코드',
        '어제_매출', '최근7일_평균',
        '영업장_가중치', '업장코드', '메뉴코드',
        '내일공휴일', '어제공휴일', 'd_prev_hol', 'd_next_hol', '연휴전날', '연휴다음날',
        'sin_dow', 'cos_dow', 'sin_doy', 'cos_doy',
        '매장_일합_lag1', '매장_dow_mean'
    ]
    extra_feats = [
        'lag_1', 'lag_7', 'lag_14', 'lag_21', 'lag_28',
        'roll_mean_3', 'roll_mean_7', 'roll_std_7', 'roll_min_7', 'roll_max_7',
        'roll_mean_14', 'roll_std_14', 'roll_mean_28',
        'roll_slope_7', 'roll_slope_14',
        'ewm_0_3', 'ewm_0_1', 'dow_mean_hist', 'days_since_pos',
    ]
    refund_feats = ['refund_flag', 'refund_7d_cnt'] if use_simple_refund else []
    feature_cols = list(dict.fromkeys(base_feats + extra_feats + refund_feats))

    # 결측 보정
    for c in feature_cols:
        if c not in df.columns: df[c] = 0
    df[feature_cols] = df[feature_cols].fillna(0)

    state = {
        'feature_cols': feature_cols,
        'le_store': le_store,
        'le_menu': le_menu,
        'le_season': le_season,
        'holidays': pd.to_datetime(holidays),
        'venue_weights': venue_weights,
        'group_map': group_map,
        'use_simple_refund': use_simple_refund,
    }
    return df, state


def transform_with_state(df: pd.DataFrame, state: dict):
    """test/추가 데이터에 동일 전처리 적용 (외부 API X)"""
    df = base_clean(df)
    if state.get('use_simple_refund', False) and '매출수량' in df.columns:
        df = simple_refund_block(df)

    # 캘린더(계절코드 transform)
    df, _ = calendar_features(df, state['holidays'], le_season=state['le_season'], fit=False)
    # 시계열/계층
    df = series_features(df)
    df = store_level_features(df)
    # 정적 인코딩(transform)
    df = static_encodings(df, state['le_store'], state['le_menu'],
                          state['venue_weights'], state['group_map'], fit=False)

    # feature align
    for c in state['feature_cols']:
        if c not in df.columns: df[c] = 0
    df[state['feature_cols']] = df[state['feature_cols']].fillna(0)
    return df



In [8]:
# =========================
train_proc, state = fit_preprocessor(train, holidays=HOLIDAYS, use_simple_refund=False)
feature_cols = state['feature_cols']
# test00_proc = transform_with_state(test00, state)  # 예: TEST_00.csv 로드 후
X_train, y = train_proc[feature_cols], train_proc['매출수량']

학습

In [7]:
import lightgbm as lgb
from lightgbm import early_stopping
from sklearn.model_selection import train_test_split
import joblib
import os

In [10]:
# %%
import os, numpy as np, pandas as pd, joblib, lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from lightgbm.callback import early_stopping, log_evaluation

def smape(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred))
    denom[denom == 0] = 1.0
    return np.mean(2.0 * np.abs(y_pred - y_true) / denom)

def train_lgbm_by_group(train_proc, feature_cols, model_dir="./models/lgbm", n_splits=5):
    os.makedirs(model_dir, exist_ok=True)
    models, rows = {}, []

    # 범주형 캐스팅(있으면 이득)
    cat_cols = [c for c in ['업장코드','메뉴코드','요일','월','주차','계절코드'] if c in train_proc.columns]
    df_all = train_proc.copy()
    for c in cat_cols:
        df_all[c] = df_all[c].astype('category')

    for g in sorted(df_all['업장그룹'].unique()):
        df = df_all[df_all['업장그룹']==g].sort_values('영업일자').reset_index(drop=True)
        # 🔧 추가: 타깃 음수 제거(환불은 0으로 간주)
        df['매출수량'] = pd.to_numeric(df['매출수량'], errors='coerce').fillna(0)
        neg_cnt = (df['매출수량'] < 0).sum()
        if neg_cnt > 0:
            print(f"[Group {g}] 음수 타깃 {neg_cnt}건 → 0으로 치환")
            df.loc[df['매출수량'] < 0, '매출수량'] = 0
        X = df[feature_cols]
        y = df['매출수량'].astype(float)
        w = df['영업장_가중치'].astype(float)

        #epch 2배 해보기, 50이나 100일 경우
        params = dict(
            objective='tweedie',                 # 0이 많고 분산 큰 수요 데이터에 적합
            tweedie_variance_power=1.2,
            learning_rate=0.05,
            n_estimators=4000,
            num_leaves=63,
            max_depth=-1,
            min_data_in_leaf=32,
            feature_fraction=0.9,
            bagging_fraction=0.8,
            bagging_freq=1,
            lambda_l1=0.0,
            lambda_l2=2.0,
            random_state=42,
            n_jobs=-1
        )

        print(f"\n🧪 [Group {g}] CV 학습 시작")
        tscv = TimeSeriesSplit(n_splits=n_splits)
        oof = np.zeros(len(df))
        best_model, best_smape = None, 1e9

        for fold, (tr_idx, va_idx) in enumerate(tscv.split(X, y)):
            X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
            y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
            w_tr, w_va = w.iloc[tr_idx], w.iloc[va_idx]

            model = lgb.LGBMRegressor(**params)
            model.fit(
                X_tr, y_tr,
                sample_weight=w_tr,
                eval_set=[(X_va, y_va)],
                eval_sample_weight=[w_va],
                eval_metric='mae',
                callbacks=[early_stopping(200), log_evaluation(100)]
            )
            pred = np.clip(model.predict(X_va), 0, None)
            oof[va_idx] = pred
            fold_smape = smape(y_va.values, pred)
            print(f"  - fold{fold}: SMAPE {fold_smape:.4f}")
            if fold_smape < best_smape:
                best_smape, best_model = fold_smape, model

        group_smape = smape(y.values, np.clip(oof, 0, None))
        print(f"✅ [Group {g}] OOF SMAPE: {group_smape:.4f} (best_fold {best_smape:.4f})")

        joblib.dump(best_model, os.path.join(model_dir, f"lgbm_{g}.pkl"))
        models[g] = best_model
        rows.append((g, group_smape, best_smape, len(df)))

    rep = pd.DataFrame(rows, columns=['group','oof_smape','best_fold_smape','n']).sort_values('oof_smape')
    rep.to_csv(os.path.join(model_dir, "cv_report.csv"), index=False)
    print("\n=== 그룹별 결과 요약 ===")
    print(rep)
    return models, rep

# 실행
models, cv_report = train_lgbm_by_group(train_proc, feature_cols, model_dir="./models/lgbm", n_splits=5)


[Group group_2] 음수 타깃 6건 → 0으로 치환

🧪 [Group group_2] CV 학습 시작
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001674 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5829
[LightGBM] [Info] Number of data points in the train set: 6563, number of used features: 40
[LightGBM] [Info] Start training from score 2.956388
Training until validation scores don't improve for 200 rounds
[100]	valid_0's l1: 7.44686	valid_0's tweedie: 32.9168
[200]	valid_0's l1: 7.63471	valid_0's tweedie: 38.1774
Early stopping, best iteration is:
[38]	valid_0's l1: 7.53229	valid_0's tweedie: 29.6279
  - fold0: SMAPE 1.3569
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001102 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

예측

In [12]:
# %%
import os, glob, numpy as np, pandas as pd, joblib

DATA_DIR = "./data"
SAMPLE_PATH = os.path.join(DATA_DIR, "sample_submission.csv")
OUTPUT_PATH = "./0810_lgbm_submission.csv"

# sample 형식 파악
sample = pd.read_csv(SAMPLE_PATH)
is_wide = '영업장명_메뉴명' not in sample.columns
if is_wide:
    menu_cols = [c for c in sample.columns if c != '영업일자']
else:
    # long 포맷(가정): ['영업일자','영업장명_메뉴명','예측'] 형태
    raise ValueError("현재 코드는 sample의 'wide' 형식(열=메뉴)만 지원하도록 작성됨.")

def key_label(k_idx, d):
    return f"TEST_{k_idx:02d}+{d}일"

def forecast_between_tests(models, state, feature_cols, data_dir=DATA_DIR):
    """TEST_00 ~ TEST_09 각각의 '다음 7일'을 반복예측하여 dict로 반환"""
    pred_store = {}  # label -> DataFrame(['영업장명_메뉴명','pred'])
    test_paths = sorted(glob.glob(os.path.join(data_dir, "TEST_*.csv")))
    if not test_paths:
        raise FileNotFoundError("TEST 파일을 찾지 못했습니다.")

    for k_idx, path in enumerate(test_paths):
        anchor = pd.read_csv(path)
        # 기본 정리(열 분리/정렬)
        anchor = base_clean(anchor)
        # 예측 대상 키(업장명_메뉴명) 기준은 anchor에 있는 전체 키로
        key_df = anchor[['영업장명_메뉴명','영업장명','메뉴명']].drop_duplicates().reset_index(drop=True)

        sim = anchor.copy()
        last_date = sim['영업일자'].max()
        for d in range(1, 8):
            future_date = last_date + pd.Timedelta(days=d)
            # 미래 행(매출수량 미정) 추가
            new_rows = key_df.copy()
            new_rows['영업일자'] = future_date
            new_rows['매출수량'] = np.nan
            sim = pd.concat([sim, new_rows], ignore_index=True)

            # 피처 생성 (과거+지금까지의 예측을 포함)
            sim_proc = transform_with_state(sim, state)
            fut = sim_proc[sim_proc['영업일자'] == future_date].copy()

            # 🔒 feature guard: 예측 전에 한 번만 체크 (여기에 넣기!)
            missing = [c for c in feature_cols if c not in fut.columns]
            if missing:
                # 디버깅을 원하면 에러를 내고
                raise ValueError(f"예측 피처 누락: {missing}")
                # 혹은 조용히 0으로 만들어 진행하고 싶다면 아래 두 줄로 대체
                # for c in missing: fut[c] = 0
                # fut[missing] = fut[missing].fillna(0)

            # 그룹별 모델로 예측
            fut['pred'] = 0.0
            for g, model in models.items():
                m = fut['업장그룹'] == g
                if m.any():
                    # (수정)
                    X_infer = fut.loc[m, feature_cols]
                    # 열 순서가 학습과 100% 동일해야 하므로 feature_cols 그대로 사용
                    fut.loc[m, 'pred'] = np.clip(model.predict(X_infer.values), 0, None)
                    # fut.loc[m, 'pred'] = np.clip(model.predict(fut.loc[m, feature_cols]), 0, None)

            # 다음 날을 위해 sim의 매출수량에 방금 예측값을 반영
            sim.loc[(sim['영업일자']==future_date), '매출수량'] = fut['pred'].values

            # 저장(샘플 포맷용 라벨)
            label = key_label(k_idx, d)
            pred_store[label] = fut[['영업장명_메뉴명','pred']].reset_index(drop=True)

    return pred_store

pred_store = forecast_between_tests(models, state, feature_cols, DATA_DIR)

# === sample_submission 채우기 (wide 포맷) ===
sub = sample.copy()
sub_rows = []

# 빠르게 조회하게 dict로 변환: label -> {키: 값}
store_map = {}
for label, dfp in pred_store.items():
    store_map[label] = dict(zip(dfp['영업장명_메뉴명'], dfp['pred']))

# sample의 각 라벨(영업일자=TEST_xx+?일) 순회하며 해당 행 채우기
for i, row in sub.iterrows():
    label = row['영업일자']
    pred_row = np.zeros(len(menu_cols), dtype=float)
    if label in store_map:
        mp = store_map[label]
        # 메뉴 열 순서대로 값 채우기(없으면 0)
        pred_row = np.array([mp.get(col, 0.0) for col in menu_cols], dtype=float)
    sub_rows.append(pred_row)

sub[menu_cols] = np.vstack(sub_rows)
sub.to_csv(OUTPUT_PATH, index=False)
print("✅ 제출 파일 저장:", OUTPUT_PATH)




















































































































































































































































































✅ 제출 파일 저장: ./0810_lgbm_submission.csv




예측-7일

In [12]:
import os, glob, re, warnings
import numpy as np
import pandas as pd
from datetime import timedelta
import joblib
warnings.filterwarnings("ignore")

# ================== 경로/설정 ==================
DATA_DIR = "./data"                                 # test, train, sample_submission.csv 위치
TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
SAMPLE_PATH = os.path.join(DATA_DIR, "sample_submission.csv")
OUTPUT_PATH = "./0809_submission8.csv"

ENC_STORE_PATH = "models/le_영업장명.pkl"
ENC_MENU_PATH  = "models/le_메뉴명.pkl"
GROUP_MODEL_DIRS = {
    "group_2": "models/group2",
    "group_3": "models/group3",
}

SALES_COL = "매출수량"
WINDOW_DAYS_FOR_SEED = 60  # 랙 계산을 위한 히스토리 윈도
GAP_DAYS = 7               # test 사이 7일 + 마지막 이후 7일

# ================== 도메인 맵/휴일 ==================
venue_weights = {
    '담하': 2.0, '미라시아': 2.0, '라그로타': 1.5,
    '느티나무 셀프BBQ': 1.0, '연회장': 1.0, '카페테리아': 0.8,
    '화담숲주막': 0.7, '포레스트릿': 0.5, '화담숲카페': 0.5
}
group_map = {
    "회담숲주막": "group_2", "회담숲카페": "group_2", "느티나무 셀프BBQ": "group_3", "연회장": "group_3",
    "카페테리아": "group_2", "포레스트릿": "group_2",
    "미라시아": "group_3", "담하": "group_3"
}
holiday_list = pd.to_datetime([
    # 2023
    '2023-01-01','2023-01-21','2023-01-22','2023-01-23','2023-03-01','2023-05-05',
    '2023-06-06','2023-08-15','2023-09-28','2023-09-29','2023-09-30','2023-10-03',
    '2023-10-09','2023-12-25',
    # 2024
    '2024-01-01','2024-02-09','2024-02-10','2024-02-11','2024-03-01','2024-05-05','2024-05-06',
    '2024-06-06','2024-08-15','2024-09-16','2024-09-17','2024-09-18',
    '2024-10-03','2024-10-09','2024-12-25'
])

# ================== 공통 피처 (학습과 동일) ==================
# 학습 때 '계절코드' 포함했다고 가정. 만약 일부 모델이 8개(계절코드 없음)라면 아래 align_features_to_model에서 자동 보정됨.
FEATURE_COLS = [
    '요일','주말여부','공휴일',
    '어제_매출','최근7일_평균',
    '영업장_가중치',
    '업장코드','메뉴코드',
    '계절코드'
]

# ================== 유틸 함수 ==================
def safe_key(text: str) -> str:
    return re.sub(r"[^\w\-]", "_", text)

def get_season(month: int) -> str:
    if month in [12, 1, 2]: return '겨울'
    if month in [3, 4, 5]:  return '봄'
    if month in [6, 7, 8]:  return '여름'
    return '가을'

def add_calendar_feats(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df['영업일자'] = pd.to_datetime(df['영업일자'])
    df['공휴일']   = df['영업일자'].isin(holiday_list).astype(int)
    df['요일']     = df['영업일자'].dt.weekday
    df['월']       = df['영업일자'].dt.month
    df['주차']     = df['영업일자'].dt.isocalendar().week.astype(int)
    df['주말여부'] = df['요일'].isin([5, 6]).astype(int)
    if '영업장명_메뉴명' in df.columns and '영업장명' not in df.columns:
        df[['영업장명','메뉴명']] = df['영업장명_메뉴명'].str.split('_', n=1, expand=True)
    return df

def load_all_group_models():
    out = {}
    for g, d in GROUP_MODEL_DIRS.items():
        models = {}
        if os.path.isdir(d):
            for p in glob.glob(os.path.join(d, "*.pkl")):
                key = os.path.basename(p)[:-4]
                try:
                    models[key] = joblib.load(p)
                except Exception as e:
                    print(f"⚠️ 모델 로드 실패: {p} -> {e}")
        print(f"✅ {g} 모델 수: {len(models)}")
        out[g] = models
    return out

def get_recent_history(train_df: pd.DataFrame, key: str, upto_date: pd.Timestamp, window_days=WINDOW_DAYS_FOR_SEED):
    sdf = train_df[train_df['영업장명_메뉴명'] == key].copy()
    sdf = sdf[sdf['영업일자'] < upto_date].sort_values('영업일자')
    if not sdf.empty:
        last = sdf['영업일자'].max()
        sdf  = sdf[sdf['영업일자'] >= last - pd.Timedelta(days=window_days)]
    return sdf[['영업일자', SALES_COL]]

def calc_lag_features(hist_df: pd.DataFrame, current_date: pd.Timestamp):
    prev = hist_df[hist_df['영업일자'] < current_date].sort_values('영업일자').tail(7)
    if prev.empty:
        return 0.0, 0.0
    y1 = float(prev.iloc[-1][SALES_COL])
    y7 = float(prev[SALES_COL].mean())
    return y1, y7

# 학습-예측 피처 불일치 자동 보정 (모델 feature_name 기준으로 정렬/채우기/드롭)
_printed_align_once = {"done": False}
def align_features_to_model(Xrow: pd.DataFrame, model) -> pd.DataFrame:
    exp = list(model.feature_name())
    X = Xrow.copy()
    for col in exp:
        if col not in X.columns:
            X[col] = 0.0
    X = X[exp]
    if not _printed_align_once["done"]:
        print("\n[align] 학습피처:", exp)
        print("[align] 입력피처(정렬 후):", list(X.columns))
        print("[align] 개수(학습/입력):", len(exp), "/", X.shape[1])
        _printed_align_once["done"] = True
    return X

def build_feature_row(date, venue, menu, le_store, le_menu, le_season, y1, y7):
    row = pd.DataFrame({'영업일자':[pd.to_datetime(date)]})
    row = add_calendar_feats(row)
    row['어제_매출']   = y1
    row['최근7일_평균'] = y7
    row['영업장_가중치'] = venue_weights.get(venue, 1.0)
    row['업장코드'] = le_store.transform([venue])[0]
    row['메뉴코드'] = le_menu.transform([menu])[0]
    season_str = get_season(int(row['월'].iloc[0]))
    row['계절코드'] = le_season.transform([season_str])[0]
    return row[FEATURE_COLS].astype(float)

def predict_one_step(model, Xrow: pd.DataFrame) -> float:
    try:
        X_aligned = align_features_to_model(Xrow, model)
        pred = float(model.predict(X_aligned)[0])
    except Exception as e:
        print(f"⚠️ 예측 실패 -> {type(e).__name__}: {e}")
        pred = 0.0
    return max(pred, 0.0)

# ================== 데이터 로드 ==================
# test 앵커 합치기
test_files = sorted(glob.glob(os.path.join(DATA_DIR, "TEST_*.csv")))
if not test_files:
    raise FileNotFoundError("테스트 파일(TEST_*.csv)을 찾지 못했습니다.")
tests = [pd.read_csv(p) for p in test_files]
anchors = pd.concat(tests, ignore_index=True)
anchors = add_calendar_feats(anchors)
if '영업장명' not in anchors.columns or '메뉴명' not in anchors.columns:
    anchors[['영업장명','메뉴명']] = anchors['영업장명_메뉴명'].str.split('_', n=1, expand=True)
anchors = anchors.sort_values(['영업장명_메뉴명','영업일자'])

# train 로드 (계절 인코더/랙 시드용)
train = pd.read_csv(TRAIN_PATH)
train = add_calendar_feats(train)
train[SALES_COL] = train[SALES_COL].clip(lower=0)

# 계절 라벨 인코더 (학습과 동일 규칙 재현)
from sklearn.preprocessing import LabelEncoder
le_season = LabelEncoder()
train['계절'] = train['월'].apply(get_season)
le_season.fit(train['계절'])

# 저장된 라벨 인코더 로드
le_store = joblib.load(ENC_STORE_PATH)
le_menu  = joblib.load(ENC_MENU_PATH)

# 그룹 모델 로드
group_models = load_all_group_models()

# (선택) 모델 피처셋 샘플 확인
for g, mdict in group_models.items():
    if mdict:
        any_model = next(iter(mdict.values()))
        try:
            print(f"[{g}] 학습 피처 개수:", len(any_model.feature_name()))
        except:
            pass

# ================== 예측: 각 키에 대해 test 사이 7일 + 마지막 뒤 7일 ==================
out_rows = []
for key, kdf in anchors.groupby('영업장명_메뉴명'):
    kdf = kdf.sort_values('영업일자').reset_index(drop=True)
    venue = kdf['영업장명'].iloc[0]
    menu  = kdf['메뉴명'].iloc[0]
    grp   = group_map.get(venue, 'group_3')

    # 모델 찾기 (정확키 / safe_key 둘 다 시도)
    mdl_dict = group_models.get(grp, {})
    mdl = mdl_dict.get(key)
    if mdl is None:
        mdl = mdl_dict.get(safe_key(key))

    # 시드 히스토리
    first_anchor = pd.to_datetime(kdf['영업일자'].min())
    hist = get_recent_history(train, key, first_anchor, window_days=WINDOW_DAYS_FOR_SEED)

    # 인접 앵커 사이 7일
    anchor_dates = kdf['영업일자'].tolist()
    for i in range(len(anchor_dates)-1):
        start = pd.to_datetime(anchor_dates[i])
        end   = pd.to_datetime(anchor_dates[i+1])
        gap_days = [start + timedelta(days=d) for d in range(1, GAP_DAYS+1)]
        gap_days = [d for d in gap_days if d < end]
        if len(gap_days) == 0:
            continue

        for d in gap_days:
            y1, y7 = calc_lag_features(hist, d)
            Xrow = build_feature_row(d, venue, menu, le_store, le_menu, le_season, y1, y7)
            yhat = predict_one_step(mdl, Xrow) if mdl is not None else max(y7, 0.0)
            out_rows.append({'영업일자': d, '영업장명_메뉴명': key, 'pred': yhat})
            hist = pd.concat([hist, pd.DataFrame([{'영업일자': d, SALES_COL: yhat}])], ignore_index=True)

    # 마지막 앵커 뒤로 7일
    last_anchor = pd.to_datetime(anchor_dates[-1])
    tail_days = [last_anchor + timedelta(days=d) for d in range(1, GAP_DAYS+1)]
    for d in tail_days:
        y1, y7 = calc_lag_features(hist, d)
        Xrow = build_feature_row(d, venue, menu, le_store, le_menu, le_season, y1, y7)
        yhat = predict_one_step(mdl, Xrow) if mdl is not None else max(y7, 0.0)
        out_rows.append({'영업일자': d, '영업장명_메뉴명': key, 'pred': yhat})
        hist = pd.concat([hist, pd.DataFrame([{'영업일자': d, SALES_COL: yhat}])], ignore_index=True)

pred_df = pd.DataFrame(out_rows)
if pred_df.empty:
    raise RuntimeError("예측 결과가 비었습니다. 모델 경로/키 이름을 확인하세요.")

# 예측 순서 고정 (키→날짜)  ← sample이 id,pred 두 컬럼만일 때도 순서로 주입 가능
pred_df = pred_df.sort_values(['영업장명_메뉴명','영업일자']).reset_index(drop=True)
print("\n[pred_df] rows:", len(pred_df))
print("[pred_df] head:\n", pred_df.head(3))

# ================== 제출 포맷 맞추기 ==================
# ================== 제출 포맷 맞추기 (sample의 '영업일자' 문자열 유지) ==================
sample = pd.read_csv(SAMPLE_PATH)
pred_df['pred'] = pred_df['pred'].clip(lower=0)

# 1) wide로 피벗하고, 날짜 오름차순 정렬 (행 순서 기준 매핑용)
wide_pred = (
    pred_df
    .pivot(index='영업일자', columns='영업장명_메뉴명', values='pred')
    .sort_index()
    .reset_index(drop=True)             # 날짜 컬럼 버리고 순서만 유지
)

# 2) sample 기준으로 컬럼(메뉴) 구성 맞추기
sample_cols = list(sample.columns)      # 첫 컬럼이 '영업일자' 문자열
menu_cols   = [c for c in sample_cols if c != '영업일자']

# wide_pred에 없는 메뉴는 0으로 채움, 있는 메뉴만 사용
wide_pred = wide_pred.reindex(columns=menu_cols, fill_value=0.0)

# 3) 행 개수 맞추기(부족하면 0으로 패딩, 초과하면 컷)
need = len(sample)
have = len(wide_pred)
if have < need:
    pad = pd.DataFrame(0.0, index=range(need - have), columns=menu_cols)
    wide_pred = pd.concat([wide_pred, pad], ignore_index=True)
elif have > need:
    wide_pred = wide_pred.iloc[:need].copy()

# 4) 최종 submit: sample의 '영업일자' 문자열을 그대로 사용
submit = pd.concat([sample[['영업일자']].copy(), wide_pred], axis=1)

# 5) 안전장치: NaN -> 0, 음수 방지, 컬럼 순서 최종 정렬
submit = submit.reindex(columns=sample_cols).fillna(0)
num_cols = [c for c in submit.columns if c != '영업일자']
submit[num_cols] = submit[num_cols].clip(lower=0)

# 6) 저장
submit.to_csv(OUTPUT_PATH, index=False, encoding='utf-8-sig')
print(f"✅ 제출 파일 저장 완료: {OUTPUT_PATH}")
print("[check] shape:", submit.shape)
print("[check] sample 날짜 예시:", submit['영업일자'].iloc[:5].tolist())


✅ group_2 모델 수: 0
✅ group_3 모델 수: 0

[pred_df] rows: 13510
[pred_df] head:
         영업일자            영업장명_메뉴명      pred
0 2024-07-14  느티나무 셀프BBQ_1인 수저세트  2.714286
1 2024-07-15  느티나무 셀프BBQ_1인 수저세트  3.102041
2 2024-07-16  느티나무 셀프BBQ_1인 수저세트  3.545190
✅ 제출 파일 저장 완료: ./0809_submission8.csv
[check] shape: (70, 194)
[check] sample 날짜 예시: ['TEST_00+1일', 'TEST_00+2일', 'TEST_00+3일', 'TEST_00+4일', 'TEST_00+5일']


In [13]:
import glob, os
print('LSTM files:', glob.glob(os.path.join('models','lstm','**','*.keras'), recursive=True))


LSTM files: []


In [16]:
import os
import re
import glob
import warnings
import numpy as np
import pandas as pd
from datetime import timedelta
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import joblib
warnings.filterwarnings("ignore")

# ====== 설정 ======
DATA_DIR = "./data"
TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
SAMPLE_PATH = os.path.join(DATA_DIR, "sample_submission.csv")
OUTPUT_PATH = "./lstm_submission9.csv"

ENC_STORE_PATH = "models/le_영업장명.pkl"
ENC_MENU_PATH  = "models/le_메뉴명.pkl"
GROUP_MODEL_DIRS = {
    "group_2": "models/group2",
    "group_3": "models/group3",
}

SALES_COL = "매출수량"
WINDOW_DAYS_FOR_SEED = 60
GAP_DAYS = 7

venue_weights = {
    '담하': 2.0, '미라시아': 2.0, '라그로타': 1.5,
    '느티나무 셀프BBQ': 1.0, '연회장': 1.0, '카페테리아': 0.8,
    '화담숲주막': 0.7, '포레스트릿': 0.5, '화담숲카페': 0.5
}
group_map = {
    "회담숲주막": "group_2", "회담숲카페": "group_2", "느티나무 셀프BBQ": "group_3", "연회장": "group_3",
    "카페테리아": "group_2", "포레스트릿": "group_2",
    "미라시아": "group_3", "담하": "group_3"
}
holiday_list = pd.to_datetime([
    '2023-01-01','2023-01-21','2023-01-22','2023-01-23','2023-03-01','2023-05-05',
    '2023-06-06','2023-08-15','2023-09-28','2023-09-29','2023-09-30','2023-10-03',
    '2023-10-09','2023-12-25',
    '2024-01-01','2024-02-09','2024-02-10','2024-02-11','2024-03-01','2024-05-05','2024-05-06',
    '2024-06-06','2024-08-15','2024-09-16','2024-09-17','2024-09-18',
    '2024-10-03','2024-10-09','2024-12-25'
])

FEATURE_COLS = ['요일','주말여부','공휴일','어제_매출','최근7일_평균','영업장_가중치','업장코드','메뉴코드','계절코드']

# ====== Dataset ======
class TimeSeriesDataset(Dataset):
    def __init__(self, series_df, feature_cols, target_col='매출수량', window_size=7):
        self.X, self.y = [], []
        for i in range(len(series_df) - window_size):
            window = series_df.iloc[i:i + window_size]
            target = series_df.iloc[i + window_size][target_col]
            self.X.append(window[feature_cols].values.astype(np.float32))
            self.y.append(np.float32(target))
        self.X = np.array(self.X)
        self.y = np.array(self.y)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return torch.tensor(self.X[idx]), torch.tensor(self.y[idx])

# ====== Model ======
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size=64, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]
        out = self.fc(out)
        return out.squeeze(1)

# ====== Helper ======
def safe_key(text: str) -> str:
    return re.sub(r"[^\w\-]", "_", text)

def get_season(month: int) -> str:
    if month in [12, 1, 2]: return '겨울'
    if month in [3, 4, 5]:  return '봄'
    if month in [6, 7, 8]:  return '여름'
    return '가을'

def add_calendar_feats(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df['영업일자'] = pd.to_datetime(df['영업일자'])
    df['공휴일']   = df['영업일자'].isin(holiday_list).astype(int)
    df['요일']     = df['영업일자'].dt.weekday
    df['월']       = df['영업일자'].dt.month
    df['주차']     = df['영업일자'].dt.isocalendar().week.astype(int)
    df['주말여부'] = df['요일'].isin([5, 6]).astype(int)
    if '영업장명_메뉴명' in df.columns and '영업장명' not in df.columns:
        df[['영업장명','메뉴명']] = df['영업장명_메뉴명'].str.split('_', n=1, expand=True)
    return df

def get_recent_history(train_df, key, upto_date, window_days=WINDOW_DAYS_FOR_SEED):
    sdf = train_df[train_df['영업장명_메뉴명'] == key].copy()
    sdf = sdf[sdf['영업일자'] < upto_date].sort_values('영업일자')
    if not sdf.empty:
        last = sdf['영업일자'].max()
        sdf  = sdf[sdf['영업일자'] >= last - pd.Timedelta(days=window_days)]
    return sdf[['영업일자', SALES_COL]]

def calc_lag_features(hist_df, current_date):
    prev = hist_df[hist_df['영업일자'] < current_date].sort_values('영업일자').tail(7)
    if prev.empty:
        return 0.0, 0.0
    y1 = float(prev.iloc[-1][SALES_COL])
    y7 = float(prev[SALES_COL].mean())
    return y1, y7

def build_feature_row(date, venue, menu, le_store, le_menu, le_season, y1, y7):
    row = pd.DataFrame({'영업일자':[pd.to_datetime(date)]})
    row = add_calendar_feats(row)
    row['어제_매출']   = y1
    row['최근7일_평균'] = y7
    row['영업장_가중치'] = venue_weights.get(venue, 1.0)
    row['업장코드'] = le_store.transform([venue])[0]
    row['메뉴코드'] = le_menu.transform([menu])[0]
    season_str = get_season(int(row['월'].iloc[0]))
    row['계절코드'] = le_season.transform([season_str])[0]
    return row[FEATURE_COLS].astype(np.float32)

# ====== Load models ======
def load_all_group_models():
    out = {}
    for g, d in GROUP_MODEL_DIRS.items():
        models = {}
        if os.path.isdir(d):
            for p in glob.glob(os.path.join(d, "*.pt")):
                key = os.path.basename(p)[:-3]
                try:
                    model = LSTMModel(input_size=len(FEATURE_COLS))
                    model.load_state_dict(torch.load(p, map_location='cpu'))
                    model.eval()
                    models[key] = model
                except Exception as e:
                    print(f"⚠️ 모델 로드 실패: {p} -> {e}")
        print(f"✅ {g} 모델 수: {len(models)}")
        out[g] = models
    return out

# ====== Prediction ======
def predict_one_step(model, Xrow):
    try:
        x_tensor = torch.tensor(Xrow.values, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            pred = float(model(x_tensor))
    except Exception as e:
        print(f"⚠️ 예측 실패 -> {type(e).__name__}: {e}")
        pred = 0.0
    return max(pred, 0.0)

# ====== 실행 ======
# Load Data
test_files = sorted(glob.glob(os.path.join(DATA_DIR, "TEST_*.csv")))
tests = [pd.read_csv(p) for p in test_files]
anchors = pd.concat(tests, ignore_index=True)
anchors = add_calendar_feats(anchors)
if '영업장명' not in anchors.columns:
    anchors[['영업장명','메뉴명']] = anchors['영업장명_메뉴명'].str.split('_', n=1, expand=True)
anchors = anchors.sort_values(['영업장명_메뉴명','영업일자'])

train = pd.read_csv(TRAIN_PATH)
train = add_calendar_feats(train)
train[SALES_COL] = train[SALES_COL].clip(lower=0)

from sklearn.preprocessing import LabelEncoder
le_season = LabelEncoder()
train['계절'] = train['월'].apply(get_season)
le_season.fit(train['계절'])

le_store = joblib.load(ENC_STORE_PATH)
le_menu  = joblib.load(ENC_MENU_PATH)

group_models = load_all_group_models()

# Prediction Loop
out_rows = []
for key, kdf in anchors.groupby('영업장명_메뉴명'):
    kdf = kdf.sort_values('영업일자').reset_index(drop=True)
    venue, menu = kdf['영업장명'].iloc[0], kdf['메뉴명'].iloc[0]
    grp = group_map.get(venue, 'group_3')

    mdl_dict = group_models.get(grp, {})
    mdl = mdl_dict.get(f"{grp}_{safe_key(key)}") or mdl_dict.get(safe_key(key))

    first_anchor = pd.to_datetime(kdf['영업일자'].min())
    hist = get_recent_history(train, key, first_anchor)

    anchor_dates = kdf['영업일자'].tolist()
    for i in range(len(anchor_dates)-1):
        start = pd.to_datetime(anchor_dates[i])
        end   = pd.to_datetime(anchor_dates[i+1])
        gap_days = [start + timedelta(days=d) for d in range(1, GAP_DAYS+1) if start + timedelta(days=d) < end]
        for d in gap_days:
            y1, y7 = calc_lag_features(hist, d)
            Xrow = build_feature_row(d, venue, menu, le_store, le_menu, le_season, y1, y7)
            yhat = predict_one_step(mdl, Xrow) if mdl else max(y7, 0.0)
            out_rows.append({'영업일자': d, '영업장명_메뉴명': key, 'pred': yhat})
            hist = pd.concat([hist, pd.DataFrame([{'영업일자': d, SALES_COL: yhat}])], ignore_index=True)

    last_anchor = pd.to_datetime(anchor_dates[-1])
    for d in [last_anchor + timedelta(days=i) for i in range(1, GAP_DAYS+1)]:
        y1, y7 = calc_lag_features(hist, d)
        Xrow = build_feature_row(d, venue, menu, le_store, le_menu, le_season, y1, y7)
        yhat = predict_one_step(mdl, Xrow) if mdl else max(y7, 0.0)
        out_rows.append({'영업일자': d, '영업장명_메뉴명': key, 'pred': yhat})
        hist = pd.concat([hist, pd.DataFrame([{'영업일자': d, SALES_COL: yhat}])], ignore_index=True)

pred_df = pd.DataFrame(out_rows).sort_values(['영업장명_메뉴명','영업일자']).reset_index(drop=True)
sample = pd.read_csv(SAMPLE_PATH)
wide_pred = pred_df.pivot(index='영업일자', columns='영업장명_메뉴명', values='pred').sort_index().reset_index(drop=True)
menu_cols = [c for c in sample.columns if c != '영업일자']
wide_pred = wide_pred.reindex(columns=menu_cols, fill_value=0.0)

need, have = len(sample), len(wide_pred)
if have < need:
    pad = pd.DataFrame(0.0, index=range(need - have), columns=menu_cols)
    wide_pred = pd.concat([wide_pred, pad], ignore_index=True)
elif have > need:
    wide_pred = wide_pred.iloc[:need].copy()

submit = pd.concat([sample[['영업일자']].copy(), wide_pred], axis=1)
submit = submit.reindex(columns=sample.columns).fillna(0)
submit[menu_cols] = submit[menu_cols].clip(lower=0)
submit.to_csv(OUTPUT_PATH, index=False, encoding='utf-8-sig')
print(f"✅ 제출 파일 저장 완료: {OUTPUT_PATH}")


✅ group_2 모델 수: 0
✅ group_3 모델 수: 0
✅ 제출 파일 저장 완료: ./lstm_submission9.csv


↑ 여기까지.

예측&저장 해보기