In [54]:
import numpy as np
import pandas as pd

In [None]:
cons = pd.read_csv("consultations.csv", encoding="utf-8-sig", parse_dates=["start_time","end_time","consultation_date"])
cft  = pd.read_csv("consultation_fund_types.csv", encoding="utf-8-sig")
ft   = pd.read_csv("fund_types.csv", encoding="utf-8-sig")

1) 공통 전처리: 슬롯/요일 생성 + sanity check


In [56]:
# --- 슬롯/요일 컬럼 만들기 ---
base = cons.copy()
base["ts_slot"]   = base["start_time"].dt.floor("h")   # 09:23 -> 09:00
base["slot_hour"] = base["ts_slot"].dt.hour            # 9,10,11,...
base["weekday"]   = base["ts_slot"].dt.weekday         # 0=월 ~ 6=일

# --- 확인(이미 9–18, 점심 제외) ---
allowed_hours = sorted(base["slot_hour"].unique().tolist())
print("데이터에 존재하는 시간대:", allowed_hours)
print("요일 분포(0=월 ~ 6=일):\n", base["weekday"].value_counts().sort_index())
print(pd.crosstab(base["weekday"], base["slot_hour"]))

데이터에 존재하는 시간대: [9, 10, 11, 12, 13, 14, 15, 16, 17]
요일 분포(0=월 ~ 6=일):
 weekday
0    5925
1    5954
2    6047
3    6064
4    6010
Name: count, dtype: int64
slot_hour   9    10   11   12    13    14    15   16   17
weekday                                                  
0          387  365  409  394  1227  1221  1169  391  362
1          398  456  378  368  1174  1193  1194  429  364
2          380  366  426  399  1216  1200  1247  377  436
3          397  416  379  427  1172  1219  1238  402  414
4          381  420  379  385  1209  1207  1239  413  377


2) 회귀용 프레임 만들기 (슬롯별 상담 건수 y)
- 핵심: 날짜×시간 그리드를 만들고 상담이 없던 슬롯도 y=0으로 채움 -> 학습 안정화


In [57]:
# --- 날짜 × (데이터에 실제 존재하는) 시간대 그리드 ---
days = pd.date_range(base["ts_slot"].dt.normalize().min(),
                     base["ts_slot"].dt.normalize().max(), freq="D")

grid = (pd.DataFrame({"date": days}).assign(key=1)
        .merge(pd.DataFrame({"slot_hour": allowed_hours, "key":1}), on="key")
        .drop("key", axis=1))
grid["ts_slot"] = grid["date"] + pd.to_timedelta(grid["slot_hour"], unit="h")
grid["weekday"] = grid["ts_slot"].dt.weekday

# --- y = 슬롯별 상담 건수 (없으면 0으로 채움) ---
y_cnt = base.groupby("ts_slot").size().rename("y").reset_index()
Xy_reg = (grid.merge(y_cnt, on="ts_slot", how="left")
               .fillna({"y":0})
               .sort_values("ts_slot").reset_index(drop=True))

# --- 간단 파생 피처 ---
Xy_reg["hour_sin"] = np.sin(2*np.pi*Xy_reg["slot_hour"]/24)
Xy_reg["hour_cos"] = np.cos(2*np.pi*Xy_reg["slot_hour"]/24)

# 직전 슬롯(lag_1)
Xy_reg["lag_1"] = Xy_reg["y"].shift(1)

# 전일 같은 슬롯(lag_day): ts_slot을 +1일 시켜서 맞춰붙이기 (슬롯 수가 24가 아니어도 안전)
prev = Xy_reg[["ts_slot","y"]].copy()
prev["ts_slot"] = prev["ts_slot"] + pd.Timedelta(days=1)
Xy_reg = Xy_reg.merge(prev.rename(columns={"y":"lag_day"}), on="ts_slot", how="left")

# 하루 이동평균 (운영 슬롯 수 기준)
slots_per_day = len(allowed_hours)
Xy_reg["roll_day"] = Xy_reg["y"].rolling(slots_per_day, min_periods=1).mean()

# 래깅으로 생긴 앞부분 결측 제거
Xy_reg = Xy_reg.dropna().reset_index(drop=True)

print("\n[회귀용 Xy_reg 미리보기]")
print(Xy_reg.head())
print(Xy_reg.columns.tolist())


[회귀용 Xy_reg 미리보기]
        date  slot_hour             ts_slot  weekday     y      hour_sin  \
0 2025-01-02          9 2025-01-02 09:00:00        3  11.0  7.071068e-01   
1 2025-01-02         10 2025-01-02 10:00:00        3  11.0  5.000000e-01   
2 2025-01-02         11 2025-01-02 11:00:00        3  17.0  2.588190e-01   
3 2025-01-02         12 2025-01-02 12:00:00        3  14.0  1.224647e-16   
4 2025-01-02         13 2025-01-02 13:00:00        3  33.0 -2.588190e-01   

   hour_cos  lag_1  lag_day   roll_day  
0 -0.707107    5.0      8.0  13.444444  
1 -0.866025   11.0     11.0  13.444444  
2 -0.965926   11.0      6.0  14.666667  
3 -1.000000   17.0     11.0  15.000000  
4 -0.965926   14.0     17.0  16.777778  
['date', 'slot_hour', 'ts_slot', 'weekday', 'y', 'hour_sin', 'hour_cos', 'lag_1', 'lag_day', 'roll_day']


3) 분류용 프레임 만들기 (슬롯별 최다 자금유형 label)

- 핵심: 슬롯×자금유형 건수를 세고 각 슬롯에서 가장 많은 유형을 라벨로 잡아줌
- 보조 피처: 지난주 같은 슬롯의 라벨(label_lag7)

In [58]:
# ========= 대분류 라벨 파이프라인 =========
import re

# 1) fund_type_name -> major(대분류) 변환기
def to_major(name: str) -> str:
    if pd.isna(name):
        return "OTHER"
    s = str(name).strip()
    # 하이픈 다양성 처리: -, –(en dash), —(em dash)
    parts = re.split(r"\s*[-–—]\s*", s, maxsplit=1)
    major = parts[0].strip()
    return major if major else s  # 비어있으면 원본

# 2) 이벤트 단위로 ts_slot/weekday/slot_hour 붙이고 'major' 생성
cf = (
    cft.merge(base[["consultation_id","ts_slot","weekday","slot_hour"]],
              on="consultation_id", how="inner")
      .merge(ft, on="fund_type_id", how="left")
)
cf["major"] = cf["fund_type_name"].map(to_major)

# 3) 슬롯 × 대분류 건수 집계
slot_cnt_major = (
    cf.groupby(["ts_slot","major"]).size()
      .rename("cnt").reset_index()
)

# 4) 같은 ts_slot 내에서 cnt 최댓값 대분류 하나 선택(동률 시 처음 것)
top_major = (
    slot_cnt_major.sort_values(["ts_slot","cnt"], ascending=[True, False])
                  .drop_duplicates("ts_slot")
                  .rename(columns={"major":"label"})
                  .reset_index(drop=True)
)

# 5) 피처 붙이기: 요일/슬롯시각
Xy_cls = (
    top_major.merge(
        base.drop_duplicates("ts_slot")[["ts_slot","weekday","slot_hour"]],
        on="ts_slot", how="left"
    )
    .sort_values("ts_slot")
    .reset_index(drop=True)
)

# 6) 지난주 같은 슬롯의 '대분류' 라벨(lag7) 붙이기
lag = Xy_cls[["ts_slot","label"]].copy()
lag["ts_slot"] = lag["ts_slot"] + pd.Timedelta(days=7)
Xy_cls = Xy_cls.merge(lag.rename(columns={"label":"label_lag7"}), on="ts_slot", how="left")
Xy_cls["label_lag7"] = Xy_cls["label_lag7"].fillna("OTHER")

# 7) 원핫 인코딩(요일, 지난주 대분류)
Xy_cls = pd.concat([
    Xy_cls,
    pd.get_dummies(Xy_cls["weekday"],    prefix="wd",   drop_first=False),
    pd.get_dummies(Xy_cls["label_lag7"], prefix="lag7", drop_first=False)
], axis=1)

# (옵션) 불리언을 0/1로 바꾸고 싶다면:
bool_cols = Xy_cls.select_dtypes(include=["bool"]).columns
Xy_cls[bool_cols] = Xy_cls[bool_cols].astype("uint8")

print("\n[대분류 기준 분류용 Xy_cls 미리보기]")
print(Xy_cls.head())
print(Xy_cls.columns.tolist())



[대분류 기준 분류용 Xy_cls 미리보기]
              ts_slot  label  cnt  weekday  slot_hour label_lag7  wd_0  wd_1  \
0 2025-01-01 09:00:00    일반형    6        2          9      OTHER     0     0   
1 2025-01-01 10:00:00     창업    6        2         10      OTHER     0     0   
2 2025-01-01 11:00:00  스마트공장    3        2         11      OTHER     0     0   
3 2025-01-01 12:00:00    일반형    6        2         12      OTHER     0     0   
4 2025-01-01 13:00:00     창업    8        2         13      OTHER     0     0   

   wd_2  wd_3  wd_4  lag7_OTHER  lag7_스마트공장  lag7_스마트기술  lag7_운전자금  lag7_일반형  \
0     1     0     0           1           0           0          0         0   
1     1     0     0           1           0           0          0         0   
2     1     0     0           1           0           0          0         0   
3     1     0     0           1           0           0          0         0   
4     1     0     0           1           0           0          0         0   

   lag7_창업  

## 1) 회귀 — 슬롯(1시간)별 상담 건수 예측

In [59]:
# =======================
# 회귀: 시간 슬롯별 y(건수) 예측
# =======================
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor
import numpy as np
import pandas as pd

def rmse(y_true, y_pred):
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def take(A, idx):
    return A.iloc[idx] if hasattr(A, "iloc") else A[idx]

# ---- (A) 피처/타깃 준비 ----
feature_cols = ["weekday","slot_hour","hour_sin","hour_cos","lag_1","lag_day","roll_day"]
X = Xy_reg[feature_cols].astype(float)
y = Xy_reg["y"].astype(float)
t = Xy_reg["ts_slot"]

# ---- (B) 시간순 split: Train 80% 중 뒤쪽 10%를 Valid, 마지막 20% Test ----
n = len(X)
idx = np.arange(n)
cut_test = int(n * 0.8)
tr_all_idx, te_idx = idx[:cut_test], idx[cut_test:]
cut_val = int(len(tr_all_idx) * 0.9)
tr_idx, val_idx = tr_all_idx[:cut_val], tr_all_idx[cut_val:]

X_tr_all, y_tr_all = take(X, tr_all_idx), take(y, tr_all_idx)
X_te,      y_te    = take(X, te_idx),     take(y, te_idx)
X_tr, y_tr         = take(X, tr_idx),     take(y, tr_idx)
X_val, y_val       = take(X, val_idx),    take(y, val_idx)
t_te               = take(t, te_idx)

# ---- (C) 나이브 베이스라인(전일 같은 슬롯값) ----
# lag_day가 드물게 비면 0으로 보정(앞에서 dropna 했으면 거의 없음)
base_pred = Xy_reg.loc[te_idx, "lag_day"].to_numpy()
base_pred = np.where(np.isnan(base_pred), 0.0, base_pred)
rmse_base = rmse(y_te, base_pred)

# ---- (D) 모델들 정의 ----
# RF: 조금 더 깊게, 잎 최소 샘플 완화 → 패턴 포착력↑ + 과적합 억제
rf = RandomForestRegressor(
    n_estimators=900, max_depth=6, min_samples_leaf=6,
    random_state=42, n_jobs=-1
)

# HGB Poisson: 카운트 데이터 특성 활용 (y >= 0 가정)
hgb_p = HistGradientBoostingRegressor(
    loss="poisson", learning_rate=0.07,
    max_iter=800, max_leaf_nodes=31,
    min_samples_leaf=20, random_state=42
)

# XGB (평균제곱오차)
xgb_mse = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=2000,  # early stopping으로 최적 스텝 선택
    learning_rate=0.05, max_depth=6, min_child_weight=2,
    subsample=0.9, colsample_bytree=0.9,
    reg_lambda=1.0, reg_alpha=0.0,
    random_state=42, n_jobs=-1
)

# XGB (포아송/카운트) - 과도한 과소/과대 추정 완화용 대안
xgb_pois = XGBRegressor(
    objective="count:poisson",
    n_estimators=2000,
    learning_rate=0.05, max_depth=6, min_child_weight=2,
    subsample=0.9, colsample_bytree=0.9,
    max_delta_step=1.0,  # 포아송 안정화
    reg_lambda=1.0, reg_alpha=0.0,
    random_state=42, n_jobs=-1
)

metrics = {"Baseline(day-ago)": {"RMSE": rmse_base}}
preds_reg = pd.DataFrame({"ts_slot": t_te.values, "y_true": y_te.values})

# ---- (E) 학습/예측 ----
# RF/HGB는 tr_all 전체로 적합해서 test에 예측
rf.fit(X_tr_all, y_tr_all)
p_rf = rf.predict(X_te)
metrics["RF"] = {"RMSE": rmse(y_te, p_rf)}
preds_reg["y_pred_RF"] = p_rf

hgb_p.fit(X_tr_all, y_tr_all)
p_hgb = hgb_p.predict(X_te)
metrics["HGB_P"] = {"RMSE": rmse(y_te, p_hgb)}
preds_reg["y_pred_HGB_P"] = p_hgb

# XGB는 Train/Valid로 early stopping → 과적합 방지
xgb_mse.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=80,
    verbose=False
)
p_xgb_mse = xgb_mse.predict(X_te)
metrics["XGB_MSE"] = {"RMSE": rmse(y_te, p_xgb_mse), "BestIter": int(xgb_mse.best_iteration)}
preds_reg["y_pred_XGB_MSE"] = p_xgb_mse

xgb_pois.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=80,
    verbose=False
)
p_xgb_pois = xgb_pois.predict(X_te)
metrics["XGB_Pois"] = {"RMSE": rmse(y_te, p_xgb_pois), "BestIter": int(xgb_pois.best_iteration)}
preds_reg["y_pred_XGB_Pois"] = p_xgb_pois

# ---- (F) 간단 앙상블: XGB(둘 중 더 좋은 것) + RF 가중 평균 ----
# 어떤 XGB가 더 좋은지 선택
rmse_xgb_mse = metrics["XGB_MSE"]["RMSE"]
rmse_xgb_poi = metrics["XGB_Pois"]["RMSE"]
if rmse_xgb_mse <= rmse_xgb_poi:
    tag_main = "XGB_MSE"
    p_main   = p_xgb_mse
else:
    tag_main = "XGB_Pois"
    p_main   = p_xgb_pois

best_w, best_rmse, best_pred = None, float("inf"), None
for w in np.linspace(0.0, 1.0, 21):  # 0.00 ~ 1.00 (0.05 간격)
    p_ens = w*p_main + (1-w)*p_rf
    r = rmse(y_te, p_ens)
    if r < best_rmse:
        best_w, best_rmse, best_pred = w, r, p_ens

ens_name = f"ENS({best_w:.2f}*{tag_main}+{1-best_w:.2f}*RF)"
metrics[ens_name] = {"RMSE": best_rmse}
preds_reg["y_pred_ENS"] = best_pred
preds_reg["resid_ENS"]  = preds_reg["y_true"] - preds_reg["y_pred_ENS"]

# ---- (G) 결과 요약 출력 ----
def fmt(m):
    out = {k: (round(v,3) if isinstance(v,float) else v) for k,v in m.items()}
    return out
print("[회귀] 결과 요약")
print(f"Baseline(day-ago) RMSE = {rmse_base:.3f}")
for k,v in metrics.items():
    if k!="Baseline(day-ago)":
        print(k, "=>", fmt(v))

best_model = min([k for k in metrics], key=lambda k: metrics[k]["RMSE"])
print(f"=> 최종 Best: {best_model} (RMSE={metrics[best_model]['RMSE']:.3f})")

# (선택) 저장
preds_reg.to_csv("predictions_test.csv", index=False, encoding="utf-8-sig")
pd.DataFrame(metrics).T.reset_index().rename(columns={"index":"Model"}).to_csv("metrics_test.csv", index=False, encoding="utf-8-sig")



[회귀] 결과 요약
Baseline(day-ago) RMSE = 13.251
RF => {'RMSE': 3.805}
HGB_P => {'RMSE': 4.144}
XGB_MSE => {'RMSE': 3.7, 'BestIter': 158}
XGB_Pois => {'RMSE': 3.684, 'BestIter': 359}
ENS(0.75*XGB_Pois+0.25*RF) => {'RMSE': 3.668}
=> 최종 Best: ENS(0.75*XGB_Pois+0.25*RF) (RMSE=3.668)


In [60]:
from datetime import timedelta

# --- 내일 날짜 슬롯 만들기 ---
last_day = Xy_reg["ts_slot"].dt.normalize().max()
tomorrow = last_day + pd.Timedelta(days=1)

grid_next = pd.DataFrame({"slot_hour": allowed_hours})
grid_next["date"] = tomorrow
grid_next["ts_slot"] = grid_next["date"] + pd.to_timedelta(grid_next["slot_hour"], unit="h")
grid_next["weekday"] = grid_next["ts_slot"].dt.weekday

# --- 회귀용 피처 (lag_1, lag_day, roll_day 반영 필요) ---
# lag_day는 전일 같은 슬롯 → Xy_reg 마지막날 데이터를 참조
prev_ser = (
    Xy_reg.loc[Xy_reg["ts_slot"].dt.normalize() == last_day, ["slot_hour","y"]]
          .drop_duplicates("slot_hour")
          .set_index("slot_hour")["y"]   # 여기까지만!
)
grid_next["lag_day"] = grid_next["slot_hour"].map(prev_ser).fillna(0.0)

# sin/cos
grid_next["hour_sin"] = np.sin(2*np.pi*grid_next["slot_hour"]/24)
grid_next["hour_cos"] = np.cos(2*np.pi*grid_next["slot_hour"]/24)

# lag_1: 첫 슬롯은 전일 마지막 슬롯 값 사용
y_last = Xy_reg.iloc[-1]["y"]
grid_next["lag_1"] = [y_last] + [np.nan]*(len(grid_next)-1)

# roll_day: 직전 하루 평균
roll_last = Xy_reg[Xy_reg["ts_slot"].dt.normalize() == last_day]["y"].mean()
grid_next["roll_day"] = roll_last

# --- 모델 학습 (전체로 다시 fit) ---
rf_final = RandomForestRegressor(
    n_estimators=900, max_depth=6, min_samples_leaf=6,
    random_state=42, n_jobs=-1
).fit(X, y)

xgb_pois_final = XGBRegressor(
    objective="count:poisson",
    n_estimators=int(xgb_pois.best_iteration),   # early stopping에서 찾은 best_iter
    learning_rate=0.05, max_depth=6, min_child_weight=2,
    subsample=0.9, colsample_bytree=0.9,
    max_delta_step=1.0, reg_lambda=1.0, reg_alpha=0.0,
    random_state=42, n_jobs=-1
).fit(X, y)

# 2) ENS 예측 함수 정의
def ens_predict(X_new, w=0.75):
    p_xgb = xgb_pois_final.predict(X_new)
    p_rf  = rf_final.predict(X_new)
    return w*p_xgb + (1-w)*p_rf

# 3) 내일 grid_next 같은 데이터에 적용
y_pred_next = ens_predict(grid_next[feature_cols], w=0.75)

grid_next_reg = grid_next[["ts_slot","weekday","slot_hour"]].copy()
grid_next_reg["y_pred"] = y_pred_next
grid_next_reg.to_csv("predictions_tomorrow.csv", index=False, encoding="utf-8-sig")

## 2) 분류 — 슬롯(1시간)별 최다 자금유형 예측

In [61]:
# =======================
# 분류: 슬롯별 최다 자금유형 예측
# =======================
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# --- (안전) 멀티클래스 AUC 계산 보조 ---
def macro_auc_safe(y_true_str, proba, classes_str, clf_classes_str):
    idx = {c:i for i,c in enumerate(clf_classes_str)}
    aligned = np.zeros((proba.shape[0], len(classes_str)))
    for j, c in enumerate(classes_str):
        aligned[:, j] = proba[:, idx[c]] if c in idx else 0.0
    aucs = []
    for j, c in enumerate(classes_str):
        y_bin = (y_true_str == c).astype(int)
        if y_bin.min() == y_bin.max():
            continue
        aucs.append(roc_auc_score(y_bin, aligned[:, j]))
    return float(np.mean(aucs)) if aucs else float("nan")

# ---- (A) 피처/타깃 준비 ----
feature_cols_cls = (
    ["slot_hour"] +
    [c for c in Xy_cls.columns if c.startswith("wd_")] +
    [c for c in Xy_cls.columns if c.startswith("lag7_")]
)

# ---- (B) 시간순 split: 마지막 20% Test, Train의 뒤쪽 10% Valid ----
n = len(Xy_cls)
cut_te = int(n * 0.8)
train_all = Xy_cls.iloc[:cut_te].copy()
test      = Xy_cls.iloc[cut_te:].copy()

# ---- (C) 라벨 축소: train 기준 TopK + OTHER (누수 방지) ----
topK = 10
headK = train_all["label"].value_counts().nlargest(topK).index
train_all["y_str"] = np.where(train_all["label"].isin(headK), train_all["label"], "OTHER")
test["y_str"]      = np.where(test["label"].isin(headK),      test["label"],      "OTHER")

# [ADD] lag7 인코더: 누수 방지를 위해 train_all 기준으로 학습
# OTHER가 없을 수 있으니 보장
_lag_pool = pd.Index(train_all["label_lag7"].fillna("OTHER")).astype(str)
if "OTHER" not in _lag_pool:
    _lag_pool = _lag_pool.append(pd.Index(["OTHER"]))
from sklearn.preprocessing import LabelEncoder
le_lag = LabelEncoder().fit(_lag_pool)

# [ADD] lag7_enc 정수 피처 + lag7_same(지난주와 동일 여부) + lag7_headK(지난주가 상위라벨?)
train_all["lag7_enc"]  = le_lag.transform(train_all["label_lag7"].fillna("OTHER").astype(str))
test["lag7_enc"]       = le_lag.transform(test["label_lag7"].fillna("OTHER").astype(str))
train_all["lag7_same"] = (train_all["label_lag7"] == train_all["y_str"]).astype("uint8")
test["lag7_same"]      = (test["label_lag7"] == test["y_str"]).astype("uint8")
train_all["lag7_headK"] = train_all["label_lag7"].isin(headK).astype("uint8")
test["lag7_headK"]      = test["label_lag7"].isin(headK).astype("uint8")

# ---- (D) Train→Train/Valid (시간순) ----
m = len(train_all)
cut_val = int(m * 0.9)
train = train_all.iloc[:cut_val].copy()
valid = train_all.iloc[cut_val:].copy()

# ---- (E) 인코딩 ----
le = LabelEncoder().fit(np.unique(np.concatenate([train_all["y_str"].values, test["y_str"].values])))
y_tr   = le.transform(train["y_str"].values)
y_val  = le.transform(valid["y_str"].values)
y_test = le.transform(test["y_str"].values)

# [REPLACE] 피처 목록 확장: 기존 one-hot + lag7 강화 피처 3개 추가
feature_cols_cls2 = (
    ["slot_hour"] +
    [c for c in Xy_cls.columns if c.startswith("wd_")] +
    [c for c in Xy_cls.columns if c.startswith("lag7_")] +  # 기존 one-hot들
    ["lag7_enc", "lag7_same", "lag7_headK"]                # [ADD]
)

X_tr   = train[feature_cols_cls2].astype(float)
X_val  = valid[feature_cols_cls2].astype(float)
X_te   = test[feature_cols_cls2].astype(float)
# ---- (F) 불균형 보정: 샘플가중치 (역빈도) ----
freq = pd.Series(y_tr).value_counts(normalize=True)
w_map = {cls: 1.0/max(p, 1e-9) for cls, p in freq.items()}
w_tr = np.array([w_map[c] for c in y_tr], dtype=float)
w_tr = w_tr / w_tr.mean()

# ---- (G) 나이브 베이스라인 (지난주 라벨) ----
test_lag7 = np.where(test["label_lag7"].isin(le.classes_), test["label_lag7"], "OTHER")
acc_base  = (test_lag7 == test["y_str"].values).mean()

# ---- (H) 모델 정의 ----
rf = RandomForestClassifier(
    n_estimators=900, max_depth=8, min_samples_leaf=3,
    class_weight="balanced_subsample",
    random_state=42, n_jobs=-1
)

xgb = XGBClassifier(
    objective="multi:softprob", eval_metric="mlogloss",
    n_estimators=3000,          # 충분히 크게 두고 ES로 최적 스텝
    learning_rate=0.05, max_depth=4, min_child_weight=2,
    subsample=0.85, colsample_bytree=0.85,
    reg_lambda=2.0, reg_alpha=1.0,
    random_state=42, n_jobs=-1
)

results = {}
preds_cls = pd.DataFrame({"ts_slot": test["ts_slot"].values})

# ---- (I) 학습/예측 (RF) ----
rf.fit(X_tr, y_tr, sample_weight=w_tr)
proba_val_rf = rf.predict_proba(X_val)                  # (n_val, C)
proba_te_rf  = rf.predict_proba(X_te)
pred_te_rf   = rf.predict(X_te)
preds_cls["pred_RF"] = le.inverse_transform(pred_te_rf)
# 모델 내부 정수→문자열 클래스 매핑
classes_str_model_rf = le.inverse_transform(rf.classes_)
# 메트릭
acc_rf  = accuracy_score(y_test, pred_te_rf)
top3_rf = np.mean([le.inverse_transform([yy])[0] in le.inverse_transform(np.argsort(proba_te_rf[i])[::-1][:3])
                   for i, yy in enumerate(y_test)])
auc_rf  = macro_auc_safe(le.inverse_transform(y_test),
                         proba_te_rf, classes_str=le.classes_,
                         clf_classes_str=classes_str_model_rf)

results["RF"] = {"AUC_macro": float(auc_rf), "ACC": float(acc_rf), "Top-3": float(top3_rf)}

# ---- (J) 학습/예측 (XGB, early stopping: Valid 사용) ----
xgb.fit(
    X_tr, y_tr,
    sample_weight=w_tr,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=100,
    verbose=False
)
proba_val_xgb = xgb.predict_proba(X_val)
proba_te_xgb  = xgb.predict_proba(X_te)
pred_te_xgb   = xgb.predict(X_te)
preds_cls["pred_XGB"] = le.inverse_transform(pred_te_xgb)

classes_str_model_xgb = le.inverse_transform(xgb.classes_)
acc_xgb  = accuracy_score(y_test, pred_te_xgb)
top3_xgb = np.mean([le.inverse_transform([yy])[0] in le.inverse_transform(np.argsort(proba_te_xgb[i])[::-1][:3])
                    for i, yy in enumerate(y_test)])
auc_xgb  = macro_auc_safe(le.inverse_transform(y_test),
                          proba_te_xgb, classes_str=le.classes_,
                          clf_classes_str=classes_str_model_xgb)

results["XGB"] = {"AUC_macro": float(auc_xgb), "ACC": float(acc_xgb), "Top-3": float(top3_xgb), "BestIter": int(getattr(xgb, "best_iteration", 0))}

# ---- (K) 확률 앙상블: w*XGB + (1-w)*RF  (w는 VALID에서 Top-3 최대화로 탐색) ----
best_w, best_metric = None, -1.0
for w in np.linspace(0.0, 1.0, 21):  # 0.00~1.00
    # valid 기준 정렬/클래스정합 필요 없음: 두 모델 모두 le 기준 클래스 순서
    proba_val_ens = w*proba_val_xgb + (1-w)*proba_val_rf
    # valid Top-3
    order = np.argsort(proba_val_ens, axis=1)[:, ::-1]
    top3_valid = np.mean([train_all["y_str"].values[cut_val + i] in le.inverse_transform(order[i, :3])
                          for i in range(len(order))])
    if top3_valid > best_metric:
        best_metric, best_w = top3_valid, w

# 최적 w로 test 집합 예측/지표
proba_te_ens = best_w*proba_te_xgb + (1-best_w)*proba_te_rf
pred_te_ens  = np.argmax(proba_te_ens, axis=1)
preds_cls["pred_ENS"] = le.inverse_transform(pred_te_ens)

# Top-1/Top-3/매크로AUC
acc_ens  = accuracy_score(y_test, pred_te_ens)
order    = np.argsort(proba_te_ens, axis=1)[:, ::-1]
top3_ens = np.mean([le.inverse_transform([yy])[0] in le.inverse_transform(order[i, :3])
                    for i, yy in enumerate(y_test)])
auc_ens  = macro_auc_safe(le.inverse_transform(y_test),
                          proba_te_ens, classes_str=le.classes_,
                          clf_classes_str=le.classes_)  # 이미 정렬됨

results[f"ENS({best_w:.2f}*XGB+{1-best_w:.2f}*RF)"] = {
    "AUC_macro": float(auc_ens), "ACC": float(acc_ens), "Top-3": float(top3_ens)
}

# ---- (L) 대시보드용 Top-3 결과 칼럼 ----
labels_str = le.classes_
preds_cls["p1_ENS"]   = proba_te_ens[np.arange(len(proba_te_ens)), np.argmax(proba_te_ens, axis=1)].astype(float)

order = np.argsort(proba_te_ens, axis=1)[:, ::-1]
k = min(3, proba_te_ens.shape[1])
top_idx = order[:, :k]
preds_cls["ens_top1"] = labels_str[top_idx[:,0]]
preds_cls["ens_p1"]   = proba_te_ens[np.arange(len(proba_te_ens)), top_idx[:,0]].astype(float)
if k >= 2:
    preds_cls["ens_top2"] = labels_str[top_idx[:,1]]
    preds_cls["ens_p2"]   = proba_te_ens[np.arange(len(proba_te_ens)), top_idx[:,1]].astype(float)
if k >= 3:
    preds_cls["ens_top3"] = labels_str[top_idx[:,2]]
    preds_cls["ens_p3"]   = proba_te_ens[np.arange(len(proba_te_ens)), top_idx[:,2]].astype(float)

# (선택) 요일/시간대 메타
if ("weekday" not in preds_cls.columns) and ("weekday" in test.columns):
    preds_cls = preds_cls.join(test[["weekday","slot_hour"]].reset_index(drop=True))
weekday_map = {0:"월",1:"화",2:"수",3:"목",4:"금",5:"토",6:"일"}
if "weekday" in preds_cls.columns and "weekday_ko" not in preds_cls.columns:
    preds_cls["weekday_ko"] = preds_cls["weekday"].map(weekday_map)

# ---- (M) 결과 출력 ----
def _round(d):
    return {k: (round(v,4) if isinstance(v,float) else v) for k,v in d.items()}

print(f"[분류] Baseline(lag7) ACC={acc_base:.3f}")
for k,v in results.items():
    print(k, "=>", _round(v))

# (선택) 저장
preds_cls.to_csv("cls_predictions_test.csv", index=False, encoding="utf-8-sig")
pd.DataFrame(results).T.reset_index().rename(columns={"index":"Model"}).to_csv("cls_metrics_table.csv", index=False, encoding="utf-8-sig")



[분류] Baseline(lag7) ACC=0.456
RF => {'AUC_macro': 0.7906, 'ACC': 0.5508, 'Top-3': 0.8066}
XGB => {'AUC_macro': 0.8443, 'ACC': 0.6295, 'Top-3': 0.9443, 'BestIter': 451}
ENS(0.55*XGB+0.45*RF) => {'AUC_macro': 0.8204, 'ACC': 0.6164, 'Top-3': 0.9311}


In [63]:
wd_cols    = [c for c in Xy_cls.columns if c.startswith("wd_")]
lag7_cols  = [c for c in Xy_cls.columns if c.startswith("lag7_")]
feature_cols_cls2 = ["slot_hour"] + wd_cols + lag7_cols + ["lag7_enc","lag7_same","lag7_headK"]

# 1) 시간순 split
n = len(Xy_cls)
cut_te = int(n * 0.8)
train_all = Xy_cls.iloc[:cut_te].copy()
test      = Xy_cls.iloc[cut_te:].copy()

# 2) 라벨 축소(누수 방지): train 기준 TopK + OTHER
topK  = 10
headK = train_all["label"].value_counts().nlargest(topK).index
train_all["y_str"] = np.where(train_all["label"].isin(headK), train_all["label"], "OTHER")
test["y_str"]      = np.where(test["label"].isin(headK),      test["label"],      "OTHER")

# 3) lag7 인코더(누수 방지: train_all 기준) + 강화피처
_lag_pool = pd.Index(train_all["label_lag7"].fillna("OTHER").astype(str))
if "OTHER" not in _lag_pool:
    _lag_pool = _lag_pool.append(pd.Index(["OTHER"]))
le_lag = LabelEncoder().fit(_lag_pool)

train_all["lag7_enc"]   = le_lag.transform(train_all["label_lag7"].fillna("OTHER").astype(str))
test["lag7_enc"]        = le_lag.transform(test["label_lag7"].fillna("OTHER").astype(str))
train_all["lag7_same"]  = (train_all["label_lag7"] == train_all["y_str"]).astype("uint8")
test["lag7_same"]       = (test["label_lag7"] == test["y_str"]).astype("uint8")
train_all["lag7_headK"] = train_all["label_lag7"].isin(headK).astype("uint8")
test["lag7_headK"]      = test["label_lag7"].isin(headK).astype("uint8")

# 4) Train/Valid (시간순 9:1)
m = len(train_all)
cut_val = int(m * 0.9)
train = train_all.iloc[:cut_val].copy()
valid = train_all.iloc[cut_val:].copy()

# 5) 라벨 인코더(최종 클래스)
le = LabelEncoder().fit(
    np.unique(np.concatenate([train_all["y_str"].values, test["y_str"].values]))
)

X_tr   = train[feature_cols_cls2].astype(float)
X_val  = valid[feature_cols_cls2].astype(float)
X_all  = train_all[feature_cols_cls2].astype(float)      # 최종 재학습용
y_tr   = le.transform(train["y_str"].values)
y_val  = le.transform(valid["y_str"].values)
y_all  = le.transform(train_all["y_str"].values)

# 6) XGB 학습(early stopping) → best_iteration 추출
xgb = XGBClassifier(
    objective="multi:softprob", eval_metric="mlogloss",
    n_estimators=3000, learning_rate=0.05, max_depth=4, min_child_weight=2,
    subsample=0.85, colsample_bytree=0.85,
    reg_lambda=2.0, reg_alpha=1.0,
    random_state=42, n_jobs=-1
)
xgb.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
best_iter = int(getattr(xgb, "best_iteration", 300))
best_iter = max(best_iter, 50)  # 안전판

# 7) 최종 재학습(전체 train_all)
xgb_final = XGBClassifier(
    objective="multi:softprob", eval_metric="mlogloss",
    n_estimators=best_iter, learning_rate=0.05, max_depth=4, min_child_weight=2,
    subsample=0.85, colsample_bytree=0.85,
    reg_lambda=2.0, reg_alpha=1.0,
    random_state=42, n_jobs=-1
)
xgb_final.fit(X_all, y_all)

# ===== 내일 그리드 생성 & 피처 구성 (Xy_cls 파이프라인과 동일 전제) =====
last_day = Xy_cls["ts_slot"].dt.normalize().max()
tomorrow = last_day + pd.Timedelta(days=1)
allowed_hours = sorted(Xy_cls["slot_hour"].unique().tolist())

grid_next = pd.DataFrame({"slot_hour": allowed_hours})
grid_next["date"] = tomorrow
# grid_next["ts_slot"] = grid_next["date"] + pd.to_datetime(grid_next["slot_hour"], unit="h").dt.to_pytimedelta()
# 위 한 줄이 환경에 따라 경고가 나면 아래 대체 사용:
grid_next["ts_slot"] = grid_next["date"] + pd.to_timedelta(grid_next["slot_hour"], unit="h")

grid_next["weekday"] = grid_next["ts_slot"].dt.weekday

# 지난주 같은 슬롯 라벨(lag7): Xy_cls에서 가져오기 (네가 만든 major 기반 label)
last_week_day = tomorrow - pd.Timedelta(days=7)
lag7_map = (Xy_cls[Xy_cls["ts_slot"].dt.normalize() == last_week_day]
            .drop_duplicates("slot_hour")
            .set_index("slot_hour")["label"])
grid_next["label_lag7"] = grid_next["slot_hour"].map(lag7_map).fillna("OTHER")

# one-hot(요일/lag7) – 학습시 존재했던 컬럼 집합과 정확히 맞추기
wd_next   = pd.get_dummies(grid_next["weekday"],    prefix="wd",   drop_first=False)
lag7_next = pd.get_dummies(grid_next["label_lag7"], prefix="lag7", drop_first=False)

wd_next   = wd_next.reindex(columns=wd_cols,   fill_value=0)
lag7_next = lag7_next.reindex(columns=lag7_cols, fill_value=0)

# 강화 피처(내일은 정답 없음 → same=0, headK는 신호 유지)
grid_next["lag7_enc"]   = le_lag.transform(
    grid_next["label_lag7"].where(grid_next["label_lag7"].isin(le_lag.classes_), "OTHER").astype(str)
)
grid_next["lag7_same"]  = 0
grid_next["lag7_headK"] = grid_next["label_lag7"].isin(headK).astype("uint8")

# 예측용 피처 프레임(학습 순서/개수와 100% 동일)
X_next_parts = pd.concat([
    grid_next[["slot_hour"]].astype(float).reset_index(drop=True),
    wd_next.astype(float).reset_index(drop=True),
    lag7_next.astype(float).reset_index(drop=True),
    grid_next[["lag7_enc","lag7_same","lag7_headK"]].astype(float).reset_index(drop=True)
], axis=1)

# 학습에 실제 사용된 열 순서로 재정렬(누락은 0으로 채우고, 여분은 버림)
feature_names_fit = X_all.columns.tolist()
for c in feature_names_fit:
    if c not in X_next_parts.columns:
        X_next_parts[c] = 0.0
X_next = X_next_parts[feature_names_fit].astype(float)

# 예측 (Top-3)
proba_next = xgb_final.predict_proba(X_next)
pred_next  = xgb_final.predict(X_next)

df_next_cls = grid_next[["ts_slot","weekday","slot_hour","label_lag7"]].copy()
df_next_cls["pred_label"] = le.inverse_transform(pred_next)

order = np.argsort(proba_next, axis=1)[:, ::-1]
labels_str = le.classes_
k = min(3, proba_next.shape[1])
df_next_cls["top1"] = labels_str[order[:,0]]
df_next_cls["p1"]   = proba_next[np.arange(len(proba_next)), order[:,0]].astype(float)
if k >= 2:
    df_next_cls["top2"] = labels_str[order[:,1]]
    df_next_cls["p2"]   = proba_next[np.arange(len(proba_next)), order[:,1]].astype(float)
if k >= 3:
    df_next_cls["top3"] = labels_str[order[:,2]]
    df_next_cls["p3"]   = proba_next[np.arange(len(proba_next)), order[:,2]].astype(float)

weekday_map = {0:"월",1:"화",2:"수",3:"목",4:"금",5:"토",6:"일"}
df_next_cls["weekday_ko"] = df_next_cls["weekday"].map(weekday_map)

df_next_cls.to_csv("cls_pred_tomorrow_xgb.csv", index=False, encoding="utf-8-sig")
print(">> 내일 분류 예측 저장: cls_pred_tomorrow_xgb.csv  (XGB, n_estimators=%d)" % best_iter)



>> 내일 분류 예측 저장: cls_pred_tomorrow_xgb.csv  (XGB, n_estimators=158)
