# End-to-End XGBoost Demo (GPU, No Project-Local Imports)
This notebook mirrors the project pipeline—data split, clustering, feature engineering, training, and inference—in a **single file**. All code is inlined from the repository scripts so you can understand each step without jumping across modules or using a config file.

**Defaults** (matching the repository's GPU/XGBoost setup):
- Data root: `/workspace/oibc/data`
- Save path: `/workspace/oibc/data/result/exp34_all_1`
- Clusters: 10
- Model: XGBoost (GPU accelerated if available)


In [None]:
import logging, warnings, os, time, pickle
from pathlib import Path
from datetime import datetime
from typing import List, Tuple, Optional

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import pvlib, pytz

try:
    from xgboost import XGBRegressor
except ImportError:
    raise ImportError("Please install xgboost before running this notebook.")

warnings.filterwarnings("ignore")


In [None]:
# ---------------------------------------------------------------------
# Fixed settings for this demo (same defaults as src/config.yaml)
# ---------------------------------------------------------------------

# Filesystem layout: raw inputs, experiment outputs, intermediate artefacts.
DATA_ROOT = Path("/workspace/oibc/data")
SAVE_PATH = Path("/workspace/oibc/data/result/exp34_all_1")
SPLIT_DIR = Path("/workspace/oibc/data_split")
CLUSTER_DIR = Path("/workspace/oibc/cluster_all")
SUBMISSION_FILE = DATA_ROOT.joinpath("submission_sample.csv")

# Runtime switches.
USE_GPU = True
N_CLUSTERS = 10

# XGBoost hyper-parameters (taken from src/main_xgb.py).
BASE_MODEL_PARAMS = dict(
    objective="reg:squarederror",
    n_estimators=6000,
    learning_rate=0.035,
    max_depth=11,
    subsample=0.856,
    colsample_bytree=0.835,
    random_state=42,
    n_jobs=-1,
    verbosity=1,
    eval_metric="mae",
)
# Extra keys merged into the base parameters only for GPU training.
GPU_MODEL_PARAMS = dict(device="cuda", tree_method="hist")

# Column roles: regression target, categorical inputs, and columns that are
# never used as model features.
TARGET = "nins"
CAT_FEATURES = ["pv_id", "cluster"]
EXCLUDE_FEATURES = {"time", "type", "energy", TARGET}

# Time zone applied to naive timestamps and the default site coordinates.
TIMEZONE = "Asia/Seoul"
DEFAULT_LATITUDE = 36.0
DEFAULT_LONGITUDE = 128.0

# Create every output directory up front so later cells can write freely.
for _output_dir in (SAVE_PATH, SAVE_PATH / "model", CLUSTER_DIR, SPLIT_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)


In [None]:
# ---------------------------------------------------------------------
# Utilities
# ---------------------------------------------------------------------
def setup_logging(log_path: Path):
    """Send INFO-and-above log records to ``log_path`` and to the console.

    The parent directory of ``log_path`` is created if needed and the file is
    truncated on every call.

    ``force=True`` replaces any handlers already attached to the root logger.
    Without it ``logging.basicConfig`` silently does nothing once a handler
    exists, and the module-level ``logging.info()``/``logging.warning()``
    helpers install one implicitly the first time they are called.

    Args:
        log_path: Destination file for the log output.
    """
    log_path.parent.mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[
            # Explicit UTF-8: messages in this file contain non-ASCII
            # punctuation, which a platform-default codec may not encode.
            logging.FileHandler(log_path, "w", encoding="utf-8"),
            logging.StreamHandler(),
        ],
        force=True,
    )


def check_gpu_available():
    """Report whether a CUDA-capable GPU appears to be usable.

    When PyTorch can be imported its ``torch.cuda.is_available()`` answer is
    returned. Otherwise the function falls back to running ``nvidia-smi`` and
    returns True only if that command succeeds.
    """
    try:
        import torch
        return torch.cuda.is_available()
    except ImportError:
        pass  # No PyTorch: fall through to the driver probe below.

    import subprocess

    try:
        subprocess.check_output(["nvidia-smi"])
    except Exception:
        # Best effort: a missing binary or a failing driver both mean "no GPU".
        return False
    return True


## 1) Data split (train/validation)
Splits by unique `pv_id` to avoid leakage. Mirrors `scripts/split.py` but uses the hard-coded paths above.


In [None]:
def split_data(seed: int = 42, ratio: float = 0.1):
    """Split ``train.csv`` into train/validation CSVs by whole ``pv_id``.

    Every row of a given ``pv_id`` ends up on the same side of the split. The
    two subsets are written to ``SPLIT_DIR`` and their sizes are logged.

    Args:
        seed: Random state passed to ``train_test_split``.
        ratio: Share of unique ``pv_id`` values assigned to validation.

    Returns:
        Tuple ``(train_path, val_path)`` of the written CSV files.
    """
    source = pd.read_csv(DATA_ROOT / "train.csv")

    # Shuffle and split the identifiers, not the rows.
    train_ids, val_ids = train_test_split(
        source["pv_id"].unique(), test_size=ratio, random_state=seed, shuffle=True
    )

    pv_column = source["pv_id"]
    train_df = source[pv_column.isin(train_ids)]
    val_df = source[pv_column.isin(val_ids)]

    train_path, val_path = (
        SPLIT_DIR / filename for filename in ("train_split.csv", "val_split.csv")
    )
    for frame, destination in ((train_df, train_path), (val_df, val_path)):
        frame.to_csv(destination, index=False)

    logging.info(f"Training set: {len(train_df)} rows, {len(train_ids)} unique pv_ids")
    logging.info(f"Validation set: {len(val_df)} rows, {len(val_ids)} unique pv_ids")
    return train_path, val_path

# Run the split once; the returned CSV paths are the inputs of the clustering step.
train_split_path, val_split_path = split_data()


## 2) Clustering (train + reuse for val/test)
Follows `scripts/add_cluster_efficient.py` and `scripts/add_cluster_from_train.py` to build cluster-enhanced features.


In [None]:
# Columns averaged per (hour, cluster) to build the cluster-level features.
# NOTE(review): "nins" is also TARGET, so on the training split each row's own
# label contributes to its cluster mean — confirm this is intended.
SELECT_COLUMNS = ["uv_idx", "nins", "humidity"]


def build_train_clusters(train_path: Path, num_cluster: int = N_CLUSTERS):
    """Fit location clusters on the training split and attach cluster features.

    Steps:
      1. Fit KMeans on one ``(coord1, coord2)`` pair per ``pv_id`` and persist
         the model and its centroids under ``CLUSTER_DIR``.
      2. For every cluster ``c`` add the per-hour mean of each column in
         ``SELECT_COLUMNS`` as ``cluster_<c>_<column>``.
      3. For every centroid ``i`` add ``cluster_<i>_ratio`` computed as
         ``1 / (1 + euclidean distance to the centroid) ** 2``.
      4. Forward- then backward-fill remaining gaps within each cluster, with
         rows ordered by ``(pv_id, hour)``.

    The enriched frame is also dumped to
    ``CLUSTER_DIR / "train_with_cluster_means_and_ratios.joblib"``.

    Args:
        train_path: CSV with at least ``time``, ``pv_id``, ``coord1``,
            ``coord2`` and the ``SELECT_COLUMNS``.
        num_cluster: Number of KMeans clusters.

    Returns:
        Tuple ``(df, cluster_means, kmeans)``: the enriched frame in the CSV's
        row order, the long-format per-(hour, cluster) means, and the fitted
        KMeans model.

    Raises:
        ValueError: If ``coord1`` or ``coord2`` is missing from the CSV.
    """
    df = pd.read_csv(train_path, parse_dates=["time"])
    # Remember the CSV row order so it can be restored after the sorted fill.
    df["_orig_idx"] = np.arange(len(df))

    if not {"coord1", "coord2"}.issubset(df.columns):
        raise ValueError("Columns 'coord1' and 'coord2' are required for clustering.")

    # One location per plant; plants without coordinates cannot be clustered.
    pv_location = (
        df[["pv_id", "coord1", "coord2"]]
        .dropna(subset=["coord1", "coord2"])
        .drop_duplicates(subset=["pv_id"], keep="first")
    )

    kmeans = KMeans(n_clusters=num_cluster, random_state=42)
    pv_location["cluster"] = kmeans.fit_predict(pv_location[["coord1", "coord2"]]).astype(int)

    joblib.dump(kmeans, CLUSTER_DIR / "train_cluster_model.joblib", compress=3)
    centroids = pd.DataFrame(kmeans.cluster_centers_, columns=["coord1", "coord2"])
    joblib.dump(centroids, CLUSTER_DIR / "train_cluster_centroids.joblib", compress=3)

    df = df.merge(pv_location[["pv_id", "cluster"]], on="pv_id", how="left")
    # Lower-case "h": the upper-case "H" alias is deprecated in recent pandas.
    df["hour"] = df["time"].dt.floor("h")

    # Long table of means, then one wide block of columns per cluster.
    cluster_means = df.groupby(["hour", "cluster"])[SELECT_COLUMNS].mean().reset_index()
    df_cluster_all = None
    for c in range(num_cluster):
        sub = cluster_means[cluster_means["cluster"] == c].copy()
        sub = sub.drop(columns="cluster").add_prefix(f"cluster_{c}_")
        sub = sub.rename(columns={f"cluster_{c}_hour": "hour"})
        df_cluster_all = sub if df_cluster_all is None else pd.merge(
            df_cluster_all, sub, on="hour", how="outer"
        )

    df = df.merge(df_cluster_all, on="hour", how="left")

    # Inverse-square proximity of every row to every centroid.
    coords = df[["coord1", "coord2"]].to_numpy()
    for i, (cx, cy) in enumerate(kmeans.cluster_centers_):
        dist = np.sqrt((coords[:, 0] - cx) ** 2 + (coords[:, 1] - cy) ** 2)
        df[f"cluster_{i}_ratio"] = 1 / (1 + dist) ** 2

    # Fill gaps within each cluster. Grouping on the "cluster" column itself
    # would silently drop every row whose cluster is NaN (plants without
    # coordinates), so group on a separate key where NaN becomes -1; KMeans
    # labels are never negative, so the sentinel cannot collide.
    ordered = df.sort_values(["cluster", "pv_id", "hour"])
    fill_group = ordered["cluster"].fillna(-1).rename("_fill_group")
    ordered = ordered.groupby(fill_group).ffill()
    ordered = ordered.groupby(fill_group).bfill()
    df = ordered.sort_values("_orig_idx").reset_index(drop=True).drop(columns="_orig_idx")

    output_path = CLUSTER_DIR / "train_with_cluster_means_and_ratios.joblib"
    joblib.dump(df, output_path, compress=3)
    return df, cluster_means, kmeans


def apply_clusters(df_path: Path, kmeans: KMeans, cluster_means: pd.DataFrame, prefix: str):
    """Attach cluster features to a CSV using an already fitted clustering.

    Each ``pv_id`` is assigned to its nearest fitted centroid. The per-hour
    means in ``cluster_means`` are then joined on the hour as
    ``cluster_<c>_<column>`` columns, ``cluster_<i>_ratio`` proximity features
    are added, and remaining gaps are forward- then backward-filled within each
    cluster with rows ordered by ``(pv_id, hour)``.

    The enriched frame is also dumped to
    ``CLUSTER_DIR / f"{prefix}_with_cluster_means_and_ratios.joblib"``.

    Args:
        df_path: CSV with at least ``time``, ``pv_id``, ``coord1``, ``coord2``.
        kmeans: Fitted KMeans model over ``(coord1, coord2)``.
        cluster_means: Long table with ``hour``, ``cluster`` and value columns.
        prefix: Prefix for the name of the dumped joblib file.

    Returns:
        The enriched frame, with exactly the rows of the CSV in their original
        order.

    Raises:
        ValueError: If ``coord1`` or ``coord2`` is missing from the CSV.
    """
    df = pd.read_csv(df_path, parse_dates=["time"])
    # Remember the CSV row order so it can be restored after the sorted fill.
    df["_orig_idx"] = np.arange(len(df))

    if not {"coord1", "coord2"}.issubset(df.columns):
        raise ValueError("Columns 'coord1' and 'coord2' are required for clustering.")

    # One location per plant; plants without coordinates keep a NaN cluster.
    pv_location = (
        df[["pv_id", "coord1", "coord2"]]
        .dropna(subset=["coord1", "coord2"])
        .drop_duplicates(subset=["pv_id"], keep="first")
    )
    df = df.merge(
        pv_location.assign(cluster=kmeans.predict(pv_location[["coord1", "coord2"]]).astype(int))[
            ["pv_id", "cluster"]
        ],
        on="pv_id",
        how="left",
    )

    num_cluster = len(kmeans.cluster_centers_)
    # Lower-case "h": the upper-case "H" alias is deprecated in recent pandas.
    df["hour"] = df["time"].dt.floor("h")

    # One wide block of mean columns per cluster, keyed by hour.
    df_cluster_all = None
    for c in range(num_cluster):
        sub = cluster_means[cluster_means["cluster"] == c].copy()
        sub = sub.drop(columns="cluster").add_prefix(f"cluster_{c}_")
        sub = sub.rename(columns={f"cluster_{c}_hour": "hour"})
        df_cluster_all = sub if df_cluster_all is None else pd.merge(
            df_cluster_all, sub, on="hour", how="outer"
        )

    df = df.merge(df_cluster_all, on="hour", how="left")

    # Inverse-square proximity of every row to every centroid.
    coords = df[["coord1", "coord2"]].to_numpy()
    for i, (cx, cy) in enumerate(kmeans.cluster_centers_):
        dist = np.sqrt((coords[:, 0] - cx) ** 2 + (coords[:, 1] - cy) ** 2)
        df[f"cluster_{i}_ratio"] = 1 / (1 + dist) ** 2

    # Fill gaps within each cluster. Grouping on the "cluster" column itself
    # would silently drop every row whose cluster is NaN, which would leave
    # fewer output rows than input rows. Group on a separate key where NaN
    # becomes -1 instead; KMeans labels are never negative.
    ordered = df.sort_values(["cluster", "pv_id", "hour"])
    fill_group = ordered["cluster"].fillna(-1).rename("_fill_group")
    ordered = ordered.groupby(fill_group).ffill()
    ordered = ordered.groupby(fill_group).bfill()
    df = ordered.sort_values("_orig_idx").reset_index(drop=True).drop(columns="_orig_idx")

    output_path = CLUSTER_DIR / f"{prefix}_with_cluster_means_and_ratios.joblib"
    joblib.dump(df, output_path, compress=3)
    return df

# Fit the clustering on the training split only, then reuse the fitted KMeans
# model and the training cluster means for the validation and test files.
train_cluster_df, cluster_means, kmeans = build_train_clusters(train_split_path)
val_cluster_df = apply_clusters(val_split_path, kmeans, cluster_means, prefix="val")
test_cluster_df = apply_clusters(DATA_ROOT / "test.csv", kmeans, cluster_means, prefix="test")


## 3) Feature engineering
Inlined from `src/preprocess.py` (temporal, solar, weather interactions, interpolation, and matrix building).


In [None]:
def engineer_features(
    df: pd.DataFrame,
    latitude: float = DEFAULT_LATITUDE,
    longitude: float = DEFAULT_LONGITUDE,
) -> pd.DataFrame:
    """Run the full feature-engineering chain on one dataset.

    Naive ``time`` values are localized to ``TIMEZONE`` (this writes to the
    caller's frame). The rows are then sorted by ``pv_id`` and ``time`` (or by
    ``time`` alone when there is no ``pv_id`` column), passed through the
    temporal, solar, weather-interaction and interpolation steps, and finally
    put back into the incoming row order with a fresh 0..n-1 index.

    Args:
        df: Frame with a datetime ``time`` column.
        latitude: Latitude handed to the solar-position step.
        longitude: Longitude handed to the solar-position step.

    Returns:
        The frame with all engineered columns added.
    """
    logging.info("Engineering features...")
    timestamps = df["time"]
    if timestamps.dt.tz is None:
        df["time"] = timestamps.dt.tz_localize(TIMEZONE)

    incoming_order = df.index
    has_plant_id = "pv_id" in df.columns
    enriched = (
        df.sort_values(["pv_id", "time"] if has_plant_id else ["time"])
        .pipe(add_temporal_features)
        .pipe(add_solar_features, latitude, longitude)
        .pipe(add_weather_interactions)
        .pipe(interpolate_weather)
    )

    # Undo the sort: back to the order in which the rows arrived.
    df = enriched.loc[incoming_order].reset_index(drop=True)
    logging.info(f"  Total features after engineering: {len(df.columns)}")
    return df


def add_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add calendar and cyclic time-of-day / time-of-year columns in place.

    New columns, in this order: ``hour_sin``, ``hour_cos``, ``day_of_year``,
    ``day_sin``, ``day_cos``, ``week_of_year``, ``is_weekend``, ``month``,
    ``season``, ``minute_of_day``.

    Args:
        df: Frame with a datetime ``time`` column; it is modified in place.

    Returns:
        The same frame object.
    """
    stamp = df["time"].dt
    minute_of_day = stamp.hour * 60 + stamp.minute

    # Position within the day mapped onto the unit circle (1440 minutes/day).
    day_angle = 2 * np.pi * minute_of_day / 1440
    df["hour_sin"] = np.sin(day_angle)
    df["hour_cos"] = np.cos(day_angle)

    # Position within the year mapped onto the unit circle.
    df["day_of_year"] = stamp.dayofyear
    year_angle = 2 * np.pi * df["day_of_year"] / 365.25
    df["day_sin"] = np.sin(year_angle)
    df["day_cos"] = np.cos(year_angle)

    df["week_of_year"] = stamp.isocalendar().week.astype(int)
    df["is_weekend"] = (stamp.weekday >= 5).astype(np.int8)
    df["month"] = stamp.month.astype(np.int8)
    # Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
    df["season"] = ((df["month"] % 12 + 3) // 3).astype(np.int8)
    df["minute_of_day"] = minute_of_day.astype(np.int16)
    return df


def add_solar_features(df: pd.DataFrame, latitude: float, longitude: float) -> pd.DataFrame:
    """Add solar-position columns computed with pvlib, in place.

    New columns: ``solar_altitude`` (apparent elevation, clipped at 0),
    ``solar_azimuth``, ``sun_up`` (1 when the altitude is above 0) and
    ``solar_altitude_norm`` (altitude divided by 90).

    Args:
        df: Frame with a datetime ``time`` column; naive timestamps are
            interpreted in ``TIMEZONE``. It is modified in place.
        latitude: Site latitude passed to pvlib.
        longitude: Site longitude passed to pvlib.

    Returns:
        The same frame object.
    """
    times = df["time"]
    if times.dt.tz is None:
        times = times.dt.tz_localize(TIMEZONE)

    # pvlib documents its ``time`` argument as a DatetimeIndex, so convert the
    # Series explicitly instead of relying on it being accepted.
    utc_index = pd.DatetimeIndex(times.dt.tz_convert(pytz.utc))

    # Every row uses the same latitude/longitude, so the position depends only
    # on the timestamp: compute each distinct timestamp once and broadcast the
    # result back onto the rows.
    solar = pvlib.solarposition.get_solarposition(
        utc_index.unique(), latitude=latitude, longitude=longitude
    ).reindex(utc_index)

    df["solar_altitude"] = solar["apparent_elevation"].clip(lower=0).to_numpy()
    df["solar_azimuth"] = solar["azimuth"].to_numpy()
    df["sun_up"] = (df["solar_altitude"] > 0).astype(np.int8)
    df["solar_altitude_norm"] = df["solar_altitude"] / 90.0
    return df


def add_weather_interactions(df: pd.DataFrame) -> pd.DataFrame:
    """Add derived weather columns for whichever inputs are present, in place.

    Each feature is created only when all of its source columns exist, so the
    function works on frames that carry only part of the weather schema.

    Args:
        df: Frame with any subset of the raw weather columns; it is modified
            in place.

    Returns:
        The same frame object.
    """
    if {"temp_max", "temp_min"}.issubset(df.columns):
        df["temp_range"] = df["temp_max"] - df["temp_min"]
    if {"temp_a", "temp_b"}.issubset(df.columns):
        df["temp_diff_surface"] = df["temp_a"] - df["temp_b"]
    if {"pressure", "ground_press"}.issubset(df.columns):
        df["pressure_gap"] = df["pressure"] - df["ground_press"]
    if {"real_feel_temp", "real_feel_temp_shade"}.issubset(df.columns):
        df["feels_gap"] = df["real_feel_temp"] - df["real_feel_temp_shade"]
    if {"wind_spd_a", "wind_spd_b"}.issubset(df.columns):
        # Missing components count as zero.
        df["wind_resultant"] = np.hypot(
            df["wind_spd_a"].fillna(0), df["wind_spd_b"].fillna(0)
        )
    # gust_ratio needs wind_resultant, which exists only when both wind-speed
    # components were available above (or the column was supplied directly).
    if {"wind_gust_spd", "wind_resultant"}.issubset(df.columns):
        # The small constant keeps the ratio finite for calm conditions.
        df["gust_ratio"] = df["wind_gust_spd"] / (df["wind_resultant"] + 1e-3)
    if {"cloud_a", "cloud_b"}.issubset(df.columns):
        df["cloud_cover_mean"] = df[["cloud_a", "cloud_b"]].mean(axis=1)
    if {"temp_a", "dew_point"}.issubset(df.columns):
        df["dewpoint_spread"] = df["temp_a"] - df["dew_point"]
    if {"humidity", "rel_hum"}.issubset(df.columns):
        # Zero relative humidity becomes NaN so the division cannot yield inf.
        rel = df["rel_hum"].replace(0, np.nan)
        df["humidity_ratio"] = df["humidity"] / rel
    if "precip_1h" in df.columns:
        df["is_rainy"] = (df["precip_1h"].fillna(0) > 0).astype(np.int8)
    return df


def interpolate_weather(df: pd.DataFrame) -> pd.DataFrame:
    """Fill gaps in numeric feature columns within each ``pv_id``, in place.

    Every numeric column that is neither in ``EXCLUDE_FEATURES`` nor in
    ``CAT_FEATURES`` is backward-filled and then forward-filled per ``pv_id``,
    following the current row order. A column that is entirely missing for a
    ``pv_id`` stays missing; values are never borrowed from another ``pv_id``.

    Args:
        df: Frame to fill. Without a ``pv_id`` column it is returned unchanged.

    Returns:
        The same frame object.
    """
    weather_cols = [
        c
        for c in df.select_dtypes(include=[np.number]).columns
        if c not in EXCLUDE_FEATURES and c not in CAT_FEATURES
    ]
    if "pv_id" in df.columns and weather_cols:
        # Both passes must be grouped. Chaining ``.ffill()`` onto the result of
        # the grouped ``bfill()`` would run on the plain frame and carry the
        # previous pv_id's last value into the next pv_id.
        df[weather_cols] = df.groupby("pv_id")[weather_cols].bfill()
        df[weather_cols] = df.groupby("pv_id")[weather_cols].ffill()
    return df


def align_feature_frames(train_df: pd.DataFrame, test_df: pd.DataFrame, cat_features: List[str]):
    """Impute missing values and give both frames identical category sets.

    Numeric columns present in both frames (other than the categorical
    features) are filled with the training median, or 0 when that median is
    undefined. Each categorical feature present in both frames is cast to one
    shared categorical dtype and its gaps are filled with the training mode,
    or ``"missing"`` when the training column has no values.

    The shared dtype lists the training categories first, followed by any
    categories seen only in ``test_df``. ``.cat.codes`` therefore mean the same
    thing in both returned frames, and training codes do not depend on which
    second frame was passed.

    Args:
        train_df: Training features; not modified.
        test_df: Validation or test features; not modified.
        cat_features: Names of the categorical feature columns.

    Returns:
        Tuple ``(train_filled, test_filled)`` of imputed copies.
    """
    train_filled = train_df.copy()
    test_filled = test_df.copy()

    # Categorical features are imputed with the mode below, never the median,
    # even when they happen to be stored with a numeric dtype.
    common_numeric = [
        c
        for c in train_filled.select_dtypes(include=[np.number]).columns
        if c in test_filled.columns and c not in cat_features
    ]
    if common_numeric:
        medians = train_filled[common_numeric].median().fillna(0)
        train_filled[common_numeric] = train_filled[common_numeric].fillna(medians)
        test_filled[common_numeric] = test_filled[common_numeric].fillna(medians)

    for col in cat_features:
        if col not in train_filled.columns or col not in test_filled.columns:
            continue
        train_col = train_filled[col].astype("category")
        test_col = test_filled[col].astype("category")

        mode_value = train_col.mode(dropna=True)
        fill_value = mode_value.iloc[0] if not mode_value.empty else "missing"

        # Training categories first so their codes are stable; then anything
        # seen only in the second frame; then the fill value if it is new.
        categories = train_col.cat.categories
        test_categories = test_col.cat.categories
        test_only = test_categories[~test_categories.isin(categories)]
        if len(test_only):
            categories = categories.append(test_only)
        if fill_value not in categories:
            categories = categories.append(pd.Index([fill_value]))
        shared_dtype = pd.CategoricalDtype(categories=categories)

        train_filled[col] = train_col.astype(shared_dtype).fillna(fill_value)
        test_filled[col] = test_col.astype(shared_dtype).fillna(fill_value)

    return train_filled, test_filled


def build_feature_matrix(train_df: pd.DataFrame, test_df: pd.DataFrame):
    """Select the model input columns and align the two frames.

    Feature columns are all training columns not listed in
    ``EXCLUDE_FEATURES``, in training column order. The second frame keeps the
    subset of those columns it actually has.

    Args:
        train_df: Processed training frame.
        test_df: Processed validation or test frame.

    Returns:
        Tuple ``(train_features, test_features, feature_cols, cat_features)``.
    """
    logging.info("Building feature matrices...")
    feature_cols = [name for name in train_df.columns if name not in EXCLUDE_FEATURES]
    cat_features = [name for name in CAT_FEATURES if name in feature_cols]

    shared_cols = [name for name in feature_cols if name in test_df.columns]
    aligned_train, aligned_test = align_feature_frames(
        train_df[feature_cols], test_df[shared_cols], cat_features
    )
    return aligned_train, aligned_test, feature_cols, cat_features


def encode_for_xgb(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
    """Replace categorical columns with their int16 category codes, in place.

    Args:
        df: Frame whose ``cat_features`` columns have a categorical dtype; it
            is modified in place.
        cat_features: Columns to encode; an empty list leaves ``df`` untouched.

    Returns:
        The same frame object.
    """
    for name in cat_features or ():
        codes = df[name].cat.codes
        df[name] = codes.astype("int16")
    return df


## 4) Process datasets & build matrices
Runs feature engineering for train/val/test, drops the auxiliary `hour` column, and casts `pv_id` and `cluster` to categorical. Category alignment and encoding for XGBoost happen in the next step.


In [None]:
def process_split(df: pd.DataFrame, name: str):
    """Turn one cluster-enriched frame into its model-ready form.

    Parses ``time``, runs ``engineer_features``, drops rows without a target
    for the ``"train"`` split only, and removes the helper ``hour`` column.

    Args:
        df: Cluster-enriched frame; its ``time`` column is overwritten.
        name: Split label; only ``"train"`` changes the behaviour.

    Returns:
        The processed frame.
    """
    logging.info(f"Processing {name}…")
    df["time"] = pd.to_datetime(df["time"])
    featured = engineer_features(df)

    if name == "train":
        # Rows without a label cannot be used for fitting.
        featured = featured.dropna(subset=[TARGET])

    # "hour" was only a join key for the cluster means.
    return featured.drop(columns=["hour"], errors="ignore")

# Engineer features on copies of the cluster-enriched frames.
train_processed = process_split(train_cluster_df.copy(), "train")
val_processed = process_split(val_cluster_df.copy(), "val")
test_processed = process_split(test_cluster_df.copy(), "test")

# Cast the identifier columns to pandas categoricals where present.
for df in (train_processed, val_processed, test_processed):
    if "pv_id" in df.columns:
        df["pv_id"] = df["pv_id"].astype("category")
    if "cluster" in df.columns:
        df["cluster"] = df["cluster"].astype("category")


## 5) Train GPU XGBoost and evaluate
Mirrors `src/main_xgb.py` (single model). Uses GPU if available.


In [None]:
def build_model(use_gpu: bool):
    """Create an unfitted ``XGBRegressor`` from the configured parameters.

    Args:
        use_gpu: When true, ``GPU_MODEL_PARAMS`` are layered on top of
            ``BASE_MODEL_PARAMS``.

    Returns:
        The configured regressor.
    """
    overrides = GPU_MODEL_PARAMS if use_gpu else {}
    return XGBRegressor(**{**BASE_MODEL_PARAMS, **overrides})


# logging.info() records are discarded at the root logger's default WARNING
# threshold, which would hide the progress messages and the validation MAE
# reported below. Lower the threshold unless it is already INFO or finer.
_root_logger = logging.getLogger()
if _root_logger.getEffectiveLevel() > logging.INFO:
    _root_logger.setLevel(logging.INFO)

# Probe for a GPU only when one was requested.
gpu = USE_GPU and check_gpu_available()
if USE_GPU and not gpu:
    logging.warning("GPU requested but not available — running on CPU.")

logging.info("Building feature matrices…")
# Two alignments against the same training frame: one for validation, one for
# test. The training matrix kept for fitting is the one from the second call.
train_features, val_features, feature_cols, cat_features = build_feature_matrix(
    train_processed, val_processed
)
train_features, test_features, _, _ = build_feature_matrix(train_processed, test_processed)

# pv_id is not used as a model input.
# NOTE(review): presumably because the split is by pv_id, so validation and
# test identifiers are never seen during training — confirm.
feature_cols = [c for c in feature_cols if c != "pv_id"]
cat_features = [c for c in cat_features if c != "pv_id"]

X_train = train_features[feature_cols]
X_val = val_features[feature_cols]
X_test = test_features[feature_cols]

# Targets are clipped at zero for fitting and early evaluation; the unclipped
# validation target is kept for the final MAE.
y_train = train_processed[TARGET].clip(lower=0).values
y_val = val_processed[TARGET].clip(lower=0).values
y_val_raw = val_processed[TARGET].values

# Encode copies so the categorical frames above stay intact.
xgb_data = {
    k: encode_for_xgb(v.copy(), cat_features)
    for k, v in zip(["xgb_train", "xgb_val", "xgb_test"], [X_train, X_val, X_test])
}

logging.info("Training XGBoost…")
start_time = time.time()
model = build_model(use_gpu=gpu)
model.fit(xgb_data["xgb_train"], y_train, eval_set=[(xgb_data["xgb_val"], y_val)], verbose=100)
# Stop the clock right after fitting so prediction and pickling are not
# counted as training time.
train_seconds = time.time() - start_time
logging.info(f"XGBoost training finished in {train_seconds:.2f}s")

# Predictions are floored at zero, matching the clipped training target.
val_preds = np.maximum(model.predict(xgb_data["xgb_val"]), 0)
test_preds = np.maximum(model.predict(xgb_data["xgb_test"]), 0)

model_path = SAVE_PATH / "model" / "xgboost.pkl"
with open(model_path, "wb") as f:
    pickle.dump(model, f)

mae = mean_absolute_error(y_val_raw, val_preds)
logging.info(f"Validation MAE: {mae:.6f}")


## 6) Inference & submission file
Writes the test-set predictions computed in the previous step into a submission CSV next to the sample file, reusing the sample file's layout when it exists.


In [None]:
submission_path = DATA_ROOT / "submission.csv"

if not SUBMISSION_FILE.exists():
    # No template available: emit a single-column file.
    submission = pd.DataFrame({"prediction": test_preds})
else:
    submission = pd.read_csv(SUBMISSION_FILE)
    # Overwrite "prediction" if the template has it, else "nins"; when neither
    # exists a new "prediction" column is appended.
    _target_column = next(
        (name for name in ("prediction", "nins") if name in submission.columns),
        "prediction",
    )
    submission[_target_column] = test_preds

submission.to_csv(submission_path, index=False)
logging.info(f"Saved submission to {submission_path}")
