# LightGBM

### Setup

In [None]:
import ast
import os

import pandas as pd
from darts.models.forecasting.lgbm import LightGBMModel

from config import ENCODERS, FORECAST_DATES, HORIZON, NUM_SAMPLES, QUANTILES, RANDOM_SEEDS, ROOT
from src.realtime_utils import (
    compute_forecast,
    load_realtime_training_data,
)


  __import__("pkg_resources").declare_namespace(__name__)  # type: ignore


In [3]:
import warnings

# Silence the scikit-learn UserWarning about missing feature names on
# LGBMRegressor input; the filter is restricted to that message and module.
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="sklearn.utils.validation",
    message="X does not have valid feature names, but LGBMRegressor was fitted with feature names",
)

# Load best hyperparameters

In [None]:
def get_best_parameters(csv_path: str) -> dict:
    """
    Return the grid-search configuration with the smallest WIS.

    The CSV at ``csv_path`` is read with pandas. String cells in the
    ``lags_past_covariates`` / ``lags_future_covariates`` columns (when those
    columns exist) are turned back into Python literals, and the
    ``error_flag`` / ``error_msg`` columns are discarded if present.

    The WIS of the selected row is printed and left out of the result; the
    remaining columns are returned as a dict ordered by key.
    """

    def _parse_literal(cell):
        # Only strings hold serialized literals; other cells (e.g. NaN) pass through.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    grid = pd.read_csv(csv_path)

    for lag_column in ("lags_past_covariates", "lags_future_covariates"):
        if lag_column in grid.columns:
            grid[lag_column] = grid[lag_column].apply(_parse_literal)

    # errors="ignore" skips whichever of the two bookkeeping columns is absent.
    grid = grid.drop(columns=["error_flag", "error_msg"], errors="ignore")

    winner = grid.loc[grid["WIS"].idxmin()].to_dict()
    best_wis = winner.pop("WIS")  # reported separately, not part of the parameters

    print(f"WIS of best run: {best_wis:.3f}")
    return dict(sorted(winner.items(), key=lambda item: item[0]))


In [5]:
# Pick the grid-search row with the lowest WIS; the function prints that WIS value.
params = get_best_parameters("gridsearch_lightgbm.csv")

WIS of best run: 447.993


In [6]:
# Separate the notebook-level switches from the grid-search row; whatever is
# left in `params` is forwarded unchanged to the LightGBMModel constructor.
use_covariates = params.pop("use_covariates")
use_encoders = params.pop("use_encoders")
sample_weight = params.pop("sample_weight")  # passed to fit(), not to the constructor

if not use_covariates:
    # get_best_parameters treats the covariate-lag column as optional, so it
    # may be missing from `params`; remove it only if it is there instead of
    # raising KeyError.
    params.pop("lags_past_covariates", None)

# Quantile-likelihood model predicting HORIZON steps per output chunk.
model = LightGBMModel(
    **params,
    output_chunk_length=HORIZON,
    add_encoders=ENCODERS if use_encoders else None,
    likelihood="quantile",
    quantiles=QUANTILES,
    verbose=-1,
)

# Train model

In [45]:
# Fit and save one LightGBM model per random seed for each selected forecast date.
for forecast_date in FORECAST_DATES[0:1]:
    path = f"../models/{forecast_date}/"
    os.makedirs(path, exist_ok=True)

    # Training data as it was available on the forecast date.
    targets, covariates = load_realtime_training_data(as_of=forecast_date)

    for seed in RANDOM_SEEDS:
        model_path = f"{path}{forecast_date}-lightgbm-{seed}.pt"
        print(model_path)

        # Same configuration for every run; only the random seed differs.
        model_kwargs = dict(
            params,
            output_chunk_length=HORIZON,
            add_encoders=ENCODERS if use_encoders else None,
            likelihood="quantile",
            quantiles=QUANTILES,
            verbose=-1,
            random_state=seed,
        )
        model = LightGBMModel(**model_kwargs)

        past_covariates = covariates if use_covariates else None
        model.fit(targets, past_covariates=past_covariates, sample_weight=sample_weight)
        model.save(model_path)

../models/2023-11-16/2023-11-16-lightgbm-1.pt
../models/2023-11-16/2023-11-16-lightgbm-2.pt
../models/2023-11-16/2023-11-16-lightgbm-3.pt
../models/2023-11-16/2023-11-16-lightgbm-4.pt
../models/2023-11-16/2023-11-16-lightgbm-5.pt
../models/2023-11-16/2023-11-16-lightgbm-6.pt
../models/2023-11-16/2023-11-16-lightgbm-7.pt
../models/2023-11-16/2023-11-16-lightgbm-8.pt
../models/2023-11-16/2023-11-16-lightgbm-9.pt
../models/2023-11-16/2023-11-16-lightgbm-10.pt


# Forecast

In [9]:
def compute_ensemble(forecast_date, model_name, export=False):
    """Average the forecasts of all seeded runs of one model for a forecast date.

    For every seed in RANDOM_SEEDS the saved model
    ``../models/<forecast_date>/<forecast_date>-<model_name>-<seed>.pt`` is loaded
    and used to compute a forecast. The forecast values are then averaged across
    runs within each (location, age_group, forecast_date, target_end_date,
    horizon, type, quantile) group.

    Args:
        forecast_date (str): Forecast date (e.g., "2024-06-13").
        model_name (str): Name used in the saved model files (e.g., "lightgbm").
        export (bool, optional): If True, the result is also written as CSV to
            ``ROOT / "forecasts_new/lightgbm_new/"``. Defaults to False.

    Returns:
        pandas.DataFrame: The averaged forecast, sorted by location, age group,
        horizon and quantile.
    """
    group_keys = [
        "location",
        "age_group",
        "forecast_date",
        "target_end_date",
        "horizon",
        "type",
        "quantile",
    ]

    # Incomplete covariates are kept because the most recent values are needed
    # for prediction; incomplete target values get replaced by a nowcast later.
    targets, covariates = load_realtime_training_data(as_of=forecast_date, drop_incomplete=False)

    def _forecast_for_seed(seed):
        # Report progress, then load the stored run for this seed and forecast with it.
        print(seed)
        fitted = LightGBMModel.load(f"../models/{forecast_date}/{forecast_date}-{model_name}-{seed}.pt")
        return compute_forecast(
            fitted,
            targets,
            covariates,
            forecast_date,
            HORIZON,
            NUM_SAMPLES,
            vincentization=False,
            probabilistic_nowcast=True,
            local=True,
        )

    runs = [_forecast_for_seed(seed) for seed in RANDOM_SEEDS]

    # Mean of "value" over all runs within each group.
    ensemble = pd.concat(runs).groupby(group_keys).agg({"value": "mean"}).reset_index()
    ensemble = ensemble.sort_values(["location", "age_group", "horizon", "quantile"])

    if export:
        out_dir = ROOT / "forecasts_new/lightgbm_new/"
        os.makedirs(out_dir, exist_ok=True)
        ensemble.to_csv(out_dir / f"{forecast_date}-icosari-sari-lightgbm_new.csv", index=False)

    return ensemble

In [10]:
# Build (and export) the ensemble forecast for each selected forecast date.
forecasts = []
for forecast_date in FORECAST_DATES[:1]:
    print(forecast_date)
    forecast = compute_ensemble(
        forecast_date=forecast_date,
        model_name="lightgbm",
        export=True,
    )
    forecasts.append(forecast)

2023-11-16
1
2
3
4
5
6
7
8
9
10
