# Create Submission for VN1 forecasting competition

_Created by: [Davide Burba](https://www.linkedin.com/in/davide-burba/)_

In this notebook I included the code used to produce the submission for the 
[vn1 forecasting competition](https://www.datasource.ai/en/users/davide-burba/competitions/phase-2-vn1-forecasting-accuracy-challenge/datathon_detail/rules).

I built this notebook only to submit the code; the submission itself was produced with scripts and several Python modules. Their code is copied below, so the original codebase is probably easier to read than this notebook.

If my submission gets a decent score (I hope so!), I will open-source the code at: [github.com/davide-burba/vn1-forecasting-challenge](https://github.com/davide-burba/vn1-forecasting-challenge)

## Python modules 

The original relative imports are commented out, since all the code lives in this notebook.

**vn1/config.py**

In [None]:
from typing import Any, Literal

import mlflow
import yaml
from pydantic import BaseModel, ConfigDict


class FeatureEng(BaseModel):
    """One feature-engineering step: a ``kind`` plus its parameters."""

    # Name of the transformation; the preprocessor in this file handles
    # "lag" and "rolling".
    kind: str
    # Parameters of the transformation, e.g. {"lag": 1} or
    # {"statistic": "mean", "window": 4}.
    params: dict[str, Any]


class DataFeatureEng(BaseModel):
    """Feature-engineering steps applied to one (optionally aggregated) data source."""

    # Which raw table the features are computed from.
    source: Literal["sales", "price"]
    # ID columns to aggregate over before engineering features; None keeps the
    # series at full (Client, Warehouse, Product) granularity.
    groupby: list[str] | None = None
    # Aggregation applied within each group when ``groupby`` is set.
    group_stat: Literal["sum", "mean", "std", "min", "max"] = "sum"
    # Transformations computed on the resulting time series.
    feature_eng_list: list[FeatureEng]

    # Reject unknown keys so that typos in a configuration fail validation.
    model_config = ConfigDict(extra="forbid")


class StaticFeature(BaseModel):
    """An ID column to keep as a model feature."""

    # Column name; the preprocessor requires it to be one of its ID columns.
    name: str
    # Whether the column is cast to the pandas "category" dtype.
    categorical: bool

    # Reject unknown keys so that typos in a configuration fail validation.
    model_config = ConfigDict(extra="forbid")


class PreprocessingConfig(BaseModel):
    """Everything needed to turn the raw tables into model features."""

    # Time-series features, grouped by data source / aggregation level.
    data_feature_eng_list: list[DataFeatureEng]
    # ID columns kept as features (ID columns not listed here are dropped).
    static_feature_list: list[StaticFeature]
    # Calendar fields extracted from each row's timestamp.
    date_features: list[Literal["year", "month", "day"]]
    # Min-max scale each price series before feature engineering.
    normalize_price: bool = True
    # Scale each sales series by its own min and max before feature engineering.
    normalize_sales: bool = True

    # Reject unknown keys so that typos in a configuration fail validation.
    model_config = ConfigDict(extra="forbid")


class Config(BaseModel):
    """Top-level configuration of a forecasting run."""

    preprocessing_config: PreprocessingConfig
    # Keyword arguments forwarded to ``LGBMRegressor``.
    engine_params: dict[str, Any]

    # True: one model trained on all horizons stacked together.
    # False: one model per horizon.
    multi_horizon: bool = True
    # The flags below are only read when building the multi-horizon dataset.
    # Add the horizon itself as a feature.
    include_horizon_feature: bool = True
    # Add year / month / day of the forecasted date (row time + horizon weeks).
    include_horizon_year: bool = False
    include_horizon_month: bool = False
    include_horizon_day: bool = False

    # Constant factor applied to every model prediction.
    magic_multiplier: float = 1.0

    # Reject unknown keys so that typos in a configuration fail validation.
    model_config = ConfigDict(extra="forbid")


def load_config(path):
    """Load a YAML file and validate it into a ``Config``.

    Args:
        path: Location of the YAML configuration file.

    Returns:
        The validated ``Config`` instance.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(path, encoding="utf-8") as file:
        config_data = yaml.safe_load(file)
    return Config(**config_data)


def track_config_with_mlflow(config):
    """Log the most important attributes of ``config`` as MLflow parameters."""
    # Model hyper-parameters first, then the full preprocessing section.
    mlflow.log_params(config.engine_params)
    preprocessing_params = config.preprocessing_config.model_dump()
    mlflow.log_params(preprocessing_params)

    # Finally the two top-level strategy flags, one parameter each.
    for flag in ("multi_horizon", "include_horizon_feature"):
        mlflow.log_param(flag, getattr(config, flag))


**vn1/data_loading.py**

In [None]:
from pathlib import Path

import pandas as pd


def load_data(phase, path):
    """Load the competition sales and price tables for the given phase.

    Args:
        phase: 1 to load only the "Phase 0" files, 2 to also append the
            columns of the "Phase 1" files.
        path: Folder containing the raw CSV files.

    Returns:
        A ``(sales, price)`` tuple of DataFrames.

    Raises:
        ValueError: If ``phase`` is neither 1 nor 2.
    """
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if phase not in {1, 2}:
        raise ValueError(f"phase must be 1 or 2, got {phase!r}")

    path = Path(path)
    sales_0 = pd.read_csv(path / "Phase 0 - Sales.csv")
    price_0 = pd.read_csv(path / "Phase 0 - Price.csv")

    if phase == 1:
        return sales_0, price_0

    sales_1 = pd.read_csv(path / "Phase 1 - Sales.csv")
    price_1 = pd.read_csv(path / "Phase 1 - Price.csv")

    # The first three columns of the Phase 1 files are skipped (presumably the
    # same ID columns as in Phase 0 -- verify against the data). Rows are
    # aligned on the default integer index produced by ``read_csv``.
    sales = pd.concat([sales_0, sales_1[sales_1.columns[3:]]], axis=1)
    price = pd.concat([price_0, price_1[price_1.columns[3:]]], axis=1)

    return sales, price


**vn1/score.py**

In [None]:
import numpy as np


def compute_competition_score(submission, objective):
    """Return the competition score of ``submission`` against ``objective``.

    The score is the sum of absolute errors plus the absolute value of the
    summed (signed) errors, divided by the total of ``objective``. NaN errors
    are ignored in both sums.
    """
    residuals = submission - objective
    total_abs_error = np.nansum(np.abs(residuals))
    total_bias = np.nansum(residuals)
    return (total_abs_error + abs(total_bias)) / objective.sum().sum()


**vn1/preprocessing.py**

In [None]:
import pandas as pd

# Columns that together identify one time series in the sales/price tables.
ID_COLS = ("Client", "Warehouse", "Product")
# Forecast horizons, in time steps ahead (1 through 13).
HORIZONS = tuple(range(1, 14))


class Preprocessor:
    def __init__(self, static_feature_list, data_feature_eng_list, date_features):
        for col in static_feature_list:
            assert col.name in ID_COLS
        self.static_feature_map = {feat.name: feat for feat in static_feature_list}
        self.data_feature_eng_list = data_feature_eng_list
        self.date_features = date_features

    def prepare_data(self, sales, price):
        targets = self.prepare_targets(sales)
        features = self.prepare_features(sales, price)

        return targets, features

    def prepare_targets(self, sales):
        sales_ts = sales.set_index(list(ID_COLS))
        all_targets = {}
        for horizon in HORIZONS:
            df = (
                build_lagged_df(sales_ts, -horizon)
                .melt(ignore_index=False)
                .reset_index()
                .rename(columns={"variable": "time"})
            )
            df["time"] = pd.to_datetime(df["time"])
            all_targets[horizon] = df

        return all_targets

    def prepare_features(self, sales, price):
        all_features = []
        for data_feature_eng in self.data_feature_eng_list:
            # set time-series data on which to apply feature engineering
            data_name = self._build_data_name(
                data_feature_eng.source, data_feature_eng.groupby
            )
            match data_feature_eng.source:
                case "sales":
                    df_input = sales
                case "price":
                    df_input = price

            df_ts = self._group_timeseries(
                df_input,
                data_feature_eng.groupby,
                data_feature_eng.group_stat,
            )

            # engineer features
            for feat_eng in data_feature_eng.feature_eng_list:
                feat = self._apply_feat_eng(
                    feat_eng.kind, feat_eng.params, df_ts, data_name
                )
                all_features.append(feat)

        features = self._merge_all_features(all_features, sales)

        # Static features
        for col in ID_COLS:
            # drop if not listed
            if col not in self.static_feature_map:
                features = features.drop(columns=col)
            # convert to category if specified
            elif self.static_feature_map[col].categorical:
                features[col] = features[col].astype("category")

        # Date features
        for date_feat in self.date_features:
            match date_feat:
                case "year":
                    features["year"] = features["time"].dt.year
                case "month":
                    features["month"] = features["time"].dt.month
                case "day":
                    features["day"] = features["time"].dt.day
                case _:
                    raise ValueError(f"Unknown date feature: {date_feat}")

        return features.drop(columns=["time"])

    def _apply_feat_eng(self, kind, params, df_ts, data_name):
        match kind:
            case "lag":
                lag = params["lag"]
                feature_name = f"{data_name}_lag-{lag}"
                feat = build_lagged_df(df_ts, lag)
            case "rolling":
                window = params["window"]
                statistic = params["statistic"]
                feature_name = f"{data_name}_rolling-{statistic}_w{window}"
                feat = build_rolling_statistic_df(df_ts, window, statistic)
            case _:
                raise ValueError(f"Unknown feature engineering kind: {kind}")

        return self._format_engineered_feature(feat, feature_name)

    def _merge_all_features(self, all_features, sales):
        join_df = (
            sales.set_index(list(ID_COLS))
            .melt(ignore_index=False)
            .reset_index()
            .rename(columns={"variable": "time"})[list(ID_COLS) + ["time"]]
        )
        join_df["time"] = pd.to_datetime(join_df["time"])

        # merge all with join_df
        features_df = join_df.copy()
        for feat in all_features:
            for_join, join_on = self._prepare_for_join(feat)
            features_df = features_df.merge(for_join, on=join_on, how="left")

        return features_df

    def _group_timeseries(
        self,
        df: pd.DataFrame,
        groupby: list[str] | None = None,
        group_stat: str = "sum",
    ) -> pd.DataFrame:
        ts = df.copy()

        index_cols = groupby or list(ID_COLS)

        if groupby is not None:
            for col in groupby:
                assert col in ID_COLS

            cols_to_drop = [c for c in ID_COLS if c not in groupby]
            ts = ts.drop(columns=cols_to_drop)

            # ts = ts.groupby(groupby).sum().reset_index()
            ts = ts.groupby(groupby).agg(group_stat).reset_index()

        return ts.set_index(index_cols)

    def _prepare_for_join(self, df):
        join_on = list(df.index.names) + ["time"]
        df_for_join = df.reset_index()
        return df_for_join, join_on

    def _build_data_name(self, source, groupby):
        return f"{source}_{'' if groupby is None else '-'.join(groupby)}"

    def _format_engineered_feature(self, feat, feature_name):
        feat = feat.melt(ignore_index=False)
        feat = feat.rename(columns={"value": feature_name, "variable": "time"})
        feat["time"] = pd.to_datetime(feat["time"])
        return feat


def build_lagged_df(df, lag=0):
    """Shift the values of ``df`` by ``lag`` positions along its columns.

    A positive ``lag`` moves values towards later columns, a negative one
    towards earlier columns; vacated cells become NaN.
    """
    shifted = df.shift(periods=lag, axis="columns")
    return shifted


def build_rolling_statistic_df(df, window=4, statistic="mean"):
    """Apply a rolling ``statistic`` over ``window`` consecutive columns of ``df``.

    The table is transposed so that the rolling window runs along the original
    columns, then transposed back to the original layout.
    """
    by_time = df.transpose()
    rolled = by_time.rolling(window).aggregate(statistic)
    return rolled.transpose()


**vn1/submission.py**

In [None]:
import mlflow
import pandas as pd

# from vn1.preprocessing import ID_COLS


def validate_submission(submission, example_sumbission_path):
    """Check that ``submission`` has the same layout as the example submission.

    Args:
        submission: DataFrame to validate.
        example_sumbission_path: Path of the example submission CSV (the
            misspelled parameter name is kept for backward compatibility).

    Raises:
        AssertionError: If the column labels differ, or if the ID columns do
            not match those of the example submission.
    """
    example_submission = pd.read_csv(example_sumbission_path)

    # Compare as lists: an element-wise ``==`` between Index objects of
    # different lengths raises ValueError instead of reporting a mismatch, and
    # a bare ``assert`` would be skipped entirely under ``python -O``.
    if list(submission.columns) != list(example_submission.columns):
        raise AssertionError(
            "Submission columns do not match the example submission."
        )

    base_cols = list(ID_COLS)
    pd.testing.assert_frame_equal(submission[base_cols], example_submission[base_cols])


def track_submission_with_mlflow(submission):
    """Log ``submission`` as a CSV artifact of the active MLflow run.

    The CSV is written to a temporary directory that is removed afterwards,
    instead of being left behind under a hard-coded ``/tmp`` path. The
    artifact keeps the file name ``submission_<run_id>.csv``.
    """
    import tempfile
    from pathlib import Path

    run_id = mlflow.active_run().info.run_id
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = Path(tmp_dir) / f"submission_{run_id}.csv"
        submission.to_csv(path, index=False)
        mlflow.log_artifact(str(path))


**vn1/forecaster.py**

In [None]:
import os

import mlflow
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.preprocessing import MinMaxScaler

# from vn1.preprocessing import HORIZONS, ID_COLS, Preprocessor
# from vn1.score import compute_competition_score


class Forecaster:
    """LightGBM forecaster producing one prediction per horizon in ``HORIZONS``.

    Depending on ``config.multi_horizon`` it trains either a single model on
    the stacked data of all horizons, or one model per horizon.
    """

    def __init__(self, config):
        """Store ``config``; model and normalization state are set later."""
        self.config = config

        self._model = None
        self._price_scaler = None
        self._sales_norm_data = None

    def build_future_predictions(self, sales, price):
        """Train on all available data and predict the steps after the last one.

        Args:
            sales: Wide sales table (ID columns followed by time columns).
            price: Wide price table with the same layout.

        Returns:
            DataFrame with the ID columns and one column per forecasted date.
        """
        print("preprocess")
        # Work on copies: normalization writes into the tables.
        sales, price = sales.copy(), price.copy()
        sales, price = self._normalize_sales_and_price(sales, price)
        targets, features = self._prepare_targets_and_features(sales, price)

        if self.config.multi_horizon:
            future_predictions = self._predict_multi_horizon(targets, features)
        else:
            future_predictions = self._predict_by_horizon(targets, features)

        return self._inverse_normalize(future_predictions)

    def cross_validate(self, sales, price, n_folds=4, test_size=0.05):
        """Run time-based cross-validation and log the scores to MLflow.

        Args:
            sales: Wide sales table.
            price: Wide price table.
            n_folds: Number of consecutive test windows, most recent first.
            test_size: Width of each test window, as a quantile of the
                time stamps.
        """
        print("preprocess")
        sales, price = sales.copy(), price.copy()
        # slight leakage by using min/max of all data
        sales, price = self._normalize_sales_and_price(sales, price)
        targets, features = self._prepare_targets_and_features(sales, price)

        if self.config.multi_horizon:
            self._cross_validate_multi_horizon(features, targets, n_folds, test_size)
        else:
            self._cross_validate_by_horizon(features, targets, n_folds, test_size)

    def _predict_multi_horizon(
        self,
        targets: dict[int, pd.DataFrame],
        features: pd.DataFrame,
    ):
        """Fit one model on all horizons and predict from the last time step."""
        print("Training multi horizon")
        # prepare data in x,y format
        multi_h_target, multi_h_features = self._prepare_data_multi_horizon(
            targets, features
        )

        # train model
        # Important: drop the target NA, otherwise treated as 0!
        print("Training model")
        mask_na = multi_h_target.value.isna()
        multi_h_target_train = multi_h_target[~mask_na].reset_index(drop=True)
        multi_h_features_train = multi_h_features[~mask_na].reset_index(drop=True)
        self._model = LGBMRegressor(**self.config.engine_params)
        self._model.fit(multi_h_features_train, multi_h_target_train.value)

        # inference: the rows of the last time step, once per horizon
        print("Inference")
        last_timestep = multi_h_target.time.max()
        mask = multi_h_target.time == last_timestep
        inference_features = multi_h_features[mask].reset_index(drop=True)
        inference_predictions = (
            multi_h_target[mask].drop(columns=["value", "time"]).reset_index(drop=True)
        )

        inference_predictions["y_pred"] = self._predict_with_magic_multiplier(
            inference_features
        )

        # format output: one ``pred_<horizon>`` column per horizon
        print("Formatting output")
        id_cols = list(ID_COLS)
        future_predictions = targets[1][targets[1].time == last_timestep][
            id_cols
        ].reset_index(drop=True)
        for horizon in HORIZONS:
            pred_horizon = inference_predictions[
                inference_predictions.horizon == horizon
            ].drop(columns=["horizon"])

            future_predictions = pd.merge(
                future_predictions,
                pred_horizon,
                on=id_cols,
                how="left",
            ).rename(columns={"y_pred": f"pred_{horizon}"})

        return self._format_inference(future_predictions, last_timestep)

    def _predict_with_magic_multiplier(self, features: pd.DataFrame):
        """Predict with the current model and scale by ``magic_multiplier``."""
        return self._model.predict(features) * self.config.magic_multiplier

    def _prepare_data_multi_horizon(self, targets, features):
        """Stack targets and features of all horizons into one dataset.

        Returns:
            ``(multi_h_target, multi_h_features)``, row-aligned. The target
            table always carries a ``horizon`` column; the feature table gets
            the horizon columns enabled in the config.
        """
        cfg = self.config
        needs_horizon_date = (
            cfg.include_horizon_year
            or cfg.include_horizon_month
            or cfg.include_horizon_day
        )

        # prepare data in x,y format
        multi_h_features_list = []
        multi_h_target_list = []
        for horizon in HORIZONS:
            # build target
            targets_h = targets[horizon].copy()
            targets_h["horizon"] = horizon
            multi_h_target_list.append(targets_h)

            # build features
            features_h = features.copy()

            if cfg.include_horizon_feature:
                features_h["horizon"] = horizon

            # Only compute the forecasted date when a feature needs it.
            if needs_horizon_date:
                horizon_date = targets_h.time + pd.offsets.Week(horizon)
                if cfg.include_horizon_year:
                    features_h["horizon_year"] = horizon_date.dt.year
                if cfg.include_horizon_month:
                    features_h["horizon_month"] = horizon_date.dt.month
                if cfg.include_horizon_day:
                    features_h["horizon_day"] = horizon_date.dt.day

            multi_h_features_list.append(features_h)

        multi_h_target = pd.concat(multi_h_target_list).reset_index(drop=True)
        multi_h_features = pd.concat(multi_h_features_list).reset_index(drop=True)

        return multi_h_target, multi_h_features

    def _predict_by_horizon(
        self,
        targets: dict[int, pd.DataFrame],
        features: pd.DataFrame,
    ):
        """Fit one model per horizon and predict from the last time step."""
        print("Training by horizon")

        last_timestep = targets[1].time.max()
        mask = targets[1].time == last_timestep
        id_cols = list(ID_COLS)
        future_predictions = targets[1][mask][id_cols].reset_index(drop=True)

        for horizon in HORIZONS:
            targets_h = targets[horizon]

            # Drop rows without a target before fitting, exactly as the
            # multi-horizon path does (NaN targets would otherwise be
            # treated as 0). The last ``horizon`` time steps never have one.
            has_target = targets_h.value.notna()
            self._model = LGBMRegressor(**self.config.engine_params)
            self._model.fit(features[has_target], targets_h.value[has_target])

            mask = targets_h.time == last_timestep
            inference_features = features[mask].reset_index(drop=True)
            inference_targets = (
                targets_h[mask]
                .reset_index(drop=True)
                .drop(columns=["value", "time"])
            )
            inference_targets["y_pred"] = self._predict_with_magic_multiplier(
                inference_features
            )
            future_predictions = pd.merge(
                future_predictions,
                inference_targets,
                on=id_cols,
                how="left",
            ).rename(columns={"y_pred": f"pred_{horizon}"})

        return self._format_inference(future_predictions, last_timestep)

    def _format_inference(self, future_predictions, last_timestep):
        """Rename ``pred_<h>`` columns to the date ``h`` weeks after ``last_timestep``."""
        return future_predictions.rename(
            columns={
                f"pred_{h}": (
                    pd.Timestamp(last_timestep) + pd.offsets.Week(h)
                ).strftime("%Y-%m-%d")
                for h in HORIZONS
            }
        ).reset_index(drop=True)

    def _cross_validate_multi_horizon(self, features, targets, n_folds, test_size):
        """Cross-validate the single multi-horizon model; log scores per fold."""
        print("CV multi horizon")
        # prepare data in x,y format
        multi_h_target, multi_h_features = self._prepare_data_multi_horizon(
            targets, features
        )
        # Important: drop the target NA, otherwise treated as 0!
        mask_na = multi_h_target.value.isna()
        multi_h_target = multi_h_target[~mask_na].reset_index(drop=True)
        multi_h_features = multi_h_features[~mask_na].reset_index(drop=True)

        # Time-based cross-validation, walking backwards from the latest data.
        end_quantile = 1
        delta_quantile = test_size
        scores = []
        for fold in range(n_folds):
            # Split data by time.
            # Not exactly equivalent as by-horizon split, the most recent fold
            # will have more short and less far horizons.
            # Probably negligible.
            end_timestep = multi_h_target.time.quantile(end_quantile)
            start_timestep = multi_h_target.time.quantile(end_quantile - delta_quantile)
            print(f"Fold {fold} start: {start_timestep}, end: {end_timestep}")

            mask_train = multi_h_target.time <= start_timestep
            mask_test = (multi_h_target.time > start_timestep) & (
                multi_h_target.time <= end_timestep
            )
            x_train = multi_h_features[mask_train]
            y_train = multi_h_target[mask_train]
            x_test = multi_h_features[mask_test]
            y_test = multi_h_target[mask_test].reset_index(drop=True)

            # Train model.
            self._model = LGBMRegressor(**self.config.engine_params)
            self._model.fit(x_train, y_train.value)
            y_pred = self._predict_with_magic_multiplier(x_test)

            # Inverse normalization by time-series.
            y_pred, y_test = self._inverse_normalize_cv(y_pred, y_test)

            # Evaluate.
            score = compute_competition_score(y_pred, y_test)
            print(f"Fold {fold} score: {score}")
            mlflow.log_metric(f"score_fold_{fold}", score)
            log_cv_predictions(y_pred, y_test, fold)
            end_quantile -= delta_quantile
            scores.append(score)

        avg_score = sum(scores) / len(scores)
        print(f"Avg score: {avg_score}")
        mlflow.log_metric("score_avg", avg_score)

    def _cross_validate_by_horizon(self, features, targets, n_folds, test_size):
        """Cross-validate one model per horizon; log scores per horizon and fold."""
        end_quantile = 1
        delta_quantile = test_size
        features = features.reset_index(drop=True)
        scores = []
        for fold in range(n_folds):
            fold_scores = []
            for horizon in HORIZONS:
                targets_h = targets[horizon].reset_index(drop=True)

                # Drop rows without a target so that no NaN label reaches the
                # model, consistently with the multi-horizon cross-validation.
                # Such rows did not contribute to the score anyway (NaN errors
                # are ignored by the metric).
                has_target = targets_h.value.notna()
                targets_h = targets_h[has_target].reset_index(drop=True)
                features_h = features[has_target].reset_index(drop=True)

                # split train/test by time
                timesteps_no_na = targets_h.dropna().time
                end_timestep = timesteps_no_na.quantile(end_quantile)
                start_timestep = timesteps_no_na.quantile(end_quantile - delta_quantile)

                mask_train = targets_h.time <= start_timestep
                mask_test = (targets_h.time > start_timestep) & (
                    targets_h.time <= end_timestep
                )
                x_train = features_h[mask_train]
                y_train = targets_h[mask_train]
                x_test = features_h[mask_test]
                y_test = targets_h[mask_test].reset_index(drop=True)

                # Train model.
                self._model = LGBMRegressor(**self.config.engine_params)
                self._model.fit(x_train, y_train.value)
                y_pred = self._predict_with_magic_multiplier(x_test)

                # Inverse normalization by time-series.
                y_pred, y_test = self._inverse_normalize_cv(y_pred, y_test)

                score = compute_competition_score(y_pred, y_test)
                mlflow.log_metric(f"score_horizon_{horizon}_fold_{fold}", score)
                scores.append(score)
                fold_scores.append(score)

            avg_fold_score = sum(fold_scores) / len(fold_scores)
            mlflow.log_metric(f"score_fold_{fold}", avg_fold_score)

            end_quantile -= delta_quantile
        avg_score = sum(scores) / len(scores)
        mlflow.log_metric("score_avg", avg_score)

    def _normalize_sales_and_price(self, sales, price):
        """Normalize each series in place, as enabled in the preprocessing config.

        The first three columns of both tables are treated as ID columns.

        Returns:
            The (possibly modified) ``(sales, price)`` tables.
        """
        if self.config.preprocessing_config.normalize_price:
            # Transposed so that the scaler works per series rather than per date.
            self._price_scaler = MinMaxScaler()
            price[price.columns[3:]] = self._price_scaler.fit_transform(
                price[price.columns[3:]].T
            ).T

        if self.config.preprocessing_config.normalize_sales:
            # Manual normalization, makes it easier to use for cross-validation.
            sales_norm_data = sales[sales.columns[:3]].copy()
            sales_norm_data["min"] = sales[sales.columns[3:]].T.min()
            sales_norm_data["max"] = sales[sales.columns[3:]].T.max()
            sales_norm_data["delta"] = sales_norm_data["max"] - sales_norm_data["min"]

            # NOTE(review): a constant series has delta == 0, so its
            # normalized values become NaN (0 / 0).
            sales[sales.columns[3:]] = (
                (sales[sales.columns[3:]].T - sales_norm_data["min"])
                / sales_norm_data["delta"]
            ).T.values

            self._sales_norm_data = sales_norm_data

        return sales, price

    def _inverse_normalize(self, predictions):
        """Map predictions back to the sales scale and clip negatives to 0.

        Args:
            predictions: DataFrame whose first three columns are the ID
                columns and whose remaining columns hold the predictions.

        Returns:
            The same DataFrame, modified in place.
        """
        value_cols = predictions.columns[3:]
        if self.config.preprocessing_config.normalize_sales:
            # Rows must be in the same order as the normalization data.
            pd.testing.assert_frame_equal(
                predictions[list(ID_COLS)], self._sales_norm_data[list(ID_COLS)]
            )
            predictions[value_cols] = (
                (predictions[value_cols].T * self._sales_norm_data["delta"])
                + self._sales_norm_data["min"]
            ).T
        # clip negative values -- only in the prediction columns, so that the
        # ID columns are never compared with 0 (which fails for string IDs).
        predictions[value_cols] = predictions[value_cols].clip(lower=0)
        return predictions

    def _inverse_normalize_cv(self, y_pred, test_df):
        """Map cross-validation predictions and targets back to the sales scale.

        Args:
            y_pred: Model predictions, row-aligned with ``test_df``.
            test_df: Test targets with the ID columns and a ``value`` column.

        Returns:
            ``(y_pred, y_test)`` on the original scale, with negative
            predictions clipped to 0.
        """
        y_test = test_df["value"]
        if self.config.preprocessing_config.normalize_sales:
            test_norm_data = pd.merge(
                test_df,
                self._sales_norm_data,
                on=list(ID_COLS),
                how="left",
            )
            y_pred = y_pred * test_norm_data["delta"] + test_norm_data["min"]
            y_test = y_test * test_norm_data["delta"] + test_norm_data["min"]
        # clip negative values
        y_pred[y_pred < 0] = 0
        return y_pred, y_test

    def _prepare_targets_and_features(self, sales, price):
        """Build targets and features with a ``Preprocessor`` set up from the config."""
        preprocessor = Preprocessor(
            self.config.preprocessing_config.static_feature_list,
            self.config.preprocessing_config.data_feature_eng_list,
            self.config.preprocessing_config.date_features,
        )
        return preprocessor.prepare_data(sales, price)


def log_cv_predictions(y_pred, y_test, fold):
    """Log the predictions and targets of one fold as a pickled MLflow artifact.

    The pickle is written to a temporary directory, which is removed even if
    logging fails. The artifact keeps the file name
    ``predictions_<run_id>_fold_<fold>.p``.
    """
    import tempfile

    run_id = mlflow.active_run().info.run_id
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, f"predictions_{run_id}_fold_{fold}.p")
        pd.to_pickle((y_pred, y_test), path)
        mlflow.log_artifact(path)


## Scripts

These scripts have been slightly edited to make them runnable within the notebook.

In [None]:
# Edit this path to point to the folder with the raw input files
# Folder read by ``load_data``; it must contain the "Phase 0" and "Phase 1"
# sales and price CSV files.
PATH_RAW_DATA = Path("../data/raw/")

**scripts/run_phase_2.py**

This script was used to cross-validate multiple configurations, in order to choose the best combination of feature selection, feature engineering, and hyper-parameter tuning.

In [None]:
import os
from pathlib import Path

import mlflow
from fire import Fire

# from vn1.config import load_config, track_config_with_mlflow
# from vn1.data_loading import load_data
# from vn1.forecaster import Forecaster
# from vn1.submission import track_submission_with_mlflow


def run_phase_2(
    config: Config,
    skip_cross_validation: bool = False,
    dump_submission_locally: bool = False,
):
    """Cross-validate one configuration and build its phase-2 submission.

    Everything runs inside a single MLflow run of the "phase_2" experiment.
    In the original script (entry point ``main``) the configuration was loaded
    from a YAML path and that file was also logged as an artifact; here the
    already-built ``config`` is passed in directly.

    Args:
        config: Configuration of the forecaster.
        skip_cross_validation: Skip the cross-validation step.
        dump_submission_locally: Also write the submission under
            ``submissions/``.
    """
    mlflow.set_experiment("phase_2")
    with mlflow.start_run():
        print("load inputs")
        sales, price = load_data(phase=2, path=PATH_RAW_DATA)
        track_config_with_mlflow(config)

        forecaster = Forecaster(config)

        run_cross_validation = not skip_cross_validation
        if run_cross_validation:
            print("cross validate")
            forecaster.cross_validate(sales, price)

        print("build predictions")
        submission = forecaster.build_future_predictions(sales, price)

        print("dump")
        if dump_submission_locally:
            # Local copy, named after the MLflow run that produced it.
            run_id = mlflow.active_run().info.run_id
            os.makedirs("submissions", exist_ok=True)
            local_path = f"submissions/submission_phase_2_{run_id}.csv"
            submission.to_csv(local_path, index=False)
        track_submission_with_mlflow(submission)


# if __name__ == "__main__":
#    Fire(main)


**scripts/run_phase_2_ensemble.py**

This script was used to ensemble multiple predictions using the same configuration but varying the random seed.

In [None]:
import os
from pathlib import Path

import mlflow
from fire import Fire

# from vn1.config import load_config, track_config_with_mlflow
# from vn1.data_loading import load_data
# from vn1.forecaster import Forecaster
# from vn1.preprocessing import ID_COLS
# from vn1.submission import track_submission_with_mlflow


def run_phase_2_ensemble(
    config: Config,
    n_estimators: int = 10,
    dump_submission_locally: bool = True,
):
    """Average the phase-2 predictions of several models differing only by seed.

    Everything runs inside a single MLflow run of the "phase_2_ensemble"
    experiment. In the original script (entry point ``main``) the
    configuration was loaded from a YAML path and that file was also logged as
    an artifact; here the already-built ``config`` is passed in directly.

    Args:
        config: Configuration shared by all models. It is not modified.
        n_estimators: Number of models to train, using seeds
            ``0 .. n_estimators - 1``.
        dump_submission_locally: Also write each per-seed submission and the
            averaged one under ``submissions/submission-ensemble-<run_id>/``.

    Raises:
        ValueError: If ``n_estimators`` is smaller than 1.
    """
    # Fail before starting a run instead of dividing by zero at the end.
    if n_estimators < 1:
        raise ValueError(f"n_estimators must be at least 1, got {n_estimators}")

    mlflow.set_experiment("phase_2_ensemble")

    with mlflow.start_run():
        print("load inputs")
        sales, price = load_data(phase=2, path=PATH_RAW_DATA)
        track_config_with_mlflow(config)

        run_id = mlflow.active_run().info.run_id
        if dump_submission_locally:
            output_folder = Path(f"submissions/submission-ensemble-{run_id}")
            os.makedirs(output_folder, exist_ok=True)

        submissions = []
        for seed in range(n_estimators):
            # Set the seed on a deep copy so the caller's config (and its
            # ``engine_params`` dict) is left untouched.
            seed_config = config.model_copy(deep=True)
            seed_config.engine_params["seed"] = seed
            forecaster = Forecaster(seed_config)

            print("build predictions")
            submission = forecaster.build_future_predictions(sales, price)

            print("dump")
            if dump_submission_locally:
                submission.to_csv(
                    output_folder / f"submission_seed_{seed}.csv",
                    index=False,
                )
            submissions.append(submission)

        # Average the predictions, aligned on the ID columns.
        id_cols = list(ID_COLS)
        submission = sum(df.set_index(id_cols) for df in submissions) / n_estimators
        submission = submission.reset_index()

        track_submission_with_mlflow(submission)
        if dump_submission_locally:
            submission.to_csv(
                output_folder / "submission_ensemble.csv",
                index=False,
            )


# if __name__ == "__main__":
#     Fire(main)


## Build submission

**Final configuration**

In [None]:
def _lag_features(lags):
    """Build one "lag" feature-engineering entry per value in ``lags``."""
    return [{"kind": "lag", "params": {"lag": lag}} for lag in lags]


def _rolling_features(statistics, windows):
    """Build "rolling" entries: every statistic, each over all ``windows``."""
    return [
        {"kind": "rolling", "params": {"statistic": statistic, "window": window}}
        for statistic in statistics
        for window in windows
    ]


_SHORT_LAGS = (0, 1, 2, 3)
_LONG_LAGS = (0, 1, 2, 3, 4, 5, 6, 7, 11, 15, 19, 25, 51, 103)
_ROLLING_WINDOWS = (4, 8, 12, 26, 52, 104)

config_data = {
    # lightgbm params
    "engine_params": {
        "colsample_bytree": 0.6,
        "learning_rate": 0.033932217718953266,
        "max_depth": 11,
        "min_child_samples": 25,
        "n_estimators": 488,
        "num_leaves": 220,
        "subsample": 0.5,
        "verbose": 0,
    },
    # multi horizon strategy
    "multi_horizon": True,
    # multi-horizon features
    "include_horizon_feature": True,
    "include_horizon_day": True,
    "include_horizon_month": True,
    "include_horizon_year": True,
    # chase the metric: const to multiply predictions
    # which minimize the cross-validated scores
    "magic_multiplier": 0.9955,
    # preprocessing
    "preprocessing_config": {
        "normalize_price": False,
        "normalize_sales": False,
        "date_features": ["year", "month", "day"],
        "static_feature_list": [
            {"categorical": True, "name": "Client"},
            {"categorical": True, "name": "Warehouse"},
            {"categorical": True, "name": "Product"},
        ],
        # sales feature engineering
        "data_feature_eng_list": [
            # individual series: long lags plus rolling mean / std / max
            {
                "feature_eng_list": _lag_features(_LONG_LAGS)
                + _rolling_features(("mean", "std", "max"), _ROLLING_WINDOWS),
                "groupby": None,
                "source": "sales",
            },
            # sales aggregated by warehouse: short lags plus rolling mean / std
            {
                "feature_eng_list": _lag_features(_SHORT_LAGS)
                + _rolling_features(("mean", "std"), _ROLLING_WINDOWS),
                "groupby": ["Warehouse"],
                "source": "sales",
            },
            # sales aggregated by client: long lags
            {
                "feature_eng_list": _lag_features(_LONG_LAGS),
                "groupby": ["Client"],
                "source": "sales",
            },
            # sales aggregated by product: short lags
            {
                "feature_eng_list": _lag_features(_SHORT_LAGS),
                "groupby": ["Product"],
                "source": "sales",
            },
        ],
    },
}

In [None]:
# Validate the raw dictionary into a typed ``Config``; unknown keys are rejected.
config = Config(**config_data)

When running it as a script, the following command took about 6 hours (12 minutes per model) on a recent MacBook.

Note that executing the notebook creates the mlflow runs and the submission outputs inside the notebook's folder rather than at the project root.

In [None]:
# Train 3 models (seeds 0, 1 and 2) and average their predictions into the
# final submission.
run_phase_2_ensemble(config=config, n_estimators = 3)