In [9]:
import os

from pandas.core.interchange.dataframe_protocol import DataFrame

# Set early so the flag is in place before the PyTorch-backed imports in the next cell.
# PYTORCH_ENABLE_MPS_FALLBACK=1 is PyTorch's switch for falling back to CPU when an op is
# not implemented on Apple's MPS backend (NOTE(review): confirm it is still required).
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

%load_ext autoreload
%autoreload 2

In [10]:
import numpy as np
from coreforecast.rolling import rolling_mean
from mlforecast.lag_transforms import RollingMean
from sklearn.metrics import mean_squared_error
import pandas as pd
from utilsforecast.plotting import plot_series
import numpy as np
import pandas as pd
import seaborn as sns
from coreforecast.lag_transforms import ExpandingMean
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from utilsforecast.plotting import plot_series
from mlforecast import MLForecast
from mlforecast.target_transforms import Differences
from sklearn.linear_model import LinearRegression
import re


from mlforecast import MLForecast
from mlforecast.target_transforms import Differences
from mlforecast.utils import PredictionIntervals
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from mlforecast import MLForecast
from mlforecast.core import TimeSeries
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

from neuralforecast.models import NHITS  # or RNN, TCN, NBEATS, etc.
from neuralforecast import NeuralForecast
from neuralforecast.models import NBEATS
from neuralforecast.models import RNN
from neuralforecast.models import LSTM
from neuralforecast.models import TCN

# Data

In [11]:
# Set the paths
# The folder can be overridden with the FORECAST_DATA_PATH environment variable so the
# notebook can run on another machine; the default is the original local folder.
data_path = os.environ.get(
    "FORECAST_DATA_PATH",
    "/Users/adi.k/Desktop/Reichman/Advanced ML/Final project/Data/",
)

# Load CSVs (os.path.join tolerates a folder given with or without a trailing separator)
train = pd.read_csv(os.path.join(data_path, "train.csv"), parse_dates=["date"])
calendar = pd.read_csv(os.path.join(data_path, "calendar_events.csv"), parse_dates=["date"])
submission = pd.read_csv(os.path.join(data_path, "forecast_submission.csv"))

In [12]:
def preprocess(df, calendar, first_timestep = 0):
    """Convert a raw sales frame into the long format used by the forecasting cells.

    Steps: drop rows containing NaN, rename ``store_name``/``date``/``revenue`` to
    ``unique_id``/``ds``/``y``, sort by date, replace zero revenue with a per-store
    linear interpolation, add an integer ``timestep`` per distinct date, left-join
    the calendar events, one-hot encode them and sanitize the column names.

    Parameters:
    - df: raw frame. Columns missing from the rename map are left untouched, and the
      interpolation step is skipped when there is no ``y`` column after renaming.
    - calendar: frame with ``date`` and ``event`` columns.
    - first_timestep: value assigned to the earliest date found in ``df``.

    Returns a new DataFrame ordered by ``ds``; the input frames are not modified.
    """
    df = df.dropna()

    # Rename columns
    df = df.rename(columns={'store_name': 'unique_id', 'date': 'ds', 'revenue': 'y'})

    # Sort chronologically *before* interpolating: linear interpolation treats the rows
    # of each store as equally spaced in their current order, so filling the gaps on
    # unsorted rows would blend values from unrelated dates.
    df = df.sort_values('ds')

    # Interpolate the zero-values (xmas)
    if 'y' in df.columns:
        df.loc[df['y'] == 0, 'y'] = np.nan
        df['y'] = df.groupby('unique_id')['y'].transform(lambda s: s.interpolate(method='linear'))

    # Timestep columns: consecutive integers over the distinct dates, offset by first_timestep
    date_to_timestep = {date: i + first_timestep for i, date in enumerate(sorted(df['ds'].unique()))}
    df['timestep'] = df['ds'].map(date_to_timestep)

    # Merge with calendar events
    df = df.merge(calendar.rename(columns={'date': 'ds'}), on='ds', how='left')
    df['event'] = df['event'].fillna('None')

    # Get all known event types (including 'None')
    all_events = calendar['event'].dropna().unique().tolist()
    all_events.append('None')

    # Convert to categorical with fixed categories so every call yields the same dummy columns
    df['event'] = pd.Categorical(df['event'], categories=all_events)

    # One-hot encode
    df = pd.get_dummies(df, columns=['event'], prefix='event')

    # Sanitize column names
    df.columns = [clean_column(col) for col in df.columns]

    return df


def clean_column(name):
    """Return `name` with every run of non-word characters collapsed into one underscore."""
    non_word_run = re.compile(r'\W+')
    return non_word_run.sub('_', name)


def plot_df(df, count = 4):
    """Plot revenue over time for the `count` stores that have the most rows in `df`.

    Parameters:
    - df: long-format frame with `unique_id`, `ds` and `y` columns
    - count: maximum number of stores to draw (the 'All Stores' aggregate is skipped)

    Draws one subplot per store and shows the figure; returns nothing.
    """
    store_counts = df['unique_id'].value_counts()
    top_stores = store_counts[store_counts.index != 'All Stores'].head(count).index.tolist()

    if not top_stores:
        # plt.subplots rejects a zero-row grid, so there is nothing to draw.
        return

    # squeeze=False keeps `axs` two-dimensional even for a single store; without it
    # plt.subplots returns a bare Axes and `axs[i]` fails when only one store is plotted.
    fig, axs = plt.subplots(len(top_stores), 1, figsize=(12, 10), sharex=True, squeeze=False)
    axs = axs[:, 0]

    for i, store in enumerate(top_stores):
        sns.lineplot(data=df[df['unique_id'] == store], x='ds', y='y', ax=axs[i])
        axs[i].set_title(f'Store: {store}')
        axs[i].set_ylabel('Revenue')

    plt.xlabel('Date')
    plt.tight_layout()
    plt.show()


def plot_forecasts(df_actual, df_forecast, models, store_ids=None, history_days=60):
    """
    Plots actual vs forecasted revenue for selected stores.

    Parameters:
    - df_actual: DataFrame with actuals (`ds`, `y`, `unique_id`)
    - df_forecast: DataFrame with forecasts (`ds`, `unique_id`, and one column per entry of `models`)
    - models: names of the forecast columns in `df_forecast` to draw
    - store_ids: list of store names to plot (defaults to the first 3 stores found in `df_forecast`)
    - history_days: number of most recent actual rows per store to show
    """
    if store_ids is None:
        store_ids = df_forecast['unique_id'].unique()[:3]  # default to first 3 stores

    if len(store_ids) == 0:
        # plt.subplots rejects a zero-row grid, so there is nothing to draw.
        return

    # squeeze=False always yields a 2-D array of axes, so one store needs no special case.
    fig, axs = plt.subplots(len(store_ids), 1, figsize=(12, 4 * len(store_ids)), sharex=True, squeeze=False)

    for ax, store_id in zip(axs[:, 0], store_ids):
        actual = df_actual[df_actual['unique_id'] == store_id].sort_values('ds')
        forecast = df_forecast[df_forecast['unique_id'] == store_id].sort_values('ds')

        # Only the tail of the history is drawn so the forecast window stays readable.
        ax.plot(actual['ds'].iloc[-history_days:], actual['y'].iloc[-history_days:], label='Actual', linewidth=2)

        for model in models:
            ax.plot(forecast['ds'], forecast[model], label=model, linestyle='--', marker='o')

        ax.set_title(f'Forecast vs Actual for {store_id}')
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.show()


def split_train_test(df, h):
    """Split each series into a training part and a hold-out of its last `h` days.

    For every `unique_id` the cutoff is that series' latest `ds` minus `h` days; rows
    with `ds` up to and including the cutoff form the training frame, later rows form
    the test frame. Returns `(train_df, test_df)`.
    """
    cutoffs = (
        df.groupby('unique_id')['ds'].max()
        .reset_index()
        .assign(cutoff=lambda latest: latest['ds'] - pd.Timedelta(days=h))
        [['unique_id', 'cutoff']]
    )

    # Attach each series' cutoff to its rows, then split on either side of it.
    with_cutoff = df.merge(cutoffs, on='unique_id')
    is_train = with_cutoff['ds'] <= with_cutoff['cutoff']
    is_test = with_cutoff['ds'] > with_cutoff['cutoff']

    train_df = with_cutoff[is_train].drop(columns='cutoff')
    test_df = with_cutoff[is_test].drop(columns='cutoff')
    return train_df, test_df


def calc_rmse(df_actual, df_forecast):
    """Print and return the RMSE of every forecast column against the actual `y`.

    `df_forecast` is left-joined onto `df_actual` on (`unique_id`, `ds`); every
    column of `df_forecast` other than those two keys is treated as one model.
    Returns a dict mapping model name to its RMSE.
    """
    joined = df_actual.merge(df_forecast, on=['unique_id', 'ds'], how='left')
    model_columns = df_forecast.drop(columns=['unique_id', 'ds']).columns

    scores = {}
    for name in model_columns:
        score = np.sqrt(mean_squared_error(joined['y'], joined[name]))
        scores[name] = score
        print(f"{name}: {score:.2f}")

    return scores

In [14]:
h = 92  # number of days to predict
# Training data in long format (unique_id / ds / y, timestep, one-hot event columns), built by preprocess().
df = preprocess(train, calendar)

In [15]:
def predict_recursive(nf: "NeuralForecast", future: pd.DataFrame, train_df: pd.DataFrame, train_vars: list):
    """Forecast the whole `future` window by calling `nf.predict` in chunks of `nf.h` steps.

    After each chunk the mean of the model outputs is appended to the history as a
    stand-in for `y` (together with the exogenous columns found in `future`), so the
    next call continues from where the previous one stopped.

    Parameters:
    - nf: fitted forecaster exposing `h`, `predict(df)` and `_get_model_names()`
    - future: rows to forecast; must contain `unique_id`, `ds`, `timestep` and every
      column of `train_vars` except `y`
    - train_df: history the first prediction starts from
    - train_vars: columns kept in the history that is passed to `nf.predict`

    Returns a DataFrame with `unique_id`, `ds`, one column per model and `y_hat`
    (row-wise mean of the model columns). Because whole chunks are predicted, the
    last chunk can extend past the final date in `future`.

    Raises ValueError when `future` is empty.
    """
    if future.empty:
        raise ValueError("`future` has no rows to forecast")

    future = future.copy()
    history = train_df.copy()
    h = nf.h

    # Ensure y exists in future
    future['y'] = np.nan

    # Useful vars for slicing and joining
    vars_ext = train_vars + ['timestep']
    start = int(future['timestep'].min())
    end = int(future['timestep'].max())

    preds = []
    model_names = nf._get_model_names()

    # `end` is the last timestep to cover, so the range must include it; without the
    # +1 the final chunk is skipped whenever the window is one step longer than a
    # multiple of h, and a single-step window produces no prediction at all.
    for _ in range(start, end + 1, h):
        # 1. Predict the next h steps
        pred = nf.predict(history)

        # 2. Keep only model outputs + IDs, on a clean positional index
        if 'unique_id' not in pred.columns:
            # presumably the id came back as the index — verify against the installed version
            pred = pred.reset_index()
        pred = pred[['unique_id', 'ds'] + model_names].reset_index(drop=True)

        # 3. Save current predictions
        preds.append(pred)

        # 4. Merge with future to get exogenous info
        merged = pred.merge(future, on=['unique_id', 'ds'], how='left')

        # 5. Add mean prediction as fallback y. Computed on `merged` itself so the
        #    values cannot be misaligned by a differing index on `pred`.
        merged['y'] = merged[model_names].mean(axis=1)

        # 6-7. Keep only the needed columns and append them to the history
        history = pd.concat([history, merged[vars_ext]], ignore_index=True)[train_vars]

    # Combine all step-wise predictions
    full_preds = pd.concat(preds, ignore_index=True)

    # Add averaged prediction column (optional)
    full_preds['y_hat'] = full_preds[model_names].mean(axis=1)

    return full_preds

# Subsets

In [77]:
# Define exogenous features (event columns)
exog_vars = [col for col in df.columns if col.startswith('event_')]
# Columns supplied at prediction time: series id, date and the event dummies.
test_vars = ['unique_id', 'ds'] + exog_vars
# Columns used for training: the same, plus the target.
train_vars = ['y'] + test_vars

# store_id 0 is kept apart from the individual stores; later cells rebuild its
# forecast as the per-day sum of the individual-store forecasts.
train_store_0 = df[df['store_id'] == 0].copy()
train_others = df[df['store_id'] != 0].copy()
train_others = train_others.sort_values(by=['store_id', 'ds'])

In [78]:
def preprocess_submission(df, calendar, train_df):
    """Turn the submission template into the same long format as the training data.

    Splits `id` on '_' into `store_id` and `ds`, attaches the store name through the
    module-level `store_id_map`, runs `preprocess` with timesteps continuing right
    after the largest `timestep` in `train_df`, and returns the rows ordered by
    (`store_id`, `ds`).
    """
    frame = df.copy()

    # `id` is split into exactly two pieces: the store id and the date
    frame[['store_id', 'ds']] = frame['id'].str.split('_', expand=True)
    frame = frame.assign(
        store_id=frame['store_id'].astype(int),
        ds=pd.to_datetime(frame['ds']),
    )

    with_names = frame.merge(store_id_map, on='store_id', how='left')

    next_timestep = max(train_df['timestep']) + 1
    processed = preprocess(with_names, calendar, next_timestep)

    ordered = processed.sort_values(by=['store_id', 'ds'])
    return ordered.copy()

# Lookup between the numeric store_id and the store name (unique_id);
# read as a global inside preprocess_submission().
store_id_map = df[['store_id', 'unique_id']].drop_duplicates()

# Submission rows in the same long format as `df`, with timesteps continuing after the training range.
submission_df = preprocess_submission(submission, calendar, df)

# Train

In [20]:
# === Train Models on Full Dataset for Final Submission ===

#-----------MLF-------------------

print("\n=== TRAINING ON FULL DATA FOR SUBMISSION ===")

# Define and configure the MLForecast wrapper with four gradient-boosting regressors
# (CatBoost, LightGBM, XGBoost, HistGradientBoosting); each becomes one forecast column.
mlf_submission = MLForecast(
    models=[
        CatBoostRegressor(verbose=0),  # verbose=0 silences CatBoost's per-iteration log
        lgb.LGBMRegressor(),
        xgb.XGBRegressor(),
        HistGradientBoostingRegressor()
    ],
    freq='D',
    lags=[1, 7, 14, 28],  # previous day plus 1, 2 and 4 weeks back (NOTE(review): the submission horizon is h = 92 days, not 28)
    date_features=['dayofweek', 'month', 'is_month_end'],
    num_threads=4  # Parallel processing
)


# Train MLF on the entire training set (excluding store 0).
# static_features=[] declares no column as static, so the event dummies are treated as time-varying.
mlf_submission.fit(df=train_others[train_vars], static_features=[])

print("✅ MLForecast Models successfully trained on full dataset for submission.")


=== TRAINING ON FULL DATA FOR SUBMISSION ===
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001711 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1104
[LightGBM] [Info] Number of data points in the train set: 16780, number of used features: 38
[LightGBM] [Info] Start training from score 23840.687732


MLForecast(models=[CatBoostRegressor, LGBMRegressor, XGBRegressor, HistGradientBoostingRegressor], freq=D, lag_features=['lag1', 'lag7', 'lag14', 'lag28'], date_features=['dayofweek', 'month', 'is_month_end'], num_threads=4)

In [18]:
# Define and configure the NeuralForecast model (NHITS)
#-----------NF-------------------

nf_submission = NeuralForecast(
    models=[
        NHITS(
            input_size=4 * h,  # look-back window: four times the forecast horizon
            h=h,  # forecast horizon in days (same h as the submission window)
            max_steps=1000,
            scaler_type='standard',
            val_check_steps=100
        )
    ],
    freq='D'
)

# Train NHITS model on the same full training data (individual stores only, store 0 excluded)
nf_submission.fit(df=train_others[train_vars])

print("✅ NeuralForecast Model successfully trained on full dataset for submission.")


=== TRAINING ON FULL DATA FOR SUBMISSION ===


Seed set to 1
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type          | Params | Mode 
-------------------------------------------------------
0 | loss         | MAE           | 0      | train
1 | padder_train | ConstantPad1d | 0      | train
2 | scaler       | TemporalNorm  | 0      | train
3 | blocks       | ModuleList    | 3.4 M  | train
-------------------------------------------------------
3.4 M     Trainable params
0         Non-trainable params
3.4 M     Total params
13.565    Total estimated model params size (MB)
34        Modules in train mode
0         Modules in eval mode


Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

Training: |                                               | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1000` reached.


✅ Models successfully trained on full dataset for submission.


# Predict Submission

#### MLF

In [80]:
# Split submission into store 0 and other stores
store0_df = submission_df[submission_df["store_id"] == 0].copy()
other_stores_df = submission_df[submission_df["store_id"] != 0].copy()

# Predict for all individual stores using MLF (store 0 is later computed as their sum).
# The horizon is the number of distinct submission dates; X_df supplies the future event dummies.
mlf_forecast_all = mlf_submission.predict(
    h=submission_df["ds"].nunique(),
    X_df=other_stores_df[test_vars]
)

# Attach store_id to the forecasts by joining back to the submission rows on (unique_id, ds)
mlf_forecast_all = mlf_forecast_all.merge(
    other_stores_df[["unique_id", "store_id", "ds"]],
    on=["unique_id", "ds"],
    how="left"
)

In [81]:
# Calculate synthetic store 0 forecast by summing predictions across stores per day.
# Only the CatBoost column is used for the aggregate.
store0_pred = mlf_forecast_all[mlf_forecast_all["store_id"] != 0] \
    .groupby("ds")["CatBoostRegressor"].sum().reset_index()

# Label the aggregate like the store-0 rows of the training data
store0_pred["store_id"] = 0
store0_pred["unique_id"] = "All Stores"

In [107]:
# Merge with original store0 rows to recover the submission `id` for each date.
# NOTE(review): this cell overwrites store0_pred, so it is not meant to be re-run on its own output.
store0_pred = store0_df[["ds", "id"]].merge(store0_pred, on="ds", how="left")
# store0_pred = store0_pred.rename(columns={"CatBoostRegressor": "prediction"})


#### NF

In [44]:
# NHITS forecasts for the individual stores over the submission window, produced in chunks of nf_submission.h days.
nf_forecast = predict_recursive(nf_submission, other_stores_df, train_others, train_vars)
# nf_forecast = nf_forecast.rename(columns={"NHITS": "prediction"})

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |                                             | 0/? [00:00<?, ?it/s]

In [88]:
# Merge everything into a final submission DataFrame - only NN

# Attach the submission `id` and store_id to the averaged neural forecast (y_hat),
# then order the rows by store and date.
other_preds = nf_forecast[["unique_id", "ds", "y_hat"]].merge(
    other_stores_df[["unique_id", "ds", "id",'store_id']],
    on=["unique_id", "ds"],
    how="left").sort_values(by=['store_id', 'ds']).copy()

#### Final original method

In [100]:
# Bring both sources onto a common `prediction` column:
# store 0 comes from the summed CatBoost forecasts, the individual stores from the neural forecast.
store0_pred = store0_pred.rename(columns={"CatBoostRegressor": "prediction"})
other_preds = other_preds.rename(columns={"y_hat": "prediction"})

# Stack store 0 on top of the individual stores
final_submission = pd.concat([
    store0_pred[["id", "prediction","store_id"]],
    other_preds[["id", "prediction","store_id"]]])

final_submission

Unnamed: 0,id,prediction,store_id
0,0_20151001,258210.690076,0
1,0_20151002,291962.499625,0
2,0_20151003,356656.748823,0
3,0_20151004,355721.087491,0
4,0_20151005,294549.695530,0
...,...,...,...
915,10_20151227,19863.480469,10
916,10_20151228,15937.326172,10
917,10_20151229,16156.546875,10
918,10_20151230,17472.130859,10


In [101]:
# Extract date from 'id' (format is like "0_20151225")
final_submission['ds'] = pd.to_datetime(final_submission['id'].str.split('_').str[1], format='%Y%m%d')

# Set predictions on Christmas Day to 0
# (preprocess() treats zero revenue in the training data as a gap and interpolates it)
christmas_mask = (final_submission['ds'].dt.month == 12) & (final_submission['ds'].dt.day == 25)
final_submission.loc[christmas_mask, 'prediction'] = 0
print(f"Set {christmas_mask.sum()} Christmas predictions to 0")

# Ensure no negative predictions
final_submission['prediction'] = final_submission['prediction'].clip(lower=0)

Set 11 Christmas predictions to 0


In [102]:
# Remove store_id and ds columns
# NOTE(review): later cells read final_submission["store_id"] again, so it has to be
# re-derived from `id` after this drop.
final_submission = final_submission.drop(['store_id', 'ds'], axis=1)
final_submission["prediction"] = final_submission["prediction"].round(2)

# Report stats
print(f"Final submission shape: {final_submission.shape}")
print(f"Prediction statistics:")
print(f"  Mean: {final_submission['prediction'].mean():.2f}")
print(f"  Min: {final_submission['prediction'].min():.2f}")
print(f"  Max: {final_submission['prediction'].max():.2f}")
print(f"  NaN values: {final_submission['prediction'].isna().sum()}")

# Save submission file (written to the current working directory)
final_submission.to_csv('final_submission.csv', index=False)
print(f"\nSubmission saved to 'final_submission.csv'")

# Show sample of submission
print(f"\nSample submission:")
print(final_submission.head(10))

Final submission shape: (1012, 2)
Prediction statistics:
  Mean: 52818.82
  Min: 0.00
  Max: 376963.85
  NaN values: 0

Submission saved to 'final_submission.csv'

Sample submission:
           id  prediction
0  0_20151001   258210.69
1  0_20151002   291962.50
2  0_20151003   356656.75
3  0_20151004   355721.09
4  0_20151005   294549.70
5  0_20151006   264152.36
6  0_20151007   259515.29
7  0_20151008   260843.89
8  0_20151009   293629.92
9  0_20151010   365125.86


In [103]:
from IPython.display import FileLink

# This creates a clickable link in the notebook to the CSV written by the previous cell
FileLink("final_submission.csv")

In [106]:
# Inspect the neural forecasts.
# NOTE(review): the store_id column in the saved output is only added by the ensemble cell
# further down, so this output came from out-of-order execution.
nf_forecast

Unnamed: 0,unique_id,ds,NHITS,y_hat,store_id
276,California – Sunset Plaza,2015-10-01,30604.535156,30604.535156,1
277,California – Sunset Plaza,2015-10-02,36917.597656,36917.597656,1
278,California – Sunset Plaza,2015-10-03,50028.234375,50028.234375,1
279,California – Sunset Plaza,2015-10-04,52371.781250,52371.781250,1
280,California – Sunset Plaza,2015-10-05,38112.125000,38112.125000,1
...,...,...,...,...,...
731,Wisconsin – Badger Crossing,2015-12-27,19863.480469,19863.480469,10
732,Wisconsin – Badger Crossing,2015-12-28,15937.326172,15937.326172,10
733,Wisconsin – Badger Crossing,2015-12-29,16156.546875,16156.546875,10
734,Wisconsin – Badger Crossing,2015-12-30,17472.130859,17472.130859,10


# Post Processing - Bias trick

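We rescale the submission predictions with a per-month bias ratio: for each of October, November and December, the ratio is the average revenue in the training data for that month divided by the average predicted revenue for the same month. Each individual-store prediction is multiplied by the ratio of its month.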

In [None]:
# Monthly revenue averages from the training data (individual stores only)

# NOTE(review): this adds a `month` column to train_others in place.
train_others["month"] = train_others["ds"].dt.month
seasonal_train = train_others[train_others["month"].isin([10, 11, 12])]

# Mean revenue per calendar month (10, 11, 12), pooled over all training rows of those months
train_monthly_avg = seasonal_train.groupby("month")["y"].mean()
train_monthly_avg

In [None]:
# Recover the date and the store id from `id` (format "<store_id>_<YYYYMMDD>").
# Both helper columns were dropped before the first CSV was written, so they are
# rebuilt here; reading final_submission["store_id"] without this raises a KeyError.
id_parts = final_submission["id"].str.split("_", expand=True)
final_submission["store_id"] = id_parts[0].astype(int)
final_submission["ds"] = pd.to_datetime(id_parts[1], format="%Y%m%d")

# Keep only individual stores (exclude store 0)
individual_store_preds = final_submission[final_submission["store_id"] != 0].copy()

# Add month column
individual_store_preds["month"] = individual_store_preds["ds"].dt.month

# Filter to October, November, December
seasonal_pred = individual_store_preds[individual_store_preds["month"].isin([10, 11, 12])]

# Compute average predictions per month
# NOTE(review): Dec 25 predictions were set to 0 earlier, whereas preprocess() interpolates
# zero revenue in the training data, so the December averages are not strictly like-for-like.
pred_monthly_avg = seasonal_pred.groupby("month")["prediction"].mean()

In [None]:
# Per-month correction factor: training average divided by predicted average
# (both Series are indexed by month, so the division aligns month by month).
bias_ratio = train_monthly_avg / pred_monthly_avg
print(bias_ratio)

In [None]:
# Map month to correction factor
correction_map = bias_ratio.to_dict()

# Derive the helper columns from `id` ("<store_id>_<YYYYMMDD>") so this cell does not
# depend on whether an earlier cell left them in place.
id_parts = final_submission["id"].str.split("_", expand=True)
final_submission["store_id"] = id_parts[0].astype(int)
final_submission["ds"] = pd.to_datetime(id_parts[1], format="%Y%m%d")
final_submission["month"] = final_submission["ds"].dt.month

# Apply correction only to store_id != 0; months without a ratio keep a factor of 1.0.
# Vectorized equivalent of applying the factor row by row.
month_factor = final_submission["month"].map(lambda month: correction_map.get(month, 1.0))
is_store_0 = final_submission["store_id"] == 0
final_submission["prediction_corrected"] = final_submission["prediction"].where(
    is_store_0, final_submission["prediction"] * month_factor
)


In [None]:
# Make sure the corrected predictions exist
assert "prediction_corrected" in final_submission.columns, "Missing corrected predictions!"

# Convert id into components ("<store_id>_<YYYYMMDD>") so the cell does not depend on
# helper columns left behind by earlier cells
id_parts = final_submission["id"].str.split("_", expand=True)
final_submission["store_id"] = id_parts[0].astype(int)
final_submission["ds"] = pd.to_datetime(id_parts[1], format="%Y%m%d")

# Filter only real stores (excluding store 0)
real_store_preds = final_submission[final_submission["store_id"] != 0].copy()

# Rebuild store 0 as the per-day sum of the corrected individual-store predictions
store0_updated = real_store_preds.groupby("ds")["prediction_corrected"].sum().reset_index()
store0_updated["store_id"] = 0
store0_updated["id"] = store0_updated["store_id"].astype(str) + "_" + store0_updated["ds"].dt.strftime("%Y%m%d")
# The sort used to sit on a dangling line of its own (a syntax error); it belongs to the frame.
store0_updated = store0_updated.sort_values(by=['store_id', 'ds'])

# Drop old store 0 entries
final_submission = final_submission[final_submission["store_id"] != 0].copy()

# Create the final, merged submission.
# NOTE: the cell consumes `prediction_corrected`, so it cannot be re-run without
# re-running the correction cell first.
store0_updated = store0_updated.rename(columns={"prediction_corrected": "prediction"})
final_submission = pd.concat([
    final_submission[["id", "prediction_corrected"]].rename(columns={"prediction_corrected": "prediction"}),
    store0_updated[["id", "prediction"]]
], axis=0).sort_values("id").reset_index(drop=True)

In [None]:
# Re-derive store_id and ds from `id` ("<store_id>_<YYYYMMDD>") and order the rows numerically
# by store and then by date (the previous sort on the string `id` is lexicographic).
final_submission["store_id"] = final_submission["id"].str.split("_").str[0].astype(int)
final_submission["ds"] = pd.to_datetime(final_submission["id"].str.split("_").str[1])
final_submission = final_submission.sort_values(by=['store_id', 'ds']).copy()
final_submission

In [None]:
import pandas as pd


# Extract date from 'id' (format is like "0_20151225")
# final_submission['ds'] = pd.to_datetime(final_submission['id'].str.split('_').str[1], format='%Y%m%d')

# Set predictions on Christmas Day to 0
# (relies on the `ds` column created in the previous cell)
christmas_mask = (final_submission['ds'].dt.month == 12) & (final_submission['ds'].dt.day == 25)
final_submission.loc[christmas_mask, 'prediction'] = 0

print(f"Set {christmas_mask.sum()} Christmas predictions to 0")

# Ensure no negative predictions
final_submission['prediction'] = final_submission['prediction'].clip(lower=0)
final_submission["prediction"] = final_submission["prediction"].round(2).copy()

# Drop the temporary 'ds' and 'store_id' columns so the CSV only holds id and prediction
final_submission = final_submission.drop(columns=['ds','store_id'])

# Report stats
print(f"Final submission shape: {final_submission.shape}")
print(f"Prediction statistics:")
print(f"  Mean: {final_submission['prediction'].mean():.2f}")
print(f"  Min: {final_submission['prediction'].min():.2f}")
print(f"  Max: {final_submission['prediction'].max():.2f}")
print(f"  NaN values: {final_submission['prediction'].isna().sum()}")

# Save submission file (overwrites the file written by the uncorrected run)
final_submission.to_csv('final_submission.csv', index=False)
print(f"\nSubmission saved to 'final_submission.csv'")

# Show sample of submission
print(f"\nSample submission:")
print(final_submission.tail(10))

In [None]:
from IPython.display import FileLink

# Save submission file
# NOTE(review): the previous cell already writes the same file; this repeats the write.
final_submission.to_csv('final_submission.csv', index=False)
print(f"\nSubmission saved to 'final_submission.csv'")

# This creates a clickable link in the notebook
FileLink("final_submission.csv")

# Ensemble model

In [None]:
# Create mapping dictionary: store name (unique_id) -> numeric store_id
# NOTE(review): hard-coded copy of the relation also held in store_id_map — confirm they agree.
store_mapping = {
   'California – Sunset Plaza': 1,
   'California – Ocean View': 2,
   'California – Golden Hills': 3,
   'California – Redwood Center': 4,
   'Texas – Lone Star Mall': 5,
   'Texas – Riverwalk Market': 6,
   'Texas – Alamo Heights': 7,
   'Wisconsin – Maple Grove': 8,
   'Wisconsin – Lakeview Plaza': 9,
   'Wisconsin – Badger Crossing': 10
}

# Ensemble method
# Tag both forecast frames with the numeric store_id
mlf_forecast_all['store_id'] = mlf_forecast_all['unique_id'].map(store_mapping)
nf_forecast['store_id'] = nf_forecast['unique_id'].map(store_mapping)

# Order both frames by store and date (the original index labels are kept)
mlf_forecast_all=mlf_forecast_all.sort_values(by=['store_id', 'ds'])
nf_forecast=nf_forecast.sort_values(by=['store_id', 'ds'])


#---------------------------------------

def create_results_df(mlf_forecast, nf_forecast):
    """Add the `NHITS` column of `nf_forecast` to the MLForecast predictions.

    Rows are matched on (`unique_id`, `ds`) when both frames carry those columns,
    so the result does not depend on the two frames sharing index labels. When the
    key columns are missing, the frames are joined on their index instead.

    Returns a new DataFrame with the columns of `mlf_forecast` plus `NHITS`,
    restricted to rows present in both inputs.
    """
    keys = ['unique_id', 'ds']
    has_keys = all(key in mlf_forecast.columns and key in nf_forecast.columns for key in keys)

    if has_keys:
        # Matching on the series id and date is robust to differently ordered or re-indexed frames.
        return mlf_forecast.merge(nf_forecast[keys + ['NHITS']], on=keys)

    # Fallback for frames without the key columns: merge on index
    return mlf_forecast.merge(nf_forecast[['NHITS']], left_index=True, right_index=True)

# Merge all models' predictions into one frame (MLForecast columns plus NHITS)
results = create_results_df(mlf_forecast_all, nf_forecast)



#---------------------------------------


# Create a weighted prediction from the different models' outputs
class StoreSpecificEnsemble:
    """Blend per-model forecasts using a separate weight vector for every store.

    Weights are derived from a hard-coded table of per-store RMSE values.
    Supported strategies:
    - 'inverse_rmse': weights proportional to 1 / RMSE, normalized to sum to 1
    - 'equal': the same weight for every model
    - 'best_only': weight 1 for the model with the lowest RMSE, 0 for the rest
    """

    def __init__(self, strategy='inverse_rmse'):
        """Build `store_weights` (store_id -> one weight per entry of `models`).

        Raises ValueError for an unknown `strategy`; otherwise no weights would be
        built and every store would silently fall back to equal weights.
        """
        if strategy not in ('inverse_rmse', 'equal', 'best_only'):
            raise ValueError(f"Unknown strategy: {strategy!r}")

        self.strategy = strategy
        self.models = ['CatBoostRegressor', 'HistGradientBoostingRegressor',
                       'LGBMRegressor', 'XGBRegressor', 'NHITS']

        # RMSE per store_id, one value per model.
        # NOTE(review): the values are presumably listed in the order of self.models — confirm.
        rmse_data = {
            1: [2474.43, 2389.84, 2398.40, 2495.82, 2738.14],
            2: [2256.06, 2334.11, 2085.16, 2415.22, 3217.97],
            3: [3755.46, 3871.92, 3991.23, 4379.58, 4342.67],
            4: [1929.34, 2007.67, 1783.69, 2111.14, 1606.26],
            5: [2010.15, 2247.20, 2156.13, 1864.35, 2107.06],
            6: [2539.36, 2329.24, 2384.43, 2380.53, 2912.74],
            7: [2249.64, 2059.98, 2171.54, 2119.05, 2585.45],
            8: [2072.15, 1641.65, 1987.03, 1855.08, 2251.23],
            9: [3847.61, 3662.71, 3930.24, 3865.09, 2375.04],
            10: [1775.24, 1600.33, 1723.86, 1881.33, 1899.07]
        }

        self.store_weights = {}
        for store_id, rmse_vals in rmse_data.items():
            if strategy == 'inverse_rmse':
                inverse_rmse = 1 / np.array(rmse_vals)
                self.store_weights[store_id] = inverse_rmse / inverse_rmse.sum()
            elif strategy == 'equal':
                self.store_weights[store_id] = [0.2] * 5
            elif strategy == 'best_only':
                best_idx = np.argmin(rmse_vals)
                weights = [0] * 5
                weights[best_idx] = 1.0
                self.store_weights[store_id] = weights

    def predict(self, predictions_dict, store_ids):
        """Return the weighted prediction for every row.

        Parameters:
        - predictions_dict: model name -> sequence of predictions, one per row
        - store_ids: store id of every row, in the same order as the predictions

        Stores without an entry in `store_weights` use equal weights. Models missing
        from `predictions_dict` are skipped; the remaining weights are not renormalized.
        Returns a NumPy array with one value per entry of `store_ids`.
        """
        ensemble_preds = np.zeros(len(store_ids))

        for i, store_id in enumerate(store_ids):
            weights = self.store_weights.get(store_id, [0.2] * 5)
            pred_sum = sum(weights[j] * predictions_dict[model][i]
                           for j, model in enumerate(self.models)
                           if model in predictions_dict)
            ensemble_preds[i] = pred_sum

        return ensemble_preds

    
    
#---------------------------------------
    
# Create ensemble: for every store, use only the model with the lowest listed RMSE
ensemble = StoreSpecificEnsemble('best_only') 

# Extract predictions from results_df as plain arrays (one entry per row of `results`)
predictions_dict = {
   'CatBoostRegressor': results['CatBoostRegressor'].values,
   'HistGradientBoostingRegressor': results['HistGradientBoostingRegressor'].values,
   'LGBMRegressor': results['LGBMRegressor'].values,
   'XGBRegressor': results['XGBRegressor'].values,
   'NHITS': results['NHITS'].values
}


#---------------------------------------

# Get ensemble predictions (row order of the arrays matches the rows of `results`)
results['prediction'] = ensemble.predict(predictions_dict, results['store_id'].values)

# Group by date and sum HistGradientBoostingRegressor predictions across all stores
# to form the store-0 total (the earlier, non-ensemble submission summed CatBoost instead)
summary_df = results.groupby('ds')['HistGradientBoostingRegressor'].sum().reset_index()
summary_df['store_id'] = 0
summary_df = summary_df.rename(columns={'HistGradientBoostingRegressor': 'prediction'})
summary_df = summary_df[['ds', 'store_id', 'prediction']]

# Replace the previous final_submission with store 0 stacked on the per-store ensemble predictions
final_submission = pd.concat([
    summary_df[["ds", "prediction","store_id"]],
    results[["ds", "prediction","store_id"]]
])



#---------------------------------------

# Order rows by store and date
final_submission = final_submission.sort_values(by=['store_id', 'ds']).copy()

# Set predictions on Christmas Day to 0
christmas_mask = (final_submission['ds'].dt.month == 12) & (final_submission['ds'].dt.day == 25)
final_submission.loc[christmas_mask, 'prediction'] = 0
print(f"Set {christmas_mask.sum()} Christmas predictions to 0")

# Ensure no negative predictions
final_submission['prediction'] = final_submission['prediction'].clip(lower=0)
final_submission["prediction"] = final_submission["prediction"].round(2)

# Create id column in the "<store_id>_<YYYYMMDD>" format
final_submission['id'] = final_submission['store_id'].astype(str) + '_' + pd.to_datetime(final_submission['ds']).dt.strftime('%Y%m%d')

# Remove store_id and ds columns
final_submission = final_submission.drop(['store_id', 'ds'], axis=1)

# Put the columns in the id, prediction order
final_submission = final_submission[['id', 'prediction']]


#---------------------------------------


from IPython.display import FileLink

# Save submission file (overwrites any final_submission.csv written by earlier cells)
final_submission.to_csv('final_submission.csv', index=False)
print(f"\nSubmission saved to 'final_submission.csv'")

# This creates a clickable link in the notebook
FileLink("final_submission.csv")