# Beach Crowd Prediction — 3-Dataset Comparison

Compare model performance across three dataset strategies:
1. **Daytime only** — drop night hours (`hour >= 20` or `hour <= 6`), keeping 07:00–19:59; sklearn models only
2. **Full 24h** — keep all data including noisy night counts
3. **Night = 0** — keep 24h but replace night counts with 0

Sklearn models run on all 3 datasets. NeuralForecast models run on datasets 2 and 3 only (they need continuous hourly series).

In [1]:
# Input cache: load_cache reads the JSON files under CACHE_DIR / COUNTING_MODEL.
CACHE_DIR = "cache/predictions"
COUNTING_MODEL = "bayesian_vgg19"
# Output directory for the result CSVs and figures.
SAVE_DIR = "models/dataset_comparison"

# NeuralForecast training settings, forwarded to every model built by get_nf_models.
MAX_STEPS = 500
EARLY_STOP_PATIENCE = 30
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
INPUT_SIZE = 24
# Series segmentation defaults for split_at_long_gaps: a gap longer than
# MAX_GAP_HOURS starts a new segment; segments with fewer than
# MIN_SEGMENT_HOURS rows are dropped.
MAX_GAP_HOURS = 48
MIN_SEGMENT_HOURS = 72

# Night definition used by add_features: hour >= NIGHT_START or hour <= NIGHT_END.
NIGHT_START = 20
NIGHT_END = 6

In [2]:
# Install the optional third-party dependencies, but only the ones that are
# not importable yet, so "Run All" does not spawn one pip process per package.
import importlib.util
import subprocess, sys
for pkg in ["neuralforecast", "xgboost", "lightgbm", "catboost", "utilsforecast"]:
    # For these five packages the pip distribution name equals the import name.
    if importlib.util.find_spec(pkg) is not None:
        continue
    # --disable-pip-version-check silences the "new release of pip" notice
    # that otherwise repeats once per package in the cell output.
    subprocess.check_call([sys.executable, "-m", "pip", "install",
                           "--disable-pip-version-check", "-q", pkg])
print("Done!")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.13 install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.13 install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.13 install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.13 instal

Done!



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.13 install --upgrade pip[0m


In [3]:
import copy, json, pickle, warnings, time
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')

# Optional model backends. Each HAS_* flag records whether the import worked so
# later cells can skip the corresponding models. `except Exception` (rather
# than a bare `except:`) keeps KeyboardInterrupt/SystemExit propagating, and
# the reason is printed so a broken install is not silently ignored.
try:
    from xgboost import XGBRegressor
    HAS_XGB = True
except Exception as e:
    print(f"XGBoost error: {e}")
    HAS_XGB = False

try:
    from lightgbm import LGBMRegressor
    HAS_LGBM = True
except Exception as e:
    print(f"LightGBM error: {e}")
    HAS_LGBM = False

try:
    from catboost import CatBoostRegressor
    HAS_CATBOOST = True
except Exception as e:
    print(f"CatBoost error: {e}")
    HAS_CATBOOST = False

try:
    from neuralforecast import NeuralForecast
    from neuralforecast.models import (
        NHITS, NBEATSx, TFT, PatchTST, iTransformer,
        TimeMixer, TSMixerx, TCN, TiDE, MLP as NF_MLP, LSTM as NF_LSTM
    )
    from utilsforecast.preprocessing import fill_gaps as uf_fill_gaps
    HAS_NF = True
except Exception as e:
    print(f"NeuralForecast error: {e}")
    HAS_NF = False

# Prefer CUDA, then Apple MPS, then CPU.
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
print(f"Device: {device}, XGB: {HAS_XGB}, LGBM: {HAS_LGBM}, CatBoost: {HAS_CATBOOST}, NF: {HAS_NF}")

Device: mps, XGB: True, LGBM: True, CatBoost: True, NF: True


In [4]:
def beach_metrics(y_true, y_pred, max_cap):
    """Return MAE, RMSE, R2 and capacity-relative MAE for one set of predictions.

    Args:
        y_true: Observed values (any array-like).
        y_pred: Predicted values, same length as ``y_true``.
        max_cap: Normalizer for the relative MAE.

    Returns:
        A dict with keys ``'MAE'``, ``'RMSE'``, ``'R2'`` and ``'RelMAE (%)'``,
        or ``None`` when ``max_cap`` is 0 or there are no samples. ``'R2'`` is
        0 when the observed values have no variance.
    """
    observed = np.array(y_true)
    predicted = np.array(y_pred)
    if max_cap == 0 or len(observed) == 0:
        return None

    residual = observed - predicted
    squared_error = residual ** 2

    mae = np.mean(np.abs(residual))
    rmse = np.sqrt(np.mean(squared_error))

    ss_res = np.sum(squared_error)
    ss_tot = np.sum((observed - np.mean(observed)) ** 2)
    if ss_tot > 0:
        r2 = 1 - (ss_res / ss_tot)
    else:
        # Constant target: R2 is undefined, report 0 instead.
        r2 = 0

    return {
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2,
        'RelMAE (%)': (mae / max_cap) * 100,
    }

def evaluate_per_beach(df, y_pred, beach_col='beach', min_samples=3):
    """Compute ``beach_metrics`` separately for every beach in ``df``.

    Args:
        df: Frame with a ``'count'`` column and the grouping column ``beach_col``.
        y_pred: Predictions aligned positionally with the rows of ``df``
            (any array-like; converted with ``np.asarray``).
        beach_col: Name of the column identifying the beach.
        min_samples: Beaches with fewer rows than this are skipped.

    Returns:
        DataFrame with one row per evaluated beach and the columns
        ``MAE, RMSE, R2, RelMAE (%), beach, max_count, n_samples``. The columns
        are present even when no beach qualifies, so callers can always read
        ``result['RelMAE (%)']``.

    Raises:
        ValueError: If ``y_pred`` and ``df`` have different lengths.
    """
    columns = ['MAE', 'RMSE', 'R2', 'RelMAE (%)', 'beach', 'max_count', 'n_samples']

    # Boolean-mask indexing below needs an ndarray; a plain list would fail.
    y_pred = np.asarray(y_pred)
    if len(y_pred) != len(df):
        raise ValueError(
            f"y_pred has {len(y_pred)} values but df has {len(df)} rows"
        )

    # RelMAE is normalized by the largest observed count of each beach in `df`.
    beach_max = df.groupby(beach_col)['count'].max().to_dict()
    results = []
    for b in df[beach_col].unique():
        mask = df[beach_col] == b
        if mask.sum() < min_samples:
            continue
        m = beach_metrics(df.loc[mask, 'count'].values, y_pred[mask.values], beach_max.get(b, 1))
        if m:
            m['beach'] = b
            m['max_count'] = beach_max.get(b, 0)
            m['n_samples'] = mask.sum()
            results.append(m)
    return pd.DataFrame(results, columns=columns)

print("Metrics defined")

Metrics defined


In [5]:
def load_cache(cache_dir, model=None):
    """Load cached prediction records from JSON files into a DataFrame.

    Every ``*.json`` file below ``cache_dir`` (or ``cache_dir / model`` when
    ``model`` is given) is read. Files that cannot be read or parsed, records
    that are not JSON objects, and records containing an ``'error'`` key are
    skipped.

    Args:
        cache_dir: Root directory of the cache.
        model: Optional sub-directory name to restrict the search to.

    Returns:
        DataFrame sorted by ``datetime`` with the columns ``filename``,
        ``beach`` (falls back to the record's ``name``), ``beach_folder``,
        ``datetime``, ``count`` plus one column per key of the record's
        ``weather`` dict. When nothing could be loaded, an empty frame with
        the five base columns is returned.
    """
    cache_path = Path(cache_dir)
    if model:
        cache_path = cache_path / model

    # Sorted so that row order does not depend on filesystem enumeration order
    # (the frame is later split positionally into train/val/test).
    json_files = sorted(cache_path.rglob("*.json"))
    print(f"Found {len(json_files)} JSON files")

    records = []
    n_unreadable = 0
    for jf in json_files:
        try:
            with open(jf, 'r') as f:
                r = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            n_unreadable += 1
            continue
        if isinstance(r, dict) and 'error' not in r:
            records.append(r)
    print(f"Loaded {len(records)} valid records")
    if n_unreadable:
        print(f"Skipped {n_unreadable} unreadable or malformed JSON files")

    base_cols = ['filename', 'beach', 'beach_folder', 'datetime', 'count']
    rows = []
    for r in records:
        row = {'filename': r.get('filename'), 'beach': r.get('beach') or r.get('name'),
               'beach_folder': r.get('beach_folder'), 'datetime': r.get('datetime'), 'count': r.get('count')}
        weather = r.get('weather')
        # "weather" may be absent or null; only a dict contributes columns.
        if isinstance(weather, dict):
            row.update(weather)
        rows.append(row)

    df = pd.DataFrame(rows)
    if df.empty:
        df = pd.DataFrame(columns=base_cols)
    df['datetime'] = pd.to_datetime(df['datetime'])
    # Stable sort keeps the (sorted-path) order among rows sharing a timestamp.
    df = df.sort_values('datetime', kind='stable').reset_index(drop=True)
    return df

# Load every cached record of the selected counting model into one DataFrame.
df_raw = load_cache(CACHE_DIR, model=COUNTING_MODEL)

Found 169241 JSON files
Loaded 169241 valid records


In [6]:
# Cameras to drop, given as folder ids (exact match) and folder-id prefixes.
EXCLUDE_FOLDERS = ['livecampro/001', 'livecampro/011', 'livecampro/018', 'livecampro/021',
    'livecampro/030', 'livecampro/039', 'livecampro/070', 'MultimediaTres/PortAndratx',
    'SeeTheWorld/mallorca_pancam', 'skyline/es-pujols', 'youtube/mCxR-gnn6iA',
    'youtube/TbttHwabtfE', 'youtube/WvZWS3D1tHw', 'youtube/Z9F_jN6xpFs', 'youtube/DsrQa_tZoWw']
EXCLUDE_PREFIXES = ['ibred', 'ClubNauticSoller', 'Guenthoer']

# Column used to identify a beach in every later cell.
beach_col = 'beach' if 'beach' in df_raw.columns else 'beach_folder'

# The exclusion lists look like folder ids, while `beach_col` is normally the
# 'beach' column, so matching on `beach_col` alone removed no rows
# (169241 -> 169241). Match against every identifier column that exists.
id_cols = [c for c in dict.fromkeys([beach_col, 'beach_folder']) if c in df_raw.columns]

before = len(df_raw)
excluded = np.zeros(len(df_raw), dtype=bool)
for col in id_cols:
    # astype(str) turns missing ids into 'None'/'nan', which match nothing here.
    ids = df_raw[col].astype(str)
    excluded |= ids.isin(EXCLUDE_FOLDERS).to_numpy()
    for prefix in EXCLUDE_PREFIXES:
        excluded |= ids.str.startswith(prefix).to_numpy()
df_raw = df_raw[~excluded].copy()
print(f"Filtered cameras: {before} -> {len(df_raw)}")
if before > 0 and len(df_raw) == before:
    print("WARNING: the camera exclusion lists matched no rows — check the identifier format")

Filtered cameras: 169241 -> 169241


In [7]:
def add_features(df):
    """Return a copy of ``df`` with calendar, night-flag and interaction columns.

    Added columns: ``hour``, ``day_of_week``, ``month``, ``is_weekend``,
    ``is_summer`` (months 6-8), ``is_night`` (hour >= NIGHT_START or
    hour <= NIGHT_END), ``weekend_x_summer``, ``hour_x_summer`` and, when
    ``om_temperature_2m`` is present, ``temp_x_summer`` / ``temp_x_winter``
    (winter = months 12, 1, 2). The input frame is not modified.
    """
    out = df.copy()

    stamp = out['datetime'].dt
    out['hour'] = stamp.hour
    out['day_of_week'] = stamp.dayofweek
    out['month'] = stamp.month

    out['is_weekend'] = (out['day_of_week'] >= 5).astype(int)
    out['is_summer'] = out['month'].isin([6, 7, 8]).astype(int)

    # The night window wraps around midnight, hence the OR of two conditions.
    after_dusk = out['hour'] >= NIGHT_START
    before_dawn = out['hour'] <= NIGHT_END
    out['is_night'] = (after_dusk | before_dawn).astype(int)

    if 'om_temperature_2m' in out.columns:
        temperature = out['om_temperature_2m']
        is_winter = (out['month'].isin([12, 1, 2])).astype(int)
        out['temp_x_summer'] = temperature * out['is_summer']
        out['temp_x_winter'] = temperature * is_winter

    out['weekend_x_summer'] = out['is_weekend'] * out['is_summer']
    out['hour_x_summer'] = out['hour'] * out['is_summer']
    return out

# Add the calendar/interaction columns to the filtered raw records.
df_all = add_features(df_raw)

# Weather columns are recognized by their 'ae_' / 'om_' name prefixes.
WEATHER_FEATURES = [c for c in df_all.columns if c.startswith('ae_') or c.startswith('om_')]
TEMPORAL_FEATURES = ['hour', 'day_of_week', 'month', 'is_weekend', 'is_summer', 'is_night',
                     'temp_x_summer', 'temp_x_winter', 'weekend_x_summer', 'hour_x_summer']
# Keep only the temporal features add_features actually produced
# (the temp_x_* columns exist only when 'om_temperature_2m' is present).
TEMPORAL_FEATURES = [f for f in TEMPORAL_FEATURES if f in df_all.columns]
ALL_FEATURES = [f for f in WEATHER_FEATURES + TEMPORAL_FEATURES if f in df_all.columns]

# Drop rows with a missing feature or target, then keep only beaches whose
# maximum observed count exceeds 20.
df_all = df_all.dropna(subset=ALL_FEATURES + ['count']).copy()
beach_max_global = df_all.groupby(beach_col)['count'].max().to_dict()
good_beaches = [b for b, m in beach_max_global.items() if m > 20]
df_all = df_all[df_all[beach_col].isin(good_beaches)].reset_index(drop=True)

print(f"Features: {len(ALL_FEATURES)} | Beaches: {len(good_beaches)} | Samples: {len(df_all)}")
print(f"Date range: {df_all['datetime'].min()} to {df_all['datetime'].max()}")

Features: 39 | Beaches: 38 | Samples: 157384
Date range: 2022-07-14 11:00:00 to 2023-01-24 23:00:00


## Create 3 Datasets

1. **Daytime** — keep only rows with `is_night == 0` (hours 07–19)
2. **Full 24h** — keep everything (noisy night counts)
3. **Night = 0** — keep 24h, set count=0 when `is_night == 1` (`hour >= 20` or `hour <= 6`, i.e. 20:00–06:59)

In [8]:
# Dataset 1: daytime rows only (is_night == 0), re-indexed from 0.
night_rows = df_all['is_night'].astype(bool)
ds_daytime = df_all.loc[~night_rows].copy().reset_index(drop=True)

# Dataset 2: everything, unchanged.
ds_full24h = df_all.copy()

# Dataset 3: everything, but the target is forced to 0 during night hours.
ds_night0 = df_all.copy()
ds_night0.loc[ds_night0['is_night'] == 1, 'count'] = 0.0

datasets = dict(zip(
    ['Daytime', 'Full 24h', 'Night=0'],
    [ds_daytime, ds_full24h, ds_night0],
))

for name, ds in datasets.items():
    if 'is_night' in ds.columns:
        night_pct = ds['is_night'].mean() * 100
    else:
        night_pct = 0
    print(f"{name:12s}: {len(ds):6d} samples | night%: {night_pct:.1f}% | mean count: {ds['count'].mean():.1f}")

Daytime     :  87492 samples | night%: 0.0% | mean count: 46.2
Full 24h    : 157384 samples | night%: 44.4% | mean count: 45.5
Night=0     : 157384 samples | night%: 44.4% | mean count: 25.7


In [9]:
def split_data(df, features, train_frac=0.7, val_frac=0.15):
    """Split ``df`` positionally into train / validation / test parts.

    Rows are taken in their existing order: the first ``train_frac`` of the
    rows form the training part, the next ``val_frac`` the validation part and
    the remainder the test part. No shuffling is done.

    Returns:
        Dict with keys ``X_*`` (feature columns), ``y_*`` (the ``'count'``
        column) and ``df_*`` (full rows) for ``train``, ``val`` and ``test``.
    """
    n_rows = len(df)
    train_end = int(n_rows * train_frac)
    val_end = int(n_rows * (train_frac + val_frac))

    windows = {
        'train': slice(None, train_end),
        'val': slice(train_end, val_end),
        'test': slice(val_end, None),
    }
    views = (('X', df[features]), ('y', df['count']), ('df', df))

    parts = {}
    for prefix, frame in views:
        for part_name, window in windows.items():
            parts[f'{prefix}_{part_name}'] = frame.iloc[window]
    return parts

# One positional train/val/test split per dataset strategy.
splits = {name: split_data(ds, ALL_FEATURES) for name, ds in datasets.items()}

for name, s in splits.items():
    print(f"{name:12s}: train={len(s['X_train'])}, val={len(s['X_val'])}, test={len(s['X_test'])}")

Daytime     : train=61244, val=13124, test=13124
Full 24h    : train=110168, val=23608, test=23608
Night=0     : train=110168, val=23608, test=23608


## Sklearn Models — All 3 Datasets

In [10]:
def get_sklearn_models():
    """Build a fresh, unfitted set of tabular regressors keyed by display name.

    The three scikit-learn models are always included; XGBoost, LightGBM and
    CatBoost are added only when their HAS_* flag is set.
    """
    models = {}
    models['Lasso'] = Lasso(alpha=0.1)
    models['RandomForest'] = RandomForestRegressor(
        n_estimators=200, max_depth=15, random_state=42, n_jobs=-1)
    models['GradientBoosting'] = GradientBoostingRegressor(
        n_estimators=200, max_depth=8, random_state=42)

    # Factories are lambdas so a missing backend's class name is never evaluated.
    optional = (
        (HAS_XGB, 'XGBoost', lambda: XGBRegressor(
            n_estimators=300, max_depth=8, learning_rate=0.1, random_state=42, n_jobs=-1, verbosity=0)),
        (HAS_LGBM, 'LightGBM', lambda: LGBMRegressor(
            n_estimators=300, max_depth=8, learning_rate=0.1, random_state=42, n_jobs=-1, verbose=-1)),
        (HAS_CATBOOST, 'CatBoost', lambda: CatBoostRegressor(
            n_estimators=300, max_depth=8, learning_rate=0.1, random_state=42, verbose=0)),
    )
    for available, label, build in optional:
        if available:
            models[label] = build()
    return models

print(f"Sklearn models: {list(get_sklearn_models().keys())}")

Sklearn models: ['Lasso', 'RandomForest', 'GradientBoosting', 'XGBoost', 'LightGBM', 'CatBoost']


In [None]:
# Accumulators shared with the NeuralForecast cell and the results cells:
# one summary dict per (model, dataset) and one per-beach frame per run.
all_results = []
all_beach_results = []

for ds_name, ds in datasets.items():
    s = splits[ds_name]
    # No hyper-parameter tuning happens here, so the validation part is
    # folded into the training data.
    X_trainval = pd.concat([s['X_train'], s['X_val']])
    y_trainval = pd.concat([s['y_train'], s['y_val']])

    print(f"\n{'='*70}")
    print(f"SKLEARN — {ds_name} ({len(s['X_test'])} test samples)")
    print(f"{'='*70}")

    for model_name, model in get_sklearn_models().items():
        start = time.time()
        model.fit(X_trainval, y_trainval)
        train_time = time.time() - start

        # Predictions are clipped at 0 (negative values are replaced by 0).
        y_pred = np.clip(model.predict(s['X_test']), 0, None)
        y_true = s['y_test'].values

        # Pooled metrics over all test rows.
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)

        # Per-beach metrics; RelMAE is normalized by each beach's max count
        # within the test rows.
        beach_df = evaluate_per_beach(s['df_test'], y_pred, beach_col)
        beach_df['model'] = model_name
        beach_df['dataset'] = ds_name
        beach_df['model_type'] = 'Sklearn'
        all_beach_results.append(beach_df)

        # Unweighted mean over beaches (each beach counts equally).
        avg_rel_mae = beach_df['RelMAE (%)'].mean()

        all_results.append({
            'Model': model_name, 'Dataset': ds_name, 'Type': 'Sklearn',
            'MAE': mae, 'RMSE': rmse, 'R2': r2, 'Avg RelMAE (%)': avg_rel_mae,
            'Time (s)': train_time, 'N_test': len(y_true)
        })
        print(f"  {model_name:20s} | {train_time:5.1f}s | MAE: {mae:7.2f} | RelMAE: {avg_rel_mae:6.2f}% | R2: {r2:.4f}")

print(f"\nSklearn done: {len(all_results)} results")


SKLEARN — Daytime (13124 test samples)
  Lasso                |   0.6s | MAE:   51.33 | RelMAE: 122.38% | R2: -0.1751
  RandomForest         |  20.1s | MAE:   40.16 | RelMAE:  85.46% | R2: 0.0070
  GradientBoosting     | 168.8s | MAE:   60.43 | RelMAE: 153.74% | R2: -0.7950
  XGBoost              |   2.2s | MAE:   45.28 | RelMAE: 101.83% | R2: -0.1385
  LightGBM             |   2.2s | MAE:   33.08 | RelMAE:  61.58% | R2: 0.1207
  CatBoost             |   1.6s | MAE:   32.80 | RelMAE:  61.31% | R2: 0.1318

SKLEARN — Full 24h (23608 test samples)
  Lasso                |   1.2s | MAE:   46.91 | RelMAE: 100.98% | R2: -0.0382
  RandomForest         |  36.8s | MAE:   40.53 | RelMAE:  86.23% | R2: 0.0621
  GradientBoosting     | 305.3s | MAE:   56.38 | RelMAE: 138.48% | R2: -0.4825
  XGBoost              |   1.7s | MAE:   50.82 | RelMAE: 121.92% | R2: -0.2117
  LightGBM             |   2.4s | MAE:   34.08 | RelMAE:  64.72% | R2: 0.1744
  CatBoost             |   2.5s | MAE:   33.34 | RelMAE

KeyboardInterrupt: 

## NeuralForecast Models — Full 24h and Night=0 Only

The Daytime dataset has a ~12h gap every night (no rows between the 19:00 and 07:00 hours), but NF models need continuous hourly series.

In [None]:
def to_nf_format(df, features, target='count', id_col='beach'):
    """Select and rename columns into the long layout NeuralForecast expects.

    Keeps ``datetime``, ``id_col``, ``target`` and ``features`` (in that
    order) and renames the first three to ``ds``, ``unique_id`` and ``y``.
    Returns a new frame; ``df`` is left untouched.
    """
    selected = ['datetime', id_col, target] + features
    renames = {'datetime': 'ds', id_col: 'unique_id', target: 'y'}
    return df[selected].copy().rename(columns=renames)

def split_at_long_gaps(nf_df, max_gap_hours=MAX_GAP_HOURS, min_segment_hours=MIN_SEGMENT_HOURS):
    """Cut each series into segments wherever consecutive rows are far apart.

    For every ``unique_id`` the rows are sorted by ``ds``; a new segment starts
    at each row whose time difference to the previous row exceeds
    ``max_gap_hours``. Segments with fewer than ``min_segment_hours`` rows are
    dropped (the threshold is compared against the row count, not the time
    span). When a series yields more than one segment, the kept segments are
    renamed ``"<uid>__seg<i>"`` where ``i`` counts all segments of that series,
    including dropped ones.

    Segmentation is positional, so it does not depend on ``nf_df`` having a
    unique index.

    Returns:
        The kept segments concatenated with a fresh 0..n-1 index, or an empty
        frame with the columns of ``nf_df`` when no segment is long enough.
    """
    kept = []
    for uid in nf_df['unique_id'].unique():
        s = nf_df[nf_df['unique_id'] == uid].sort_values('ds')
        if s.empty:
            # Happens for a missing id (NaN never equals itself).
            continue

        gap_hours = s['ds'].diff().dt.total_seconds() / 3600
        # The first diff is NaT/NaN and compares False, so numbering starts at
        # 0 and increases by one at every long gap.
        segment_no = (gap_hours > max_gap_hours).cumsum().to_numpy()
        n_segments = int(segment_no.max()) + 1

        for i in range(n_segments):
            seg = s[segment_no == i]
            if len(seg) >= min_segment_hours:
                seg = seg.copy()
                seg['unique_id'] = f"{uid}__seg{i}" if n_segments > 1 else uid
                kept.append(seg)

    if not kept:
        # pd.concat([]) raises; return an empty frame with the same schema.
        return nf_df.iloc[0:0].reset_index(drop=True)
    return pd.concat(kept, ignore_index=True)

def _regularize_nf_frame(nf_df):
    """Segment, de-duplicate and hourly-regularize one NeuralForecast frame."""
    nf_df = split_at_long_gaps(nf_df)
    # Average duplicate (series, timestamp) rows.
    nf_df = nf_df.groupby(['unique_id', 'ds']).mean(numeric_only=True).reset_index()
    # Insert the missing hourly timestamps of each series.
    nf_df = uf_fill_gaps(nf_df, freq='h')

    fill_cols = [c for c in ['y'] + ALL_FEATURES if c in nf_df.columns]
    # Both fills are done per series. The previous `.ffill().bfill()` chain ran
    # the back-fill on the ungrouped result, so a leading gap could be filled
    # with values from the next series.
    nf_df[fill_cols] = nf_df.groupby('unique_id')[fill_cols].ffill()
    nf_df[fill_cols] = nf_df.groupby('unique_id')[fill_cols].bfill()
    return nf_df


def prepare_nf_data(ds_name):
    """Build the NeuralForecast train+val and test frames for one dataset.

    Args:
        ds_name: Key into the global ``splits`` dict.

    Returns:
        ``(nf_trainval, nf_test, n_series)``; both frames are restricted to the
        ``unique_id`` values present in both, and ``n_series`` is their count.
    """
    s = splits[ds_name]
    trainval = pd.concat([s['df_train'], s['df_val']])
    nf_trainval = _regularize_nf_frame(to_nf_format(trainval, ALL_FEATURES, id_col=beach_col))
    nf_test = _regularize_nf_frame(to_nf_format(s['df_test'], ALL_FEATURES, id_col=beach_col))

    # NOTE(review): train+val and test are segmented independently, so a
    # "<uid>__seg<i>" id in one frame need not denote the same stretch of time
    # as the same id in the other, and a beach segmented in only one of the two
    # frames is dropped here. Confirm this matching is intended.
    common = set(nf_trainval['unique_id'].unique()) & set(nf_test['unique_id'].unique())
    nf_trainval = nf_trainval[nf_trainval['unique_id'].isin(common)]
    nf_test = nf_test[nf_test['unique_id'].isin(common)]

    return nf_trainval, nf_test, len(common)

# Prepared NeuralForecast inputs per dataset. The dict always exists so the
# training cell can iterate over it; it is only filled when NeuralForecast
# imported successfully (prepare_nf_data needs uf_fill_gaps, which is
# undefined otherwise).
nf_datasets = {}
if HAS_NF:
    for ds_name in ['Full 24h', 'Night=0']:
        tv, te, ns = prepare_nf_data(ds_name)
        nf_datasets[ds_name] = {'trainval': tv, 'test': te, 'n_series': ns}
        print(f"{ds_name}: train+val={len(tv)}, test={len(te)}, series={ns}")
else:
    print("NeuralForecast unavailable — skipping NF dataset preparation")

In [None]:
def get_nf_models(n_series, hist_exog):
    """Instantiate the NeuralForecast models to compare.

    Args:
        n_series: Number of series, passed to the models that take ``n_series``
            (TimeMixer, TSMixerx, iTransformer).
        hist_exog: Column names passed as ``hist_exog_list`` to the models that
            accept historical exogenous inputs here.

    Returns:
        List of ``(display_name, model)`` tuples, all sharing the same horizon
        (``h=1``) and training settings from the config cell.
    """
    shared = {
        'h': 1,
        'input_size': INPUT_SIZE,
        'max_steps': MAX_STEPS,
        'early_stop_patience_steps': EARLY_STOP_PATIENCE,
        'scaler_type': 'robust',
        'learning_rate': LEARNING_RATE,
        'batch_size': BATCH_SIZE,
        'val_check_steps': 50,
        'random_seed': 42,
        'start_padding_enabled': True,
    }
    with_exog = {'hist_exog_list': hist_exog, **shared}
    multivariate = {'n_series': n_series, **shared}

    models = []
    models.append(('LSTM', NF_LSTM(encoder_n_layers=2, encoder_hidden_size=128,
                                   decoder_hidden_size=128, decoder_layers=2, **with_exog)))
    models.append(('NHITS', NHITS(**with_exog)))
    models.append(('NBEATSx', NBEATSx(stack_types=['identity', 'identity', 'identity'], **with_exog)))
    models.append(('TFT', TFT(hidden_size=64, **with_exog)))
    models.append(('TCN', TCN(**with_exog)))
    models.append(('TiDE', TiDE(**with_exog)))
    models.append(('NF_MLP', NF_MLP(**with_exog)))
    # PatchTST is built from the shared settings only (no exogenous inputs).
    models.append(('PatchTST', PatchTST(**shared)))
    models.append(('TimeMixer', TimeMixer(**multivariate)))
    models.append(('TSMixerx', TSMixerx(**multivariate)))
    models.append(('iTransformer', iTransformer(**multivariate)))
    return models

# Train and evaluate every NeuralForecast model on every prepared dataset.
# Results are appended to the accumulators created by the sklearn cell.
if HAS_NF:
    for ds_name, nf_data in nf_datasets.items():
        tv, te, ns = nf_data['trainval'], nf_data['test'], nf_data['n_series']
        # Validation window: a fifth of the shortest series, bounded to [24, 200].
        min_len = tv.groupby('unique_id').size().min()
        val_size = max(24, min(min_len // 5, 200))

        print(f"\n{'='*70}")
        print(f"NEURALFORECAST — {ds_name} (series={ns}, val_size={val_size})")
        print(f"{'='*70}")

        for model_name, model in get_nf_models(ns, ALL_FEATURES):
            print(f"\n  {model_name}...")
            try:
                start = time.time()
                nf = NeuralForecast(models=[model], freq='h')
                nf.fit(df=tv, val_size=val_size)
                train_time = time.time() - start

                preds = nf.predict(df=tv)
                # Depending on the NeuralForecast version, `unique_id` comes
                # back as the index or as a regular column. Resetting the index
                # unconditionally adds an 'index' column in the latter case,
                # which the selection below would then pick as the predictions.
                if 'unique_id' not in preds.columns:
                    preds = preds.reset_index()
                pred_col = [c for c in preds.columns if c not in ['unique_id', 'ds']][0]
                # NOTE(review): the models use h=1, so predict() is expected to
                # return a single step per series right after the end of `tv`;
                # this inner merge can then match at most one test row per
                # series (see the "matched" count printed below), whereas the
                # sklearn models are scored on the whole test split. Confirm
                # this is the intended comparison.
                merged = te.merge(preds[['unique_id', 'ds', pred_col]], on=['unique_id', 'ds'], how='inner')

                if len(merged) == 0:
                    raise ValueError("No matching predictions")

                y_pred = np.clip(merged[pred_col].values, 0, None)
                y_true = merged['y'].values

                mae = mean_absolute_error(y_true, y_pred)
                rmse = np.sqrt(mean_squared_error(y_true, y_pred))
                r2 = r2_score(y_true, y_pred)

                # Rename back to the column names evaluate_per_beach expects.
                eval_df = merged.rename(columns={'unique_id': beach_col, 'y': 'count'}).copy()
                beach_df = evaluate_per_beach(eval_df, y_pred, beach_col)
                beach_df['model'] = model_name
                beach_df['dataset'] = ds_name
                beach_df['model_type'] = 'NeuralForecast'
                all_beach_results.append(beach_df)

                avg_rel_mae = beach_df['RelMAE (%)'].mean()
                all_results.append({
                    'Model': model_name, 'Dataset': ds_name, 'Type': 'NeuralForecast',
                    'MAE': mae, 'RMSE': rmse, 'R2': r2, 'Avg RelMAE (%)': avg_rel_mae,
                    'Time (s)': train_time, 'N_test': len(merged)
                })
                print(f"    {train_time:.1f}s | MAE: {mae:.2f} | RelMAE: {avg_rel_mae:.2f}% | R2: {r2:.4f} | matched: {len(merged)}/{len(te)}")

            except Exception as e:
                # Keep going with the next model; record the failure as NaNs.
                print(f"    ERROR: {e}")
                all_results.append({
                    'Model': model_name, 'Dataset': ds_name, 'Type': 'NeuralForecast',
                    'MAE': np.nan, 'RMSE': np.nan, 'R2': np.nan, 'Avg RelMAE (%)': np.nan,
                    'Time (s)': np.nan, 'N_test': 0
                })

## Results

In [None]:
# Collect the accumulated runs into two frames: one row per (model, dataset)
# and one row per (model, dataset, beach).
results_df = pd.DataFrame(all_results).sort_values(['Dataset', 'Avg RelMAE (%)'])
beach_results_df = pd.concat(all_beach_results, ignore_index=True)

# Persist both tables next to the figures.
save_dir = Path(SAVE_DIR)
save_dir.mkdir(parents=True, exist_ok=True)
for frame, filename in (
    (results_df, 'results_comparison.csv'),
    (beach_results_df, 'beach_results_comparison.csv'),
):
    frame.to_csv(save_dir / filename, index=False)

# One printed table per dataset, best (lowest) Avg RelMAE first.
for ds_name in datasets.keys():
    is_current = results_df['Dataset'] == ds_name
    sub = results_df[is_current].copy()
    print(f"\n{'='*80}")
    print(f"  {ds_name}")
    print(f"{'='*80}")
    print(sub[['Model', 'Type', 'MAE', 'RMSE', 'R2', 'Avg RelMAE (%)', 'Time (s)']].to_string(index=False))

In [None]:
# Model × Dataset table of Avg RelMAE, ordered by the first dataset column
# (models without a value there go last).
pivot = (
    results_df
    .pivot_table(index='Model', columns='Dataset', values='Avg RelMAE (%)', aggfunc='first')
    .pipe(lambda table: table.sort_values(table.columns[0], na_position='last'))
)
print("\nRelMAE (%) — Model × Dataset:")
print(pivot.round(2).to_string())

## Visualizations

In [None]:
# Grouped bar chart: one group per model, one bar per dataset (Avg RelMAE).
fig, ax = plt.subplots(figsize=(14, 7))

pivot_plot = results_df.pivot_table(index='Model', columns='Dataset', values='Avg RelMAE (%)', aggfunc='first')
# Order models by their mean RelMAE across datasets (best first).
models_sorted = pivot_plot.mean(axis=1).sort_values().index
pivot_plot = pivot_plot.loc[models_sorted]

# x, width, ds_names and colors are reused by the R² bar chart cell.
x = np.arange(len(pivot_plot))
width = 0.25
ds_names = list(datasets.keys())
colors = ['#2196F3', '#FF9800', '#4CAF50']

for i, ds in enumerate(ds_names):
    if ds in pivot_plot.columns:
        vals = pivot_plot[ds].values
        bars = ax.bar(x + i * width, vals, width, label=ds, color=colors[i], edgecolor='black', linewidth=0.5)

ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Avg RelMAE (%)', fontsize=12)
ax.set_title('Model Performance Across Datasets — Relative MAE (lower is better)', fontsize=14, fontweight='bold')
# With three bars of equal width per group, x + width is the middle bar.
ax.set_xticks(x + width)
ax.set_xticklabels(pivot_plot.index, rotation=45, ha='right')
ax.legend(title='Dataset')
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(save_dir / 'grouped_bar_relmae.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Per-beach RelMAE heatmaps, one panel per dataset.
fig, axes = plt.subplots(1, len(datasets), figsize=(20, 8), sharey=True)
axes = np.atleast_1d(axes)  # a single dataset yields a bare Axes

# Build every beach × model table first, so that all panels can share the same
# rows and the same colour scale.
heatmaps = {}
for ds_name in datasets.keys():
    sub = beach_results_df[beach_results_df['dataset'] == ds_name]
    if len(sub) == 0:
        continue
    heatmap_data = sub.pivot_table(index='beach', columns='model', values='RelMAE (%)', aggfunc='first')
    # Columns ordered by mean RelMAE (best model first).
    heatmaps[ds_name] = heatmap_data.reindex(columns=heatmap_data.mean().sort_values().index)

# The panels share the y axis but only one set of tick labels is shown, so
# every panel must use the same beach rows; beaches missing from a dataset
# become blank (NaN) rows.
all_beaches = sorted({b for hm in heatmaps.values() for b in hm.index})

# One colour scale for all panels, so the single colorbar is valid for each.
if heatmaps:
    stacked = np.concatenate([hm.to_numpy(dtype=float).ravel() for hm in heatmaps.values()])
    stacked = stacked[np.isfinite(stacked)]
else:
    stacked = np.array([])
vmin, vmax = (stacked.min(), stacked.max()) if stacked.size else (None, None)

im = None
for idx, ds_name in enumerate(datasets.keys()):
    if ds_name not in heatmaps:
        axes[idx].set_title(f'{ds_name}\n(no NF results)')
        continue
    heatmap_data = heatmaps[ds_name].reindex(index=all_beaches)

    im = axes[idx].imshow(heatmap_data.values, cmap='RdYlGn_r', aspect='auto', vmin=vmin, vmax=vmax)
    axes[idx].set_xticks(range(len(heatmap_data.columns)))
    axes[idx].set_xticklabels(heatmap_data.columns, rotation=45, ha='right', fontsize=8)
    axes[idx].set_yticks(range(len(heatmap_data.index)))
    axes[idx].set_yticklabels(heatmap_data.index, fontsize=8)
    axes[idx].set_title(f'{ds_name}', fontsize=12, fontweight='bold')

    # Print the rounded value in every non-empty cell.
    for i in range(len(heatmap_data.index)):
        for j in range(len(heatmap_data.columns)):
            val = heatmap_data.values[i, j]
            if not np.isnan(val):
                axes[idx].text(j, i, f'{val:.0f}', ha='center', va='center', fontsize=6)

# Only draw a colorbar when at least one panel was drawn.
if im is not None:
    fig.colorbar(im, ax=axes, label='RelMAE (%)', shrink=0.6)
fig.suptitle('Per-Beach Relative MAE (%) — Heatmap by Dataset', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(save_dir / 'beach_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Grouped bar chart of R² per model and dataset. Reuses models_sorted,
# ds_names, width and colors from the RelMAE bar chart cell, so that cell
# must have been run first.
fig, ax = plt.subplots(figsize=(14, 7))

pivot_r2 = results_df.pivot_table(index='Model', columns='Dataset', values='R2', aggfunc='first')
# Same model order as the RelMAE chart.
pivot_r2 = pivot_r2.loc[models_sorted]

x = np.arange(len(pivot_r2))
for i, ds in enumerate(ds_names):
    if ds in pivot_r2.columns:
        vals = pivot_r2[ds].values
        ax.bar(x + i * width, vals, width, label=ds, color=colors[i], edgecolor='black', linewidth=0.5)

ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('R²', fontsize=12)
ax.set_title('R² Score Across Datasets (higher is better)', fontsize=14, fontweight='bold')
ax.set_xticks(x + width)
ax.set_xticklabels(pivot_r2.index, rotation=45, ha='right')
ax.legend(title='Dataset')
# Reference line at R² = 0.
ax.axhline(y=0, color='red', linestyle='--', alpha=0.3)
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(save_dir / 'grouped_bar_r2.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Efficiency scatter: Avg RelMAE against training time, one panel per dataset.
from matplotlib.patches import Patch

# Panel count follows `datasets` (as in the other per-dataset figures) instead
# of a hard-coded 3.
fig, axes = plt.subplots(1, len(datasets), figsize=(18, 6), sharey=True)
axes = np.atleast_1d(axes)  # a single dataset yields a bare Axes

for idx, ds_name in enumerate(datasets.keys()):
    # dropna() removes the NaN rows recorded for failed NeuralForecast runs.
    sub = results_df[results_df['Dataset'] == ds_name].dropna()
    ax = axes[idx]
    for _, row in sub.iterrows():
        c = '#2196F3' if row['Type'] == 'Sklearn' else '#FF5722'
        ax.scatter(row['Time (s)'], row['Avg RelMAE (%)'], s=100, c=c, edgecolor='black', zorder=5)
        ax.annotate(row['Model'], (row['Time (s)'], row['Avg RelMAE (%)']),
                   fontsize=7, ha='left', va='bottom', xytext=(4, 4), textcoords='offset points')
    ax.set_xlabel('Training Time (s)')
    ax.set_title(f'{ds_name}', fontweight='bold')
    # Training times span several orders of magnitude.
    ax.set_xscale('log')
    ax.grid(alpha=0.3)

axes[0].set_ylabel('Avg RelMAE (%)')

fig.legend(handles=[Patch(color='#2196F3', label='Sklearn'), Patch(color='#FF5722', label='NeuralForecast')],
           loc='upper right', fontsize=10)
fig.suptitle('Efficiency: RelMAE vs Training Time (lower-left is better)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(save_dir / 'efficiency_scatter.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Line plot of Avg RelMAE per dataset strategy, for every model that has
# result rows on at least two datasets.
models_both = results_df.groupby('Model').filter(lambda x: len(x) >= 2)['Model'].unique()
fig, ax = plt.subplots(figsize=(12, 6))

for model_name in sorted(models_both):
    sub = results_df[results_df['Model'] == model_name].sort_values('Dataset')
    # Circles for sklearn-type models, squares for everything else.
    marker = 'o' if sub['Type'].iloc[0] == 'Sklearn' else 's'
    ax.plot(sub['Dataset'], sub['Avg RelMAE (%)'], marker=marker, label=model_name, linewidth=2, markersize=8)

ax.set_xlabel('Dataset Strategy', fontsize=12)
ax.set_ylabel('Avg RelMAE (%)', fontsize=12)
ax.set_title('How Dataset Strategy Affects Each Model', fontsize=14, fontweight='bold')
# Legend placed outside the axes on the right.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)
ax.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(save_dir / 'dataset_impact.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# For each dataset, count on how many beaches each model has the lowest RelMAE.
fig, axes = plt.subplots(1, len(datasets), figsize=(18, 6), sharey=True)

for idx, ds_name in enumerate(datasets.keys()):
    sub = beach_results_df[beach_results_df['dataset'] == ds_name]
    if len(sub) == 0:
        continue
    # Row with the minimum RelMAE per beach, then tally the winning models.
    best_per_beach = sub.loc[sub.groupby('beach')['RelMAE (%)'].idxmin()]
    counts = best_per_beach['model'].value_counts()

    axes[idx].barh(counts.index, counts.values, color='steelblue', edgecolor='black')
    axes[idx].set_xlabel('# Beaches where best')
    axes[idx].set_title(f'{ds_name}', fontweight='bold')
    axes[idx].grid(axis='x', alpha=0.3)

fig.suptitle('Which Model Wins on Most Beaches? (by RelMAE)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(save_dir / 'best_per_beach.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Text summary: best model overall and best per model family, per dataset.
print("\n" + "="*80)
print("SUMMARY")
print("="*80)

for ds_name in datasets.keys():
    # dropna() removes the NaN rows recorded for failed runs.
    sub = results_df[results_df['Dataset'] == ds_name].dropna()
    if len(sub) == 0:
        continue
    best = sub.loc[sub['Avg RelMAE (%)'].idxmin()]
    print(f"\n{ds_name}:")
    print(f"  Best: {best['Model']} ({best['Type']}) — RelMAE: {best['Avg RelMAE (%)']:.2f}%, R2: {best['R2']:.4f}")
    sk = sub[sub['Type'] == 'Sklearn']
    if len(sk) > 0:
        best_sk = sk.loc[sk['Avg RelMAE (%)'].idxmin()]
        print(f"  Best Sklearn: {best_sk['Model']} — RelMAE: {best_sk['Avg RelMAE (%)']:.2f}%")
    # Note: this rebinds the global name `nf` used by the NeuralForecast training cell.
    nf = sub[sub['Type'] == 'NeuralForecast']
    if len(nf) > 0:
        best_nf = nf.loc[nf['Avg RelMAE (%)'].idxmin()]
        print(f"  Best NF: {best_nf['Model']} — RelMAE: {best_nf['Avg RelMAE (%)']:.2f}%")

print(f"\nTotal results: {len(results_df)}")
print(f"Beach-level results: {len(beach_results_df)}")
print(f"Saved to: {save_dir}")