# Beach Crowd Prediction — 3-Dataset Comparison

Compare model performance across three dataset strategies:
1. **Daytime only** — remove night hours, sklearn models only
2. **Full 24h** — keep all data including noisy night counts  
3. **Night = 0** — keep 24h but replace night counts with 0

Models with `stat_exog_list` (NBEATSx, NHITS, TFT, TiDE, BiTCN) can use beach metadata for zero-shot prediction on new beaches.

In [1]:
# === PATHS ===
CACHE_DIR = "cache/predictions"
COUNTING_MODEL = "bayesian_vgg19"
SAVE_DIR = "models/dataset_comparison"

# === SAMPLING (for quick testing) ===
SAMPLE_FRAC = 1.0
MAX_BEACHES = 3

# === MODEL PARAMETERS ===
MAX_STEPS = 500
EARLY_STOP_PATIENCE = 30
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
INPUT_SIZE = 24

# === TIME ===
NIGHT_START = 20
NIGHT_END = 6

# === FLAGS ===
RUN_SKLEARN = True
RUN_NEURALFORECAST = True

In [2]:
import subprocess, sys
for pkg in ["neuralforecast", "xgboost", "lightgbm", "catboost", "utilsforecast"]:
    subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"])
print("Packages installed")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.13 install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.13 install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.13 install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.13 instal

Packages installed



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.13 install --upgrade pip[0m


In [3]:
import json, time, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso

import torch
warnings.filterwarnings('ignore')

try:
    from xgboost import XGBRegressor
    HAS_XGB = True
except: HAS_XGB = False

try:
    from lightgbm import LGBMRegressor
    HAS_LGBM = True
except: HAS_LGBM = False

try:
    from catboost import CatBoostRegressor
    HAS_CATBOOST = True
except: HAS_CATBOOST = False

try:
    from neuralforecast import NeuralForecast
    from neuralforecast.models import NBEATSx, NHITS, TFT, TiDE, BiTCN
    HAS_NF = True
except Exception as e:
    print(f"NeuralForecast error: {e}")
    HAS_NF = False

if torch.cuda.is_available():
    accelerator = 'gpu'
elif torch.backends.mps.is_available():
    accelerator = 'mps'
else:
    accelerator = 'cpu'

print(f"Accelerator: {accelerator}")
print(f"XGB: {HAS_XGB}, LGBM: {HAS_LGBM}, CatBoost: {HAS_CATBOOST}, NF: {HAS_NF}")

Accelerator: mps
XGB: True, LGBM: True, CatBoost: True, NF: True


In [4]:
def calc_metrics(y_true, y_pred, max_count):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    rel_mae = (mae / max_count) * 100 if max_count > 0 else 0
    return {'MAE': mae, 'RMSE': rmse, 'R2': r2, 'RelMAE': rel_mae}

def eval_per_beach(df, y_pred, beach_col='unique_id'):
    results = []
    for b in df[beach_col].unique():
        mask = df[beach_col] == b
        if mask.sum() < 3:
            continue
        y_true = df.loc[mask, 'y'].values if 'y' in df.columns else df.loc[mask, 'count'].values
        y_p = y_pred[mask.values]
        max_count = y_true.max()
        m = calc_metrics(y_true, y_p, max_count)
        m['camera'] = b
        m['max_count'] = max_count
        m['n'] = mask.sum()
        results.append(m)
    return pd.DataFrame(results)

In [5]:
def load_cache(cache_dir, model):
    cache_path = Path(cache_dir) / model
    records = []
    for jf in cache_path.rglob("*.json"):
        try:
            with open(jf) as f:
                r = json.load(f)
            if 'error' not in r:
                records.append(r)
        except: pass
    
    rows = []
    for r in records:
        row = {
            'beach': r.get('beach') or r.get('beach_folder'),
            'beach_folder': r.get('beach_folder'),
            'datetime': r.get('datetime'),
            'count': r.get('count')
        }
        for k, v in r.get('weather', {}).items():
            row[k] = v
        rows.append(row)
    
    df = pd.DataFrame(rows)
    df['datetime'] = pd.to_datetime(df['datetime'])
    df = df.sort_values('datetime').reset_index(drop=True)
    return df

df_raw = load_cache(CACHE_DIR, COUNTING_MODEL)
print(f"Loaded: {len(df_raw)} rows, {df_raw['beach'].nunique()} beaches")
print(f"Date range: {df_raw['datetime'].min()} to {df_raw['datetime'].max()}")



Loaded: 169241 rows, 41 beaches
Date range: 1970-01-01 01:00:00 to 2023-01-27 08:00:00


In [6]:
print(df_raw.head())

                                               beach    beach_folder  \
0                                        Camp de Mar  livecampro/070   
1                                       Port Andratx  livecampro/001   
2                                         Cala Major  livecampro/002   
3  Son Serra Marina (des de El Sol Sunshine Bar &...  livecampro/035   
4              Playa de Muro (Hotel Playa Esperanza)  livecampro/024   

             datetime      count  ae_ta  ae_hr  ae_prec  ae_vv  ae_dv  \
0 1970-01-01 01:00:00  29.308895    NaN    NaN      NaN    NaN    NaN   
1 1970-01-01 01:00:00  13.774030    NaN    NaN      NaN    NaN    NaN   
2 1970-01-01 01:00:00  69.105835    NaN    NaN      NaN    NaN    NaN   
3 1970-01-01 01:00:00  40.888012    NaN    NaN      NaN    NaN    NaN   
4 1970-01-01 01:00:00  36.653721    NaN    NaN      NaN    NaN    NaN   

   ae_pres  ...  om_wind_direction_10m  om_wind_gusts_10m  om_cloud_cover  \
0      NaN  ...               262.0000            4

In [7]:
EXCLUDE = ['livecampro/001', 'livecampro/011', 'livecampro/018', 'livecampro/021',
    'livecampro/030', 'livecampro/039', 'livecampro/070', 'MultimediaTres/PortAndratx',
    'SeeTheWorld/mallorca_pancam', 'skyline/es-pujols']
EXCLUDE_PREFIX = ['ibred', 'ClubNauticSoller', 'Guenthoer', 'youtube']

before = len(df_raw)
df_raw = df_raw[~df_raw['beach_folder'].isin(EXCLUDE)]
for p in EXCLUDE_PREFIX:
    df_raw = df_raw[~df_raw['beach_folder'].str.startswith(p, na=False)]
print(f"Filtered: {before} -> {len(df_raw)}")

Filtered: 169241 -> 70501


In [8]:
if SAMPLE_FRAC < 1.0:
    df_raw = df_raw.sample(frac=SAMPLE_FRAC, random_state=42).sort_values('datetime').reset_index(drop=True)
    print(f"Sampled to {len(df_raw)}")

if MAX_BEACHES:
    top = df_raw['beach'].value_counts().head(MAX_BEACHES).index.tolist()
    df_raw = df_raw[df_raw['beach'].isin(top)].reset_index(drop=True)
    print(f"Limited to {MAX_BEACHES} beaches: {len(df_raw)} rows")

print(f"Final: {len(df_raw)} rows, {df_raw['beach'].nunique()} beaches")

Limited to 3 beaches: 22167 rows
Final: 22167 rows, 3 beaches


In [9]:
df = df_raw.copy()
df['hour'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.dayofweek
df['month'] = df['datetime'].dt.month
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
df['is_summer'] = df['month'].isin([6, 7, 8]).astype(int)
df['is_night'] = ((df['hour'] >= NIGHT_START) | (df['hour'] <= NIGHT_END)).astype(int)

WEATHER_COLS = [c for c in df.columns if c.startswith('ae_') or c.startswith('om_')]
TEMPORAL_COLS = ['hour', 'day_of_week', 'month', 'is_weekend', 'is_summer', 'is_night']
ALL_FEATURES = WEATHER_COLS + TEMPORAL_COLS

df = df.dropna(subset=ALL_FEATURES + ['count']).reset_index(drop=True)
good = df.groupby('beach')['count'].max()
good = good[good > 20].index.tolist()
df = df[df['beach'].isin(good)].reset_index(drop=True)

print(f"After cleaning: {len(df)} rows, {len(good)} beaches")
print(f"Features: {len(ALL_FEATURES)}")

After cleaning: 21962 rows, 3 beaches
Features: 35


In [10]:
ds_daytime = df[df['is_night'] == 0].copy().reset_index(drop=True)
ds_full24h = df.copy()
ds_night0 = df.copy()
ds_night0.loc[ds_night0['is_night'] == 1, 'count'] = 0.0

datasets = {'Daytime': ds_daytime, 'Full24h': ds_full24h, 'Night0': ds_night0}

print("="*80)
print("DATASET COMPARISON")
print("="*80)

for name, d in datasets.items():
    night_rows = d[d['is_night'] == 1] if 'is_night' in d.columns else pd.DataFrame()
    day_rows = d[d['is_night'] == 0] if 'is_night' in d.columns else d
    
    print(f"\n{name}:")
    print(f"  Total rows:     {len(d)}")
    print(f"  Beaches:        {d['beach'].nunique()}")
    print(f"  Night rows:     {len(night_rows)} ({len(night_rows)/len(d)*100:.1f}%)")
    print(f"  Day rows:       {len(day_rows)} ({len(day_rows)/len(d)*100:.1f}%)")
    print(f"  Count - mean:   {d['count'].mean():.1f}")
    print(f"  Count - median: {d['count'].median():.1f}")
    print(f"  Count - std:    {d['count'].std():.1f}")
    print(f"  Count - min:    {d['count'].min():.1f}")
    print(f"  Count - max:    {d['count'].max():.1f}")
    if len(day_rows) > 0:
        print(f"  Day mean:       {day_rows['count'].mean():.1f}")
    if len(night_rows) > 0:
        print(f"  Night mean:     {night_rows['count'].mean():.1f}")
    print(f"  Zeros:          {(d['count'] == 0).sum()} ({(d['count'] == 0).sum()/len(d)*100:.1f}%)")

DATASET COMPARISON

Daytime:
  Total rows:     11807
  Beaches:        3
  Night rows:     0 (0.0%)
  Day rows:       11807 (100.0%)
  Count - mean:   76.7
  Count - median: 32.0
  Count - std:    86.2
  Count - min:    4.4
  Count - max:    538.0
  Day mean:       76.7
  Zeros:          0 (0.0%)

Full24h:
  Total rows:     21962
  Beaches:        3
  Night rows:     10155 (46.2%)
  Day rows:       11807 (53.8%)
  Count - mean:   74.9
  Count - median: 27.7
  Count - std:    93.9
  Count - min:    4.4
  Count - max:    538.0
  Day mean:       76.7
  Night mean:     72.9
  Zeros:          0 (0.0%)

Night0:
  Total rows:     21962
  Beaches:        3
  Night rows:     10155 (46.2%)
  Day rows:       11807 (53.8%)
  Count - mean:   41.2
  Count - median: 16.3
  Count - std:    73.9
  Count - min:    0.0
  Count - max:    538.0
  Day mean:       76.7
  Night mean:     0.0
  Zeros:          10155 (46.2%)


In [11]:
print(ds_night0.head())

                       beach                 beach_folder            datetime  \
0  Platja dor (Can Pastilla)        HeliosHotel/frontline 2022-07-14 11:00:00   
1             Badia dAlcúdia  Monnaber/webcam-alcudia000M 2022-07-14 11:00:00   
2  Platja dor (Can Pastilla)      HeliosHotel/frontline-2 2022-07-14 11:00:00   
3             Badia dAlcúdia     skyline/mallorca-alcudia 2022-07-14 11:00:00   
4             Port de Soller   SeeTheWorld/port_de_soller 2022-07-14 11:00:00   

        count    ae_ta    ae_hr  ae_prec   ae_vv     ae_dv    ae_pres  ...  \
0   82.135231  30.0154  56.8880      0.0  4.4184  218.8209  1018.7906  ...   
1   33.591919  31.2604  50.3663      0.0  2.0734   61.8588  1015.1991  ...   
2   43.394043  30.0154  56.8880      0.0  4.4184  218.8209  1018.7906  ...   
3   19.267483  31.7492  47.8123      0.0  2.1776   68.3430  1015.1705  ...   
4  232.566895  28.6074  65.7055      0.0  2.5767   26.7109  1015.0849  ...   

   om_sunshine_duration  om_vapour_pressure_

In [12]:
def split_data(df, train_frac=0.7, val_frac=0.15):
    n = len(df)
    t1 = int(n * train_frac)
    t2 = int(n * (train_frac + val_frac))
    return df.iloc[:t1], df.iloc[t1:t2], df.iloc[t2:]

splits = {}
for name, d in datasets.items():
    train, val, test = split_data(d)
    splits[name] = {'train': train, 'val': val, 'test': test}
    print(f"{name}: train={len(train)}, val={len(val)}, test={len(test)}")

Daytime: train=8264, val=1771, test=1772
Full24h: train=15373, val=3294, test=3295
Night0: train=15373, val=3294, test=3295


## Sklearn Models

In [13]:
def get_sklearn_models():
    models = {
        'Lasso': Lasso(alpha=0.1),
        'RandomForest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1),
        'GradientBoosting': GradientBoostingRegressor(n_estimators=100, max_depth=6, random_state=42),
    }
    if HAS_XGB:
        models['XGBoost'] = XGBRegressor(n_estimators=200, max_depth=6, random_state=42, n_jobs=-1, verbosity=0)
    if HAS_LGBM:
        models['LightGBM'] = LGBMRegressor(n_estimators=200, max_depth=6, random_state=42, n_jobs=-1, verbose=-1)
    if HAS_CATBOOST:
        models['CatBoost'] = CatBoostRegressor(n_estimators=200, max_depth=6, random_state=42, verbose=0)
    return models

print(f"Sklearn models: {list(get_sklearn_models().keys())}")

Sklearn models: ['Lasso', 'RandomForest', 'GradientBoosting', 'XGBoost', 'LightGBM', 'CatBoost']


In [14]:
all_results = []
all_beach_results = []

if RUN_SKLEARN:
    for ds_name in datasets.keys():
        s = splits[ds_name]
        X_train = pd.concat([s['train'], s['val']])[ALL_FEATURES]
        y_train = pd.concat([s['train'], s['val']])['count']
        X_test = s['test'][ALL_FEATURES]
        y_test = s['test']['count']

        print(f"\n=== SKLEARN - {ds_name} ===")
        
        for model_name, model in get_sklearn_models().items():
            t0 = time.time()
            model.fit(X_train, y_train)
            y_pred = np.clip(model.predict(X_test), 0, None)
            elapsed = time.time() - t0

            m = calc_metrics(y_test.values, y_pred, y_test.max())
            beach_df = eval_per_beach(s['test'], y_pred, 'beach')
            beach_df['model'] = model_name
            beach_df['dataset'] = ds_name
            all_beach_results.append(beach_df)

            avg_rel = beach_df['RelMAE'].mean()
            all_results.append({
                'Model': model_name, 'Dataset': ds_name, 'Type': 'Sklearn',
                'MAE': m['MAE'], 'RMSE': m['RMSE'], 'R2': m['R2'],
                'AvgRelMAE': avg_rel, 'Time': elapsed
            })
            print(f"  {model_name:20s} | {elapsed:5.1f}s | MAE={m['MAE']:.1f} | RelMAE={avg_rel:.1f}%")


=== SKLEARN - Daytime ===
  Lasso                |   0.2s | MAE=73.4 | RelMAE=110.5%
  RandomForest         |   1.0s | MAE=26.7 | RelMAE=37.2%
  GradientBoosting     |   8.0s | MAE=30.9 | RelMAE=47.2%
  XGBoost              |   0.4s | MAE=34.8 | RelMAE=51.5%
  LightGBM             |   1.1s | MAE=25.1 | RelMAE=34.4%
  CatBoost             |   0.4s | MAE=27.5 | RelMAE=38.3%

=== SKLEARN - Full24h ===
  Lasso                |   0.3s | MAE=45.0 | RelMAE=54.9%
  RandomForest         |   1.7s | MAE=23.9 | RelMAE=27.0%
  GradientBoosting     |  15.1s | MAE=31.4 | RelMAE=39.4%
  XGBoost              |   0.5s | MAE=43.2 | RelMAE=54.5%
  LightGBM             |   1.2s | MAE=24.8 | RelMAE=28.6%
  CatBoost             |   0.4s | MAE=28.0 | RelMAE=32.3%

=== SKLEARN - Night0 ===
  Lasso                |   0.3s | MAE=49.0 | RelMAE=67.1%
  RandomForest         |   1.6s | MAE=14.4 | RelMAE=19.8%
  GradientBoosting     |  13.9s | MAE=20.1 | RelMAE=30.5%
  XGBoost              |   0.5s | MAE=25.4 | RelM

## NeuralForecast Models


In [19]:
import subprocess, sys
subprocess.check_call([sys.executable, "-m", "pip", "install", "mlforecast", "optuna", "window-ops", "-q"])

from mlforecast import MLForecast
from mlforecast.auto import (
    AutoMLForecast,
    AutoLightGBM,
    AutoXGBoost,
    AutoCatboost,
    AutoRidge,
)

print("MLForecast imported successfully")

MLForecast imported successfully



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.13 install --upgrade pip[0m


In [20]:
# === PREPARE DATASETS WITH FILLED GAPS ===
# This cell creates clean, gap-filled datasets for all models (NeuralForecast, MLForecast, etc.)

from utilsforecast.preprocessing import fill_gaps

def to_nf_format(df, id_col='beach_folder'):
    cols = ['datetime', id_col, 'count'] + ALL_FEATURES
    cols = [c for c in cols if c in df.columns]
    nf_df = df[cols].copy()
    nf_df = nf_df.rename(columns={'datetime': 'ds', id_col: 'unique_id', 'count': 'y'})
    return nf_df

def prepare_dataset_with_filled_gaps(train_df, test_df, freq='h'):
    """Prepare train/test with filled gaps and interpolation"""
    
    nf_train = to_nf_format(train_df)
    nf_test = to_nf_format(test_df)
    
    print(f"  Raw: train={len(nf_train)}, test={len(nf_test)}")
    
    # Step 1: Deduplicate (multiple images per hour)
    nf_train = nf_train.groupby(['unique_id', 'ds']).mean(numeric_only=True).reset_index()
    nf_test = nf_test.groupby(['unique_id', 'ds']).mean(numeric_only=True).reset_index()
    print(f"  After dedup: train={len(nf_train)}, test={len(nf_test)}")
    
    # Step 2: Check gaps before filling
    print(f"\n  Gaps BEFORE fill_gaps:")
    for uid in nf_train['unique_id'].unique()[:3]:
        s = nf_train[nf_train['unique_id'] == uid].sort_values('ds')
        gaps = s['ds'].diff().dt.total_seconds() / 3600
        n_gaps_gt1 = (gaps > 1).sum()
        max_gap = gaps.max()
        print(f"    {uid[:40]}: {len(s)} rows, max_gap={max_gap:.0f}h, gaps>1h={n_gaps_gt1}")
    
    # Step 3: Fill gaps to create continuous hourly series
    nf_train = fill_gaps(nf_train, freq=freq)
    nf_test = fill_gaps(nf_test, freq=freq)
    print(f"\n  After fill_gaps: train={len(nf_train)}, test={len(nf_test)}")
    
    # Step 4: Check gaps after filling
    print(f"\n  Gaps AFTER fill_gaps:")
    for uid in nf_train['unique_id'].unique()[:3]:
        s = nf_train[nf_train['unique_id'] == uid].sort_values('ds')
        gaps = s['ds'].diff().dt.total_seconds() / 3600
        n_gaps_gt1 = (gaps > 1).sum()
        max_gap = gaps.max()
        nan_count = s['y'].isna().sum()
        print(f"    {uid[:40]}: {len(s)} rows, max_gap={max_gap:.0f}h, gaps>1h={n_gaps_gt1}, NaN={nan_count}")
    
    # Step 5: Interpolate NaN values (linear interpolation + ffill/bfill for edges)
    numeric_cols = nf_train.select_dtypes(include=[np.number]).columns.tolist()
    numeric_cols = [c for c in numeric_cols if c in nf_train.columns]
    
    for col in numeric_cols:
        nf_train[col] = nf_train.groupby('unique_id')[col].transform(
            lambda x: x.interpolate(method='linear').ffill().bfill()
        )
        nf_test[col] = nf_test.groupby('unique_id')[col].transform(
            lambda x: x.interpolate(method='linear').ffill().bfill()
        )
    
    print(f"\n  After interpolation:")
    print(f"    Train NaN: {nf_train['y'].isna().sum()}")
    print(f"    Test NaN: {nf_test['y'].isna().sum()}")
    
    # Step 6: Keep only series present in both train and test
    common_ids = set(nf_train['unique_id'].unique()) & set(nf_test['unique_id'].unique())
    nf_train = nf_train[nf_train['unique_id'].isin(common_ids)].reset_index(drop=True)
    nf_test = nf_test[nf_test['unique_id'].isin(common_ids)].reset_index(drop=True)
    
    print(f"\n  Final: train={len(nf_train)}, test={len(nf_test)}, series={len(common_ids)}")
    
    return nf_train, nf_test, list(common_ids)

# Process all datasets
print("="*70)
print("PREPARING DATASETS WITH FILLED GAPS")
print("="*70)

prepared_data = {}

for ds_name in ['Daytime', 'Full24h', 'Night0']:
    print(f"\n{'='*70}")
    print(f"Processing: {ds_name}")
    print(f"{'='*70}")
    
    s = splits[ds_name]
    train_val = pd.concat([s['train'], s['val']])
    
    nf_train, nf_test, series_ids = prepare_dataset_with_filled_gaps(train_val, s['test'])
    
    prepared_data[ds_name] = {
        'train': nf_train,
        'test': nf_test,
        'series_ids': series_ids,
        'n_series': len(series_ids),
    }

# Summary
print("\n" + "="*70)
print("SUMMARY")
print("="*70)
for ds_name, data in prepared_data.items():
    print(f"\n{ds_name}:")
    print(f"  Train: {len(data['train'])} rows")
    print(f"  Test: {len(data['test'])} rows")
    print(f"  Series: {data['n_series']}")
    print(f"  Features: {[c for c in data['train'].columns if c not in ['unique_id', 'ds', 'y']][:5]}...")

PREPARING DATASETS WITH FILLED GAPS

Processing: Daytime
  Raw: train=10035, test=1772
  After dedup: train=10035, test=1772

  Gaps BEFORE fill_gaps:
    HeliosHotel/frontline: 2106 rows, max_gap=23h, gaps>1h=191
    HeliosHotel/frontline-2: 2122 rows, max_gap=23h, gaps>1h=181
    Monnaber/webcam-alcudia000M: 2106 rows, max_gap=23h, gaps>1h=191

  After fill_gaps: train=19835, test=3491

  Gaps AFTER fill_gaps:
    HeliosHotel/frontline: 3967 rows, max_gap=1h, gaps>1h=0, NaN=1861
    HeliosHotel/frontline-2: 3967 rows, max_gap=1h, gaps>1h=0, NaN=1845
    Monnaber/webcam-alcudia000M: 3967 rows, max_gap=1h, gaps>1h=0, NaN=1861

  After interpolation:
    Train NaN: 0
    Test NaN: 0

  Final: train=19835, test=3491, series=5

Processing: Full24h
  Raw: train=18667, test=3295
  After dedup: train=18662, test=3295

  Gaps BEFORE fill_gaps:
    HeliosHotel/frontline: 3896 rows, max_gap=13h, gaps>1h=58
    HeliosHotel/frontline-2: 3902 rows, max_gap=13h, gaps>1h=51
    Monnaber/webcam-alcud

In [17]:
# For NeuralForecast
nf_train = prepared_data['Night0']['train']
nf_test = prepared_data['Night0']['test']

# For AutoMLForecast (only needs unique_id, ds, y)
mlf_train = prepared_data['Night0']['train'][['unique_id', 'ds', 'y']]
mlf_test = prepared_data['Night0']['test'][['unique_id', 'ds', 'y']]

In [18]:
def to_mlf_format(df, id_col='beach_folder'):
    mlf_df = df[['datetime', id_col, 'count']].copy()
    mlf_df = mlf_df.rename(columns={'datetime': 'ds', id_col: 'unique_id', 'count': 'y'})
    return mlf_df

if RUN_SKLEARN:
    for ds_name in ['Night0']:
        s = splits[ds_name]
        
        train_val = pd.concat([s['train'], s['val']])
        
        mlf_train = mlf_train.groupby(['unique_id', 'ds']).mean(numeric_only=True).reset_index()
        mlf_test = mlf_test.groupby(['unique_id', 'ds']).mean(numeric_only=True).reset_index()
        
        print(f"\n{'='*60}")
        print(f"AutoMLForecast - {ds_name}")
        print(f"{'='*60}")
        print(f"Train: {len(mlf_train)}, Test: {len(mlf_test)}, Series: {mlf_train['unique_id'].nunique()}")
        
        auto_mlf = AutoMLForecast(
            models={
                'AutoLightGBM': AutoLightGBM(),
                'AutoCatboost': AutoCatboost(),
                'AutoXGBoost': AutoXGBoost(),
            },
            freq='h',
            season_length=24,
            num_threads=-1,
        )
        
        t0 = time.time()
        auto_mlf.fit(
            df=mlf_train,
            n_windows=3,
            h=72,
            num_samples=15,
            optimize_kwargs={'timeout': 180, 'show_progress_bar': False},
        )
        elapsed = time.time() - t0
        print(f"Training time: {elapsed:.1f}s")
        
        preds = auto_mlf.predict(h=1)
        print(f"Predictions shape: {preds.shape}")
        print(f"Predictions sample:\n{preds.head()}")
        
        merged = mlf_test.merge(preds, on=['unique_id', 'ds'], how='inner')
        print(f"Matched predictions: {len(merged)}/{len(mlf_test)}")
        
        # Get actual model column names from predictions
        model_cols = [c for c in preds.columns if c not in ['unique_id', 'ds']]
        print(f"Model columns: {model_cols}")
        
        for model_name in model_cols:
            y_true = merged['y'].values
            y_pred = np.clip(merged[model_name].values, 0, None)
            
            m = calc_metrics(y_true, y_pred, y_true.max())
            
            eval_df = merged[['unique_id', 'y']].copy()
            eval_df['count'] = eval_df['y']
            eval_df['beach'] = eval_df['unique_id']
            beach_df = eval_per_beach(eval_df, y_pred, 'beach')
            avg_rel = beach_df['RelMAE'].mean() if len(beach_df) > 0 else np.nan
            
            all_results.append({
                'Model': model_name, 'Dataset': ds_name, 'Type': 'AutoMLForecast',
                'MAE': m['MAE'], 'RMSE': m['RMSE'], 'R2': m['R2'],
                'AvgRelMAE': avg_rel, 'Time': elapsed / len(model_cols)
            })
            
            print(f"  {model_name}: MAE={m['MAE']:.1f} | RelMAE={avg_rel:.1f}% | R2={m['R2']:.3f}")
        
        print(f"\nBest models found:")
        for name, model in auto_mlf.models_.items():
            print(f"  {name}: {type(model).__name__}")


AutoMLForecast - Night0
Train: 19865, Test: 3481, Series: 5


NameError: name 'AutoCatboost' is not defined


Models with stat_exog_list: NBEATSx, NHITS, TFT, TiDE, BiTCN

In [None]:
if RUN_NEURALFORECAST and HAS_NF:
    for ds_name in ['Full24h', 'Night0']:
        if ds_name not in prepared_data:
            print(f"Skipping {ds_name} - not in prepared_data")
            continue
        
        nf_train = prepared_data[ds_name]['train']
        nf_test = prepared_data[ds_name]['test']
        n_series = prepared_data[ds_name]['n_series']
        
        print(f"\n{'='*60}")
        print(f"NF - {ds_name}")
        print(f"{'='*60}")
        print(f"Train: {len(nf_train)}, Test: {len(nf_test)}, Series: {n_series}")
        print(f"NaN check - train: {nf_train['y'].isna().sum()}, test: {nf_test['y'].isna().sum()}")
        
        # Store in splits for compatibility
        splits[ds_name]['nf_train'] = nf_train
        splits[ds_name]['nf_test'] = nf_test
        splits[ds_name]['n_series'] = n_series

In [None]:
def get_nf_models(hist_exog):
    common = dict(
        h=72, input_size=INPUT_SIZE, max_steps=MAX_STEPS,
        early_stop_patience_steps=EARLY_STOP_PATIENCE,
        learning_rate=LEARNING_RATE, batch_size=BATCH_SIZE,
        scaler_type='robust', random_seed=42,
        accelerator=accelerator,
    )
    return [
        ('NBEATSx', NBEATSx(hist_exog_list=hist_exog, stack_types=['identity','trend','seasonality'], **common)),
        ('NHITS', NHITS(hist_exog_list=hist_exog, **common)),
        ('TFT', TFT(hist_exog_list=hist_exog, hidden_size=64, **common)),
        ('TiDE', TiDE(hist_exog_list=hist_exog, **common)),
        ('BiTCN', BiTCN(hist_exog_list=hist_exog, **common)),
    ]

if HAS_NF:
    print(f"NF models: NBEATSx, NHITS, TFT, TiDE, BiTCN")
    print(f"hist_exog: {len(ALL_FEATURES)} features")

In [None]:
if RUN_NEURALFORECAST and HAS_NF:
    for ds_name in ['Full24h', 'Night0']:
        if 'nf_train' not in splits[ds_name]:
            continue
            
        nf_train = splits[ds_name]['nf_train']
        nf_test = splits[ds_name]['nf_test']
        
        min_len = nf_train.groupby('unique_id').size().min()
        val_size = max(24, min(min_len // 5, 168))
        
        print(f"\n{'='*60}")
        print(f"NF - {ds_name} (val_size={val_size})")
        print(f"{'='*60}")
        
        for model_name, model in get_nf_models(ALL_FEATURES):
            print(f"\n  {model_name}...")
            try:
                t0 = time.time()
                nf = NeuralForecast(models=[model], freq='h')
                nf.fit(df=nf_train, val_size=val_size)
                elapsed = time.time() - t0
                
                preds = nf.predict(df=nf_train).reset_index()
                pred_col = [c for c in preds.columns if c not in ['unique_id','ds']][0]
                
                merged = nf_test.merge(preds[['unique_id','ds',pred_col]], on=['unique_id','ds'], how='inner')
                print(f"    Matched: {len(merged)}/{len(nf_test)}")
                
                if len(merged) == 0:
                    raise ValueError("No predictions matched")
                
                y_true = merged['y'].values
                y_pred = np.clip(merged[pred_col].values, 0, None)
                
                m = calc_metrics(y_true, y_pred, y_true.max())
                
                eval_df = merged.copy()
                eval_df['beach'] = eval_df['unique_id']
                eval_df['count'] = eval_df['y']
                beach_df = eval_per_beach(eval_df, y_pred, 'beach')
                beach_df['model'] = model_name
                beach_df['dataset'] = ds_name
                all_beach_results.append(beach_df)
                
                avg_rel = beach_df['RelMAE'].mean() if len(beach_df) > 0 else np.nan
                all_results.append({
                    'Model': model_name, 'Dataset': ds_name, 'Type': 'NeuralForecast',
                    'MAE': m['MAE'], 'RMSE': m['RMSE'], 'R2': m['R2'],
                    'AvgRelMAE': avg_rel, 'Time': elapsed
                })
                print(f"    {elapsed:.1f}s | MAE={m['MAE']:.1f} | RelMAE={avg_rel:.1f}% | R2={m['R2']:.3f}")
                
            except Exception as e:
                print(f"    ERROR: {e}")
                all_results.append({
                    'Model': model_name, 'Dataset': ds_name, 'Type': 'NeuralForecast',
                    'MAE': np.nan, 'RMSE': np.nan, 'R2': np.nan,
                    'AvgRelMAE': np.nan, 'Time': np.nan
                })

## Results

In [None]:
results_df = pd.DataFrame(all_results)
beach_df = pd.concat(all_beach_results, ignore_index=True) if all_beach_results else pd.DataFrame()

save_dir = Path(SAVE_DIR)
save_dir.mkdir(parents=True, exist_ok=True)
results_df.to_csv(save_dir / 'results.csv', index=False)
if len(beach_df) > 0:
    beach_df.to_csv(save_dir / 'beach_results.csv', index=False)

print("\n" + "="*70)
print("RESULTS BY DATASET")
print("="*70)
for ds in datasets.keys():
    sub = results_df[results_df['Dataset'] == ds].sort_values('AvgRelMAE')
    if len(sub) == 0:
        continue
    print(f"\n{ds}:")
    print(sub[['Model','Type','MAE','R2','AvgRelMAE','Time']].to_string(index=False))

In [None]:
pivot = results_df.pivot_table(index='Model', columns='Dataset', values='AvgRelMAE')
print("\nRelMAE (%) by Model x Dataset:")
print(pivot.round(1).to_string())

In [None]:
if len(results_df) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # RelMAE bar chart
    pivot = results_df.pivot_table(index='Model', columns='Dataset', values='AvgRelMAE')
    pivot = pivot.loc[pivot.mean(axis=1).sort_values().index]
    pivot.plot(kind='bar', ax=axes[0], width=0.8)
    axes[0].set_ylabel('Avg RelMAE (%)')
    axes[0].set_title('Model Performance (lower is better)')
    axes[0].legend(title='Dataset')
    axes[0].tick_params(axis='x', rotation=45)
    
    # R2 bar chart
    pivot_r2 = results_df.pivot_table(index='Model', columns='Dataset', values='R2')
    pivot_r2 = pivot_r2.loc[pivot_r2.mean(axis=1).sort_values(ascending=False).index]
    pivot_r2.plot(kind='bar', ax=axes[1], width=0.8)
    axes[1].set_ylabel('R²')
    axes[1].set_title('R² Score (higher is better)')
    axes[1].legend(title='Dataset')
    axes[1].tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.savefig(save_dir / 'comparison.png', dpi=150)
    plt.show()

In [None]:
print("\n" + "="*70)
print("SUMMARY")
print("="*70)
for ds in datasets.keys():
    sub = results_df[results_df['Dataset'] == ds].dropna()
    if len(sub) == 0:
        continue
    best = sub.loc[sub['AvgRelMAE'].idxmin()]
    print(f"\n{ds}: Best = {best['Model']} ({best['Type']}) - RelMAE={best['AvgRelMAE']:.1f}%, R2={best['R2']:.3f}")