# 00. Data Setup

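This notebook generates the synthetic training data for Chronos-2 with KernelSynth, saves it to disk as an Arrow file, and uploads it to the Hugging Face Hub. Generating the data once, with fixed seeds, ensures that the training process uses a fixed, reproducible dataset.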

In [None]:
!git clone https://github.com/emanueleromito/voyagers-forecasting.git
%cd voyagers-forecasting

# Install package
!pip install -e .
!pip install huggingface_hub gluonts scikit-learn

In [None]:
from google.colab import userdata
import sys
import os
import torch
import numpy as np
from joblib import Parallel, delayed
import functools
from typing import Optional
from pathlib import Path
from huggingface_hub import login, HfApi
from tqdm.auto import tqdm
from gluonts.dataset.arrow import ArrowWriter
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (
    RBF,
    ConstantKernel,
    DotProduct,
    ExpSineSquared,
    Kernel,
    RationalQuadratic,
    WhiteKernel,
)

# Make the repository's `src` directory importable. The membership check keeps
# the cell idempotent: re-running it does not pile up duplicate sys.path entries.
src_dir = os.path.abspath("src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

## Configuration

In [None]:
# Data Generation Configuration
SEED = 42            # Base seed; series i is generated with seed SEED + i
NUM_SAMPLES = 1000   # Reduced for demo purposes, increase for real training
DATA_LENGTH = 1024   # Length of each time series
OUTPUT_PATH = "synthetic_dataset.arrow"
MAX_KERNELS = 5      # Upper bound on the number of kernels combined per series

# Hugging Face Hub Configuration
HF_REPO_ID = "voyagersnlppolito/model-data"
# `userdata.get` raises (rather than returning None) when the Colab secret is
# missing or the notebook has not been granted access to it. Fall back to the
# HF_TOKEN environment variable (None when unset) so that downstream
# `if HF_TOKEN:` checks behave as intended.
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    HF_TOKEN = os.environ.get("HF_TOKEN")

## KernelSynth Logic

The following code implements the KernelSynth logic for generating synthetic time series using Gaussian Processes with random kernels.

In [None]:
# Series length; every periodicity below is a step count divided by LENGTH.
LENGTH = DATA_LENGTH

# (period in steps, tag) for each periodic kernel, in bank order. The tags are
# carried over from the original inline comments; presumably they name the
# sampling frequency each period targets -- TODO confirm. Repeated periods are
# intentional entries: each one is a separate element of the bank.
_PERIODIC_STEPS = [
    (24, "H"),
    (48, "0.5H"),
    (96, "0.25H"),
    (24 * 7, "H"),
    (48 * 7, "0.5H"),
    (96 * 7, "0.25H"),
    (7, "D"),
    (14, "0.5D"),
    (30, "D"),
    (60, "0.5D"),
    (365, "D"),
    (365 * 2, "0.5D"),
    (4, "W"),
    (26, "W"),
    (52, "W"),
    (4, "M"),
    (6, "M"),
    (12, "M"),
    (4, "Q"),
    (4 * 10, "Q"),
    (10, "Y"),
]

# The pool of base kernels that series generation samples from. Order matters
# for seeded reproducibility, so the groups are concatenated in a fixed sequence.
KERNEL_BANK = [
    # Periodic kernels
    *(ExpSineSquared(periodicity=steps / LENGTH) for steps, _tag in _PERIODIC_STEPS),
    # Dot-product kernels
    *(DotProduct(sigma_0=offset) for offset in (0.0, 1.0, 10.0)),
    # RBF kernels
    *(RBF(length_scale=scale) for scale in (0.1, 1.0, 10.0)),
    # Rational-quadratic kernels
    *(RationalQuadratic(alpha=mix) for mix in (0.1, 1.0, 10.0)),
    # White-noise kernels
    *(WhiteKernel(noise_level=level) for level in (0.1, 1.0)),
    # Constant kernel
    ConstantKernel(),
]

def random_binary_map(a: Kernel, b: Kernel):
    """
    Combine kernels ``a`` and ``b`` with either ``+`` or ``*``.

    The operator is picked uniformly at random with ``np.random.choice``,
    so the outcome depends on NumPy's global random state.
    """

    def _sum(left, right):
        return left + right

    def _product(left, right):
        return left * right

    combine = np.random.choice([_sum, _product])
    return combine(a, b)

def sample_from_gp_prior(
    kernel: Kernel, X: np.ndarray, random_seed: Optional[int] = None
):
    """
    Sample a single realisation from the GP prior defined by ``kernel``.

    ``X`` holds the input locations; a 1-D array is reshaped into a single
    column before sampling. ``random_seed`` is forwarded to ``sample_y`` as
    its ``random_state``. The return value is whatever ``sample_y`` yields
    for ``n_samples=1``.
    """
    # Promote a flat vector of inputs to shape (n, 1); anything else must
    # already be two-dimensional.
    inputs = X[:, None] if X.ndim == 1 else X
    assert inputs.ndim == 2

    prior = GaussianProcessRegressor(kernel=kernel)
    return prior.sample_y(inputs, n_samples=1, random_state=random_seed)

def generate_time_series(max_kernels: int = 5, seed: Optional[int] = None):
    """Generate a synthetic time series from KernelSynth.

    Between 1 and ``max_kernels`` kernels are drawn (with replacement) from
    ``KERNEL_BANK``, folded into one kernel with random ``+`` / ``*``
    operators, and a series of ``LENGTH`` points is sampled from the
    resulting GP prior. If sampling raises ``np.linalg.LinAlgError`` a fresh
    kernel combination is drawn and sampling is retried.

    Args:
        max_kernels: Maximum number of kernels to combine.
        seed: When given, NumPy's *global* random state is reseeded with it
            and it is also used as the seed for the GP sample. When ``None``
            the current global random state is used.

    Returns:
        A dict with ``"start"`` (an arbitrary ``np.datetime64`` timestamp) and
        ``"target"`` (the sampled series with singleton dimensions squeezed).
    """
    if seed is not None:
        np.random.seed(seed)

    # The input grid does not depend on the sampled kernel, so build it once
    # instead of on every retry.
    X = np.linspace(0, 1, LENGTH)

    while True:
        # Randomly select up to max_kernels kernels from the KERNEL_BANK
        selected_kernels = np.random.choice(
            KERNEL_BANK, np.random.randint(1, max_kernels + 1), replace=True
        )

        # Combine the sampled kernels using random binary operators
        kernel = functools.reduce(random_binary_map, selected_kernels)

        # Sample a time series from the GP prior
        try:
            ts = sample_from_gp_prior(kernel=kernel, X=X, random_seed=seed)
        except np.linalg.LinAlgError:
            # Sampling failed for this kernel combination; draw a new one.
            continue

        # The timestamp is arbitrary
        return {"start": np.datetime64("2000-01-01 00:00", "s"), "target": ts.squeeze()}

## Generate Data

In [None]:
print(f"Generating {NUM_SAMPLES} synthetic tasks...")

# Parallel generation using joblib. Each task gets its own seed (SEED + i), so
# the dataset does not depend on how tasks are scheduled across workers.
# NOTE(review): tqdm wraps the task generator, so the bar likely tracks task
# dispatch rather than task completion -- confirm if precise progress matters.
dataset = Parallel(n_jobs=-1)(
    delayed(generate_time_series)(max_kernels=MAX_KERNELS, seed=SEED + i)
    for i in tqdm(range(NUM_SAMPLES))
)

# Sanity check: one generated series per requested sample.
assert len(dataset) == NUM_SAMPLES, "unexpected number of generated series"

print("Generation complete.")

## Save and Upload Data

In [None]:
# Write the generated series to a local, lz4-compressed Arrow file.
arrow_path = Path(OUTPUT_PATH)
print(f"Saving dataset to {OUTPUT_PATH}...")
writer = ArrowWriter(compression="lz4")
writer.write_to_file(dataset, path=arrow_path)
print("Done.")

# Upload to Hugging Face Hub
print(f"Uploading to {HF_REPO_ID}...")
if HF_TOKEN:
    # Log in explicitly only when a token was configured.
    login(token=HF_TOKEN)

api = HfApi()
# Both Hub calls target the same dataset repository.
dataset_repo = dict(repo_id=HF_REPO_ID, repo_type="dataset")
api.create_repo(exist_ok=True, **dataset_repo)
api.upload_file(
    path_or_fileobj=OUTPUT_PATH,
    path_in_repo="synthetic_dataset.arrow",
    **dataset_repo,
)
print("Upload complete.")
