# 00. Data Setup

This notebook generates the synthetic training data for Chronos-2, saves it to disk, and uploads it to the Hugging Face Hub. This ensures that the training process uses a fixed, reproducible dataset.

In [None]:
!git clone https://github.com/emanueleromito/voyagers-forecasting.git
%cd voyagers-forecasting

# Install package
!pip install -e .
!pip install huggingface_hub

In [None]:
from google.colab import userdata
import sys
import os
import torch
import numpy as np
from huggingface_hub import login, HfApi
from tqdm.auto import tqdm

# Put the "src" directory (resolved against the current working directory) at
# the end of the import search path; presumably this is where the chronos2
# package imported below lives.
sys.path.append(os.path.join(os.getcwd(), "src"))

from chronos2.data.generation.univariate import (
    KernelSynthGenerator,
    ARGenerator,
    TrendSeasonalityGenerator,
)
from chronos2.data.generation.tasks import TaskSampler

## Configuration

In [None]:
# Hugging Face Hub Configuration
HF_REPO_ID = "voyagersnlppolito/model-data"
# Read the token through google.colab's userdata instead of hardcoding it here.
HF_TOKEN = userdata.get('HF_TOKEN')

# Data Generation Configuration
# NOTE(review): the generation and save cells read these four names, but the
# notebook never defined them, so a fresh Restart & Run All stopped with a
# NameError. SEED, NUM_SAMPLES and DATA_LENGTH are placeholder values --
# TODO confirm them against the intended training setup.
SEED = 42  # base seed; sample i is generated with random_state=SEED + i
NUM_SAMPLES = 1_000  # number of synthetic tasks to generate
DATA_LENGTH = 512  # `length` passed to task_sampler.sample for every task
OUTPUT_PATH = "synthetic_dataset.pt"  # local file; same name as used in the Hub repo

## Initialize Generators

In [None]:
# Pool of base generators handed to the task sampler. The two autoregressive
# generators (order 1 and order 2) are built in place so the list order stays:
# kernel-synth, AR(1), AR(2), trend/seasonality.
base_generators = [
    KernelSynthGenerator(),
    *(ARGenerator(order=ar_order) for ar_order in (1, 2)),
    TrendSeasonalityGenerator(),
]

# Probabilities passed to the sampler for each task type (0.4 + 0.3 + 0.3).
task_sampler = TaskSampler(
    univariate_prob=0.4,
    multivariate_prob=0.3,
    covariate_prob=0.3,
    base_generators=base_generators,
)

## Generate Data

In [None]:
print(f"Generating {NUM_SAMPLES} synthetic tasks...")

# Each task gets its own seed (SEED + position in the dataset), so the seed
# passed for a given sample does not depend on any other sample.
dataset = [
    task_sampler.sample(length=DATA_LENGTH, random_state=SEED + sample_idx)
    for sample_idx in tqdm(range(NUM_SAMPLES))
]

print("Generation complete.")

## Save Data

In [None]:
# Write the generated tasks to a local file with torch.save.
print(f"Saving dataset to {OUTPUT_PATH}...")
torch.save(dataset, OUTPUT_PATH)
print("Done.")

# Upload to Hugging Face Hub
print(f"Uploading to {HF_REPO_ID}...")
# Log in only when a token was provided; otherwise the calls below run
# without an explicit login.
if HF_TOKEN:
    login(token=HF_TOKEN)

# Both Hub calls address the same dataset repository.
hub_target = {"repo_id": HF_REPO_ID, "repo_type": "dataset"}

api = HfApi()
api.create_repo(exist_ok=True, **hub_target)
api.upload_file(
    path_or_fileobj=OUTPUT_PATH,
    path_in_repo="synthetic_dataset.pt",
    **hub_target,
)
print("Upload complete.")
