Simple test-run of chronos2

In [1]:
import pandas as pd
from chronos import Chronos2Pipeline

In [11]:
# ----------------------------
# Read data
# ----------------------------
df = pd.read_parquet("/storage/bln-aq/data/2024-citsci-pollutants-hourly.parquet")

# Ensure datetime
df['timestamp_hour'] = pd.to_datetime(df['timestamp_hour'])

# Aggregate PM2.5 per sensor/location + hour
df = df.groupby(['lat', 'lon', 'timestamp_hour'], as_index=False)['PM2_5'].mean()

# Create a unique key for each sensor location
unique_coords = df.drop_duplicates(subset=["lat", "lon"]).reset_index(drop=True)
unique_coords["loc_id"] = range(1, len(unique_coords) + 1)
df = df.merge(unique_coords, on = ["lat", "lon"], how = "left")

In [13]:
df = (
    df.drop(columns=[c for c in df.columns if c.endswith("_y")])
      .rename(columns={c: c[:-2] for c in df.columns if c.endswith("_x")})
)

In [15]:
# Define cutoff
cutoff = pd.Timestamp("2024-12-17 00:00:00")

# Split per loc_id
train_parts = []
test_parts = []

for loc, group in df.groupby("loc_id"):
    group = group.sort_values("timestamp_hour")
    train_parts.append(group[group["timestamp_hour"] < cutoff])
    test_parts.append(group[group["timestamp_hour"] >= cutoff])

train_df = pd.concat(train_parts).reset_index(drop=True)
test_df = pd.concat(test_parts).reset_index(drop=True)

print(f"Train: {len(train_df)} rows, Test: {len(test_df)} rows")

Train: 1428189 rows, Test: 60828 rows


In [19]:
train_df.head()


Unnamed: 0,lat,lon,timestamp_hour,PM2_5,loc_id
0,52.341125,13.404164,2024-06-06 09:00:00,5.2475,1
1,52.341125,13.404164,2024-06-06 10:00:00,5.20375,1
2,52.341125,13.404164,2024-06-06 11:00:00,4.3536,1
3,52.341125,13.404164,2024-06-06 12:00:00,4.639167,1
4,52.341125,13.404164,2024-06-06 13:00:00,4.7144,1


In [3]:
pipeline = Chronos2Pipeline.from_pretrained("amazon/chronos-2", device_map="cuda")

In [17]:
# Generate predictions with covariates
pred_df = pipeline.predict_df(
    train_df,
    prediction_length=24,  # Number of steps to forecast
    quantile_levels=[0.1, 0.5, 0.9],  # Quantiles for probabilistic forecast
    id_column="loc_id",  # Column identifying different time series
    timestamp_column="timestamp_hour",  # Column with datetime information
    target="PM2_5",  # Column(s) with time series values to predict
)

ValueError: Could not infer frequency for series 1