In [None]:
from pathlib import Path

import pandas as pd
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import torch
from chronos import Chronos2Pipeline


# Location of the parquet data on disk
# (pyarrow's ds.dataset accepts either a single file or a directory of files)
parquet_dir = Path("tester.parquet")

# Open the data through the pyarrow dataset API, then load the whole table
# into a single in-memory pandas DataFrame.
dataset = ds.dataset(source=parquet_dir, format="parquet")
df = dataset.to_table().to_pandas()

# Preview the first rows of the raw frame
df.head()

Unnamed: 0,sensor_id,sensor_type,location,lat,lon,timestamp,P1,durP1,ratioP1,P2,durP2,ratioP2,timestamp_rounded
0,85556,SDS011,76002,52.5258996,13.3077671,2025-11-08T00:00:03,30.38,,,18.78,,,2025-11-08
1,85556,SDS011,76002,52.5258996,13.3077671,2025-11-08T00:00:10,29.85,,,18.8,,,2025-11-08
2,85556,SDS011,76002,52.5258996,13.3077671,2025-11-08T00:00:16,28.58,,,18.74,,,2025-11-08
3,85556,SDS011,76002,52.5258996,13.3077671,2025-11-08T00:00:22,28.58,,,19.05,,,2025-11-08
4,85556,SDS011,76002,52.5258996,13.3077671,2025-11-08T00:00:28,28.27,,,19.48,,,2025-11-08


In [None]:
# ----------------------------
# Parse raw columns
# ----------------------------
# Timestamps are parsed to datetime; P2 entries that cannot be parsed as
# numbers become NaN and are therefore ignored by the mean below.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['P2'] = pd.to_numeric(df['P2'], errors='coerce')

# Calendar day of each reading (timestamp floored to midnight), used as the
# daily aggregation key.
df['date'] = df['timestamp'].dt.floor('D')

# ----------------------------
# Aggregate P2 per sensor and day
# ----------------------------
# Group on (sensor_id, date) only, so that every sensor has exactly one row
# per day. Grouping on lat/lon as well produces several rows for the same
# (sensor_id, date) whenever a sensor reports more than one coordinate pair
# within a day; since item_id is the sensor_id, that gives duplicate
# timestamps per series and a later pivot on (timestamp, item_id) raises.
# Rows without coordinates are dropped explicitly here; a groupby keyed on
# lat/lon discards them implicitly (NaN group keys are dropped by default).
daily_avg = (
    df.dropna(subset=['lat', 'lon'])
      .groupby(['sensor_id', 'date'], as_index=False)
      .agg(
          lat=('lat', 'first'),
          lon=('lon', 'first'),
          target=('P2', 'mean'),  # Chronos expects 'target'
      )
      # Keep the column layout and row ordering of a lat/lon/date/sensor_id groupby
      .loc[:, ['lat', 'lon', 'date', 'sensor_id', 'target']]
      .sort_values(['lat', 'lon', 'date', 'sensor_id'])
)

# Drop any rows where aggregation produced NaN (if all values were NaN that day)
daily_avg = daily_avg.dropna(subset=['target']).reset_index(drop=True)

# Quick look at the aggregated frame
print(daily_avg.head())

# Use sensor_id as item_id
daily_avg['item_id'] = daily_avg['sensor_id']

# Build mapping for kriging: item_id -> (lat, lon), taken from the first row
# of each sensor.
# NOTE(review): the coordinates keep the dtype of the raw lat/lon columns; if
# those are strings, convert to float before any numeric use - confirm.
location_dict = daily_avg.groupby('item_id')[['lat','lon']].first().apply(tuple, axis=1).to_dict()

# Keep only relevant columns -- CHRONOS DF (long format: timestamp, item_id, target)
daily_df = daily_avg[['date', 'item_id', 'target']].rename(columns={'date':'timestamp'})

print(daily_df.columns)



           lat          lon       date sensor_id    target
0  52.34112525  13.40416440 2025-11-06     88152  3.073469
1  52.34112525  13.40416440 2025-11-07     88152  5.041851
2  52.34112525  13.40416440 2025-11-08     88152  7.573447
3  52.34112525  13.40416440 2025-11-09     88152  6.755453
4  52.34112525  13.40416440 2025-11-10     88152  5.223844
Index(['timestamp', 'item_id', 'target'], dtype='object')
{'10162': ('52.556', '13.384'), '11957': ('52.576', '13.348'), '12171': ('52.418', '13.668'), '12603': ('52.508', '13.284'), '12762': ('52.490', '13.378'), '13090': ('52.604', '13.438'), '13197': ('52.514', '13.440'), '13366': ('52.612', '13.630'), '13368': ('52.520', '13.450'), '13588': ('52.552', '13.434'), '13733': ('52.560', '13.508'), '1376': ('52.502', '13.488'), '13834': ('52.614', '13.500'), '1412': ('52.526', '13.408'), '14681': ('52.498', '13.480'), '15293': ('52.534', '13.436'), '15317': ('52.528', '13.346'), '15536': ('52.448', '13.270'), '15563': ('52.554', '13.364'), 

In [None]:
# ----------------------------
# Config
# ----------------------------
FORECAST_DAYS = 3              # forecast horizon, in daily steps
QUANTILES = [0.1, 0.5, 0.9]    # quantile levels requested from the model
MIN_OBSERVATIONS = 10          # minimum number of non-missing daily values per sensor
# Use the GPU when one is available; otherwise fall back to CPU instead of
# failing at model load time on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Ensure correct columns for Chronos (no-op when daily_df already uses
# 'item_id' / 'timestamp')
daily_df = daily_df.rename(columns={"sensor_id":"item_id", "date":"timestamp"})
daily_df['timestamp'] = pd.to_datetime(daily_df['timestamp'])

# ----------------------------
# Pivot to wide format
# ----------------------------
# One column per sensor, one row per day. asfreq("D") inserts an all-NaN row
# for every calendar day missing from the index, so all sensors share the same
# regular daily grid. pivot raises if (timestamp, item_id) pairs are not unique.
df_wide = daily_df.pivot(index="timestamp", columns="item_id", values="target").asfreq("D")

# Filter sensors with enough data
sensor_counts = df_wide.notna().sum()
valid_sensors = sensor_counts[sensor_counts >= MIN_OBSERVATIONS].index
if valid_sensors.empty:
    # Fail early with a clear message rather than passing an empty frame on
    raise ValueError(
        f"No sensor has at least {MIN_OBSERVATIONS} daily observations; nothing to forecast."
    )
df_wide = df_wide[valid_sensors]

# ----------------------------
# Convert back to long format for Chronos
# ----------------------------
# NOTE(review): days without data stay in train_long as NaN targets -
# presumably the model treats them as missing values; confirm.
train_long = df_wide.reset_index().melt(id_vars="timestamp", var_name="item_id", value_name="target")

# ----------------------------
# Load Chronos model
# ----------------------------
pipeline = Chronos2Pipeline.from_pretrained("amazon/chronos-2", device_map=DEVICE)

# ----------------------------
# Predict
# ----------------------------
pred_df = pipeline.predict_df(
    train_long,
    prediction_length=FORECAST_DAYS,
    quantile_levels=QUANTILES,
    id_column="item_id",
    timestamp_column="timestamp",
    target="target"
)
