## Spatiotemporal Forecasting + Interpolation w/ GPR

Using all sensor observations collected so far, predict P1 and P2 on a regular grid over Berlin at a future timestamp with a spatiotemporal Gaussian Process Regression (GPR) model.

In [1]:
import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

# Bounding box used throughout the notebook, as min/max latitude and
# longitude bounds.
BBOX = dict(
    lat_min=52.3383,
    lat_max=52.6755,
    lon_min=13.0884,
    lon_max=13.7612,
)

In [None]:
# Load the raw sensor readings from the Parquet file into a DataFrame.
parquet_path = "../data/2024-citsci-pollutants-hourly.parquet"
df = pd.read_parquet(parquet_path)

# Preview the first rows as the cell output.
df.head()

: 

In [None]:


# Parse the raw timestamp column into pandas datetimes.
df["timestamp"] = pd.to_datetime(df["timestamp"])

# Bucket key for the hourly aggregation: each reading is assigned to its
# nearest full hour.
df["timestamp_hour"] = df["timestamp"].dt.round("1h")

# Keep only readings inside the Berlin bounding box. BBOX is the constant
# defined in the configuration cell at the top of the notebook; it is
# deliberately NOT redefined here, so the filter and every later cell that
# reads BBOX agree on a single set of bounds.
# `between` is inclusive on both ends.
in_berlin = (
    df["lat"].between(BBOX["lat_min"], BBOX["lat_max"])
    & df["lon"].between(BBOX["lon_min"], BBOX["lon_max"])
)

# .copy() detaches the filtered frame from the original so later column
# assignments do not trigger SettingWithCopyWarning.
df = df[in_berlin].copy()

In [None]:
# Show the first five rows of the current frame.
df.head(5)

timestamp,P1,P2,lat,lon,sensor_id
datetime[μs],f64,f64,f64,f64,i64
2024-03-28 00:01:00,14.9,10.93,52.508,13.422,9392
2024-03-28 00:04:00,16.17,12.1,52.508,13.422,9392
2024-03-28 00:07:00,13.1,12.1,52.508,13.422,9392
2024-03-28 00:10:00,12.8,11.63,52.508,13.422,9392
2024-03-28 00:13:00,12.57,11.7,52.508,13.422,9392


In [None]:


# Collapse the readings to one row per (sensor, hour): pollutant values are
# averaged, and the first lat/lon seen in each group is kept.
per_sensor_hour = df.groupby(["sensor_id", "timestamp_hour"])
df_hourly = per_sensor_hour.agg(
    P1=("P1", "mean"),
    P2=("P2", "mean"),
    lat=("lat", "first"),
    lon=("lon", "first"),
).reset_index()

# Earliest hourly bucket; serves as the time origin.
start_time = df_hourly["timestamp_hour"].min()

# Time feature: minutes elapsed between the origin and each row's hour bucket.
elapsed = df_hourly["timestamp_hour"] - start_time
df_hourly["minutes_since_start"] = elapsed.dt.total_seconds() / 60

: 

In [None]:
# Design matrix: one row per (sensor, hour) with columns
# [lat, lon, minutes_since_start].
feature_columns = ["lat", "lon", "minutes_since_start"]
X = df_hourly.loc[:, feature_columns].values

# One target vector per pollutant column (P1 and P2).
y1, y2 = (df_hourly[target].values for target in ("P1", "P2"))

# Print the array shapes, then display a preview of the hourly frame.
print(X.shape, y1.shape, y2.shape)
df_hourly.head()