In [1]:
# ===========================================================
# ONE-CELL FULL INSTALL: PyTorch 2.5.1 + PyG 2.6.1 (Kaggle GPU)
# ===========================================================

print("Installing PyTorch 2.5.1 + cu121 ...")
!pip install -q torch==2.5.1+cu121 torchvision==0.20.1+cu121 torchaudio==2.5.1+cu121 \
    --index-url https://download.pytorch.org/whl/cu121

print("Installing PyTorch Geometric 2.6.1 ...")
!pip install -q pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric \
    -f https://data.pyg.org/whl/torch-2.5.1+cu121.html

print("Testing imports...")
import torch
import torch_geometric

print("\n====================================")
print("Torch version:", torch.__version__)
print("PyG version:", torch_geometric.__version__)
print("CUDA available:", torch.cuda.is_available())
print("====================================")
print("INSTALLATION COMPLETE. DO NOT RESTART.")


Installing PyTorch 2.5.1 + cu121 ...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.5/780.5 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling PyTorch Geometric 2.6.1 ...
Testing imports...


  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing



Torch version: 2.5.1+cu121
PyG version: 2.7.0
CUDA available: True
INSTALLATION COMPLETE. DO NOT RESTART.


In [2]:
# === CELL 1: CONFIG, PATHS, DEPENDENCY SETUP ===
INPATH = '/kaggle/input/ada-gnn2/'
OUTPATH = '/kaggle/working/'
TIME_BIN = '15T'
K_NEIGHBORS = 6
L = 4                 # history length (settable)
HORIZON = 1
TMP_H = 64            # temporal conv output channels
GCN_H = 128           # graph conv hidden size
BATCHING = False      # per-time full-graph training: keep False (simpler)
DEVICE = 'cuda' if __import__('torch').cuda.is_available() else 'cpu'
print("INPATH:", INPATH, "OUTPATH:", OUTPATH, "DEVICE:", DEVICE)
# Try imports
import os, math, sys, time
import numpy as np, pandas as pd
import torch
try:
    import h3
except Exception as e:
    print("h3 import failed:", e)
try:
    import torch_geometric
    from torch_geometric.data import Data
    from torch_geometric.nn import SAGEConv
except Exception as e:
    print("torch_geometric import error (best-effort):", e)
    # We will try to continue; Kaggle often already has PyG installed.
print("Base libraries imported.")


INPATH: /kaggle/input/ada-gnn2/ OUTPATH: /kaggle/working/ DEVICE: cuda
h3 import failed: No module named 'h3'
Base libraries imported.


In [3]:
# === CELL 2: UTILITIES (robust datetime parser, zone repair) ===
from dateutil import parser as dparser
import re

def robust_parse_datetime(s):
    if pd.isna(s): return pd.NaT
    s = str(s).strip().replace("\uFEFF","").replace("\xa0"," ").strip('"').strip("'")
    fmts = ["%d-%m-%Y %H:%M","%d-%m-%Y %H:%M:%S","%Y-%m-%d %H:%M","%Y-%m-%d %H:%M:%S"]
    for f in fmts:
        try:
            return pd.to_datetime(s, format=f)
        except:
            pass
    try:
        return pd.to_datetime(s, dayfirst=True)
    except:
        pass
    try:
        return dparser.parse(s, dayfirst=True)
    except:
        return pd.NaT

def floor_to_bin(ts, freq=TIME_BIN):
    if pd.isna(ts): return pd.NaT
    return pd.to_datetime(ts).floor(freq)

def repair_zone_token(tok):
    if tok is None: return tok
    s = str(tok).strip()
    if s.endswith(".0"): s = s[:-2]
    s = s.strip('"').strip("'")
    return s

def is_valid_h3(h):
    if h is None: return False
    s = str(h).strip().lower()
    if not re.fullmatch(r'[0-9a-f]+', s): return False
    try:
        h3.h3_to_geo(s); return True
    except: return False

print("Utils ready.")


Utils ready.


In [4]:
# === CELL 3: READ CSVs (robustly, all-as-string initial read) ===
agg_path = os.path.join(INPATH, "aggregated_zone_data.csv")
lgb_path = os.path.join(INPATH, "forecast_15min_predictions.csv")
raw_path = os.path.join(INPATH, "Data_set.csv")

if not os.path.exists(agg_path):
    raise FileNotFoundError(f"aggregated_zone_data.csv not found in {agg_path}")
agg_df = pd.read_csv(agg_path, dtype=str, low_memory=False)
print("Loaded aggregated:", agg_df.shape)
lgb_df = None
if os.path.exists(lgb_path):
    lgb_df = pd.read_csv(lgb_path, dtype=str, low_memory=False)
    print("Loaded LGB preds:", lgb_df.shape)
else:
    print("No LGB preds found.")
raw_df = None
if os.path.exists(raw_path):
    try:
        raw_df = pd.read_csv(raw_path, dtype=str, low_memory=False)
        print("Loaded raw Data_set:", raw_df.shape)
    except Exception as e:
        print("raw read failed:", e)
display(agg_df.head(3))


Loaded aggregated: (17332, 12)
Loaded LGB preds: (67, 3)
Loaded raw Data_set: (18000, 41)


Unnamed: 0,zone,Datetime,Completed Trips,Traffic Volume,Average Speed,Congestion Level,Temperature,Drivers' Earnings,Distance Travelled (km),event importance,Bookings,event type
0,8986d880d77ffff,2024-01-05 18:30:00,85.0,481.0,47.121,16.321,23.504,3519.747,513.369,0.0,112.0,No Event
1,8986d880d77ffff,2024-01-07 18:45:00,241.0,1324.398950135941,32.5,42.12779439509311,30.3,85825.04,1954.51,0.87,526.1780508958601,Local
2,8986d880d77ffff,2024-01-09 18:30:00,91.0,466.0,47.54,16.596,23.281,2065.448,502.273,0.049,106.0,No Event


In [5]:
# === CELL 4: CLEAN DATETIME and ZONE column ===
# Detect Datetime column
datetime_cols = [c for c in agg_df.columns if "date" in c.lower() or "time" in c.lower()]
dt_col = "Datetime" if "Datetime" in agg_df.columns else (datetime_cols[0] if datetime_cols else None)
if dt_col is None:
    raise ValueError("No datetime-like column found")
agg_df["Datetime_raw"] = agg_df[dt_col].astype(str)
agg_df["Datetime_parsed"] = agg_df["Datetime_raw"].apply(robust_parse_datetime)
if agg_df["Datetime_parsed"].notna().sum() == 0:
    print("First 50 datetime strings:", agg_df["Datetime_raw"].head(50).tolist())
    raise ValueError("Datetime parsing failed")
agg_df["Datetime"] = agg_df["Datetime_parsed"].apply(lambda x: floor_to_bin(x, TIME_BIN))
# Zone column
zone_candidates = [c for c in agg_df.columns if c.lower() in ("zone","h3_index","area","zone_id","h3")]
zone_col = zone_candidates[0] if zone_candidates else agg_df.columns[0]
agg_df[zone_col] = agg_df[zone_col].astype(str).apply(repair_zone_token)
agg_df = agg_df.dropna(subset=[zone_col, "Datetime"])
agg_df[zone_col] = agg_df[zone_col].astype(str)
print("Using zone column:", zone_col, "rows after drop:", len(agg_df))


  return pd.to_datetime(ts).floor(freq)


Using zone column: zone rows after drop: 17332


In [6]:
# === CELL 5: FEATURE DETECTION & TARGET SETUP ===
# Normalize column names and detect target
agg_df.columns = [c.strip() for c in agg_df.columns]
target_col = None
for cand in ["Bookings","bookings","Completed Trips","completed trips"]:
    if cand in agg_df.columns:
        target_col = cand; break
if target_col is None:
    for c in agg_df.columns:
        if "book" in c.lower() or "trip" in c.lower():
            target_col = c; break
if target_col is None:
    raise ValueError("Could not detect Booking target column.")
# Convert target numeric
agg_df[target_col] = pd.to_numeric(agg_df[target_col].astype(str).str.replace(",",""), errors='coerce')
# Auto-detect numeric features (exclude id columns)
exclude = {zone_col, "Datetime", "Datetime_raw", "Datetime_parsed", target_col}
feature_cols = []
for c in agg_df.columns:
    if c in exclude: continue
    try:
        agg_df[c] = pd.to_numeric(agg_df[c].astype(str).str.replace(",","").replace(" ", ""), errors='coerce')
    except:
        pass
    if pd.api.types.is_numeric_dtype(agg_df[c]):
        feature_cols.append(c)
print("Target:", target_col)
print("Detected features:", feature_cols)


Target: Bookings
Detected features: ['Completed Trips', 'Traffic Volume', 'Average Speed', 'Congestion Level', 'Temperature', "Drivers' Earnings", 'Distance Travelled (km)', 'event importance', 'event type']


In [7]:
# === CELL 6: MERGE LGB PREDICTIONS SAFELY (optional) ===
if lgb_df is None:
    print("No LGB file to merge.")
else:
    lgb = lgb_df.copy()
    # detect zone/time/pred columns
    l_time = [c for c in lgb.columns if "time" in c.lower() or "next" in c.lower()]
    l_zone = [c for c in lgb.columns if "zone" in c.lower() or "h3" in c.lower()]
    l_pred = [c for c in lgb.columns if "pred" in c.lower() and "book" in c.lower()]
    if not l_time or not l_zone:
        print("Could not detect LGB columns; skipping merge.")
    else:
        time_col = l_time[0]
        zone_col_lgb = l_zone[0]
        pred_col = l_pred[0] if l_pred else lgb.columns[-1]
        lgb["Datetime"] = lgb[time_col].apply(robust_parse_datetime).apply(lambda x: floor_to_bin(x, TIME_BIN))
        lgb[zone_col_lgb] = lgb[zone_col_lgb].astype(str)
        lgb["pred_bookings"] = pd.to_numeric(lgb[pred_col].astype(str).str.replace(",",""), errors='coerce')
        merge_src = lgb[[zone_col_lgb, "Datetime", "pred_bookings"]].rename(columns={zone_col_lgb:zone_col})
        agg_df = agg_df.merge(merge_src, on=[zone_col, "Datetime"], how="left")
        if "pred_bookings" in agg_df.columns and "pred_bookings" not in feature_cols:
            feature_cols.append("pred_bookings")
        print("Merged LGB preds. New feature count:", len(feature_cols))
# Sort and impute simple missing values in features (per-zone)
agg_df = agg_df.sort_values([zone_col, "Datetime"])
existing_feature_cols = [c for c in feature_cols if c in agg_df.columns]
for c in existing_feature_cols:
    agg_df[c] = agg_df.groupby(zone_col)[c].transform(lambda x: x.interpolate().bfill().ffill())
# impute target
agg_df[target_col] = agg_df.groupby(zone_col)[target_col].transform(lambda x: x.interpolate().bfill().ffill())
print("After imputation, any NaN in target?", agg_df[target_col].isna().any())


Merged LGB preds. New feature count: 10


  return pd.to_datetime(ts).floor(freq)


After imputation, any NaN in target? False


In [8]:
# === CELL 7: BUILD KNN ADJACENCY FROM H3 CENTROIDS (robust) ===
from sklearn.neighbors import NearestNeighbors
import csv

# build candidate zones from agg_df (unique tokens)
candidate_zones = sorted(agg_df[zone_col].unique())
print("Candidate zones:", len(candidate_zones))
zone_centroids = {}
invalid = []
for z in candidate_zones:
    zt = str(z).strip().lower()
    if is_valid_h3(zt):
        try:
            lat, lon = h3.h3_to_geo(zt)
            zone_centroids[zt] = (lat, lon)
        except:
            invalid.append(zt)
    else:
        invalid.append(zt)
print("Valid centroids:", len(zone_centroids), "Invalid tokens:", len(invalid))
# fallback to raw_df lat/lon if available for invalid tokens
if len(zone_centroids) < 2 and raw_df is not None:
    print("Attempt fallback from raw_df")
    possible_zone_cols = [c for c in raw_df.columns if "h3" in c.lower() or "zone" in c.lower()]
    lat_cols = [c for c in raw_df.columns if "lat" in c.lower()]
    lon_cols = [c for c in raw_df.columns if "lon" in c.lower() or "lng" in c.lower()]
    if possible_zone_cols and lat_cols and lon_cols:
        rz = possible_zone_cols[0]; latc = lat_cols[0]; lonc = lon_cols[0]
        for token, grp in raw_df.groupby(rz):
            tok = str(token).strip()
            if tok in zone_centroids: continue
            try:
                lat = pd.to_numeric(grp[latc], errors='coerce').mean()
                lon = pd.to_numeric(grp[lonc], errors='coerce').mean()
                if not np.isnan(lat) and not np.isnan(lon):
                    zone_centroids[tok] = (lat, lon)
            except:
                pass
print("Total centroids:", len(zone_centroids))
if len(zone_centroids) < 2:
    raise ValueError("Not enough centroids to build adjacency.")
# zone_list (order) & coords
zone_list = sorted(zone_centroids.keys())
coords = np.array([zone_centroids[z] for z in zone_list])
# knn
k = min(K_NEIGHBORS, max(1, len(zone_list)-1))
n_neighbors = min(k+1, len(zone_list))
nn = NearestNeighbors(n_neighbors=n_neighbors)
nn.fit(coords)
dists, idxs = nn.kneighbors(coords)
edges = set()
for i, nbrs in enumerate(idxs):
    for j in nbrs[1:]:
        edges.add((i,j)); edges.add((j,i))
edges_list = sorted(edges)
# save edges csv mapping indices -> zone tokens
with open(os.path.join(OUTPATH, "graph_edges.csv"), "w", newline="") as f:
    w = csv.writer(f); w.writerow(["src_idx","dst_idx","src_zone","dst_zone"])
    for a,b in edges_list:
        w.writerow([a,b,zone_list[a],zone_list[b]])
print("Saved graph_edges.csv with", len(edges_list), "edges to", OUTPATH)


Candidate zones: 67
Valid centroids: 0 Invalid tokens: 67
Attempt fallback from raw_df
Total centroids: 67
Saved graph_edges.csv with 504 edges to /kaggle/working/


In [9]:
# === CELL 8: BUILD DENSE TENSORS (fast vectorized) X_tensor, Y_tensor ===
# Build timeline
all_times = pd.date_range(start=agg_df["Datetime"].min(), end=agg_df["Datetime"].max(), freq=TIME_BIN)
T = len(all_times); Z = len(zone_list); F = len(feature_cols)
print("Time bins:", T, "Zones:", Z, "Features:", F)
zone_to_idx = {z:i for i,z in enumerate(zone_list)}
time_to_idx = {t:i for i,t in enumerate(all_times)}
# prepare arrays
X_tensor = np.full((T, Z, F), np.nan, dtype=np.float32)
Y_tensor = np.full((T, Z), np.nan, dtype=np.float32)
# map feature columns in agg_df to feature_cols order; create copy of values as float
for _, row in agg_df.iterrows():
    z = str(row[zone_col]).strip().lower()
    dt = row["Datetime"]
    if z in zone_to_idx and pd.notna(dt):
        ti = time_to_idx.get(pd.Timestamp(dt), None)
        if ti is None: continue
        zi = zone_to_idx[z]
        vals = []
        for c in feature_cols:
            try:
                v = float(row[c]) if pd.notna(row[c]) else np.nan
            except:
                v = np.nan
            vals.append(v)
        X_tensor[ti, zi, :] = np.array(vals, dtype=np.float32)
        Y_tensor[ti, zi] = float(row[target_col]) if pd.notna(row[target_col]) else np.nan
print("Observed filled cells (feature0):", np.sum(~np.isnan(X_tensor[:,:,0])))
# Impute per-zone along time axis
for zi in range(Z):
    for fi in range(F):
        col = X_tensor[:, zi, fi]
        mask = ~np.isnan(col)
        if mask.sum() == 0:
            X_tensor[:, zi, fi] = 0.0
        elif mask.sum() == 1:
            X_tensor[:, zi, fi] = col[mask][0]
        else:
            idxs = np.where(mask)[0]; vals = col[mask]
            X_tensor[:, zi, fi] = np.interp(np.arange(T), idxs, vals)
    # target
    coly = Y_tensor[:, zi]; masky = ~np.isnan(coly)
    if masky.sum() == 0:
        Y_tensor[:, zi] = 0.0
    elif masky.sum() == 1:
        Y_tensor[:, zi] = coly[masky][0]
    else:
        idxs = np.where(masky)[0]; vals = coly[masky]
        Y_tensor[:, zi] = np.interp(np.arange(T), idxs, vals)
print("Imputation done. Any NaNs left?", np.isnan(X_tensor).any(), np.isnan(Y_tensor).any())
# quick save (optional)
np.save(os.path.join(OUTPATH,"X_tensor.npy"), X_tensor)
np.save(os.path.join(OUTPATH,"Y_tensor.npy"), Y_tensor)


Time bins: 70075 Zones: 67 Features: 10


  all_times = pd.date_range(start=agg_df["Datetime"].min(), end=agg_df["Datetime"].max(), freq=TIME_BIN)


Observed filled cells (feature0): 17332
Imputation done. Any NaNs left? False False


In [10]:
# === CELL 9: Build ST-GCN sliding sequences (N, Z, F, L) and splits ===
# sequences: for t in [L-1 .. T-1) we predict t+1
seq_X = []
seq_Y = []
time_from_list = []
time_to_list = []
for t in range(L-1, T-1):
    Xw = X_tensor[t-(L-1):t+1, :, :]    # (L, Z, F)
    Xw_tf = np.transpose(Xw, (1,2,0)).astype(np.float32)  # (Z, F, L)
    yw = Y_tensor[t+1, :].astype(np.float32)              # (Z,)
    seq_X.append(Xw_tf); seq_Y.append(yw)
    time_from_list.append(all_times[t]); time_to_list.append(all_times[t+1])
seq_X = np.stack(seq_X, axis=0); seq_Y = np.stack(seq_Y, axis=0)
N = seq_X.shape[0]
print("Built seqs N:", N, "seq_X shape:", seq_X.shape, "seq_Y shape:", seq_Y.shape)
# temporal split
n_train = int(0.7 * N); n_val = int(0.15 * N)
train_idx = np.arange(0, n_train); val_idx = np.arange(n_train, n_train+n_val); test_idx = np.arange(n_train+n_val, N)
print("Train/Val/Test sizes:", len(train_idx), len(val_idx), len(test_idx))
# Save small metadata for checks
np.save(os.path.join(OUTPATH,"zone_list.npy"), np.array(zone_list))
np.save(os.path.join(OUTPATH,"feature_cols.npy"), np.array(feature_cols))


Built seqs N: 70071 seq_X shape: (70071, 67, 10, 4) seq_Y shape: (70071, 67)
Train/Val/Test sizes: 49049 10510 10512


In [11]:
# === CELL 10: Build PyG Data list (seq in data.seq) ===
import torch
from torch_geometric.data import Data
edge_index = torch.tensor(edges_list, dtype=torch.long).t().contiguous()
def build_data_from_seq(x_np, y_np):
    data = Data()
    data.seq = torch.from_numpy(x_np).float()   # (Z, F, L)
    data.y = torch.from_numpy(y_np).float().unsqueeze(1)  # (Z,1)
    data.edge_index = edge_index
    data.mask = torch.ones(x_np.shape[0], dtype=torch.bool)
    return data

train_data = [build_data_from_seq(seq_X[i], seq_Y[i]) for i in train_idx]
val_data = [build_data_from_seq(seq_X[i], seq_Y[i]) for i in val_idx]
test_data = [build_data_from_seq(seq_X[i], seq_Y[i]) for i in test_idx]
print("Data counts -> train:", len(train_data), "val:", len(val_data), "test:", len(test_data))
print("Example seq shape:", train_data[0].seq.shape, "y shape:", train_data[0].y.shape)


Data counts -> train: 49049 val: 10510 test: 10512
Example seq shape: torch.Size([67, 10, 4]) y shape: torch.Size([67, 1])


In [12]:
# === CELL 11: ST-GCN model definition (temporal conv + SAGE graph conv) ===
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

F_count = len(feature_cols)
IN_CH = F_count
IN_L = L
TMP_H = TMP_H
GCN_H = GCN_H

class STGCN_simple(nn.Module):
    def __init__(self, in_ch, L_steps, tmp_out=64, gcn_hidden=128, num_gcn_layers=2, dropout=0.2):
        super().__init__()
        self.temp_conv = nn.Conv1d(in_ch, tmp_out, kernel_size=3, padding=1)  # input channels = features, conv along L
        self.gconvs = nn.ModuleList([SAGEConv(tmp_out, gcn_hidden)] + [SAGEConv(gcn_hidden, gcn_hidden) for _ in range(num_gcn_layers-1)])
        self.head = nn.Linear(gcn_hidden, 1)
        self.dropout = dropout
    def forward(self, data):
        # data.seq: (Z, F, L)
        seq = data.seq.to(next(self.parameters()).device)    # (Z, F, L)
        x = self.temp_conv(seq)    # (Z, tmp_out, L)
        x = F.relu(x)
        x = x.mean(dim=2)          # (Z, tmp_out)
        edge_index = data.edge_index.to(next(self.parameters()).device)
        for conv in self.gconvs:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        out = self.head(x)   # (Z,1)
        return out

model = STGCN_simple(in_ch=IN_CH, L_steps=IN_L, tmp_out=TMP_H, gcn_hidden=GCN_H).to(DEVICE)
print(model)


STGCN_simple(
  (temp_conv): Conv1d(10, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (gconvs): ModuleList(
    (0): SAGEConv(64, 128, aggr=mean)
    (1): SAGEConv(128, 128, aggr=mean)
  )
  (head): Linear(in_features=128, out_features=1, bias=True)
)


In [13]:
# === CELL 12: TRAIN ST-GCN (early stopping on val MAE) ===
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np, time

optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = torch.nn.MSELoss()
best_val_mae = float('inf'); best_state = None
patience = 8; patience_ct = 0
EPOCHS = 20

def eval_list(model, dlist):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for d in dlist:
            d = d.to(DEVICE)
            out = model(d).squeeze(1).cpu().numpy()
            true = d.y.squeeze(1).cpu().numpy()
            mask = d.mask.cpu().numpy()
            preds.extend(out[mask].tolist()); trues.extend(true[mask].tolist())
    if len(trues)==0: return {'mae':np.nan,'rmse':np.nan}
    return {'mae': mean_absolute_error(trues,preds), 'rmse': mean_squared_error(trues,preds,squared=False)}

print("Beginning training: train size", len(train_data))
for epoch in range(1, EPOCHS+1):
    t0 = time.time()
    model.train()
    losses = []
    for d in train_data:
        d = d.to(DEVICE)
        optimizer.zero_grad()
        out = model(d)
        loss = criterion(out[d.mask], d.y[d.mask])
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    valm = eval_list(model, val_data)
    avg_loss = np.mean(losses) if losses else np.nan
    print(f"Epoch {epoch:02d} loss={avg_loss:.4f} val_mae={valm['mae']:.4f} time={(time.time()-t0):.1f}s")
    if not np.isnan(valm['mae']) and valm['mae'] < best_val_mae:
        best_val_mae = valm['mae']; best_state = {k:v.cpu() for k,v in model.state_dict().items()}; patience_ct=0
        print(" New best val MAE:", best_val_mae)
    else:
        patience_ct += 1
        if patience_ct >= patience:
            print("Early stopping triggered."); break

if best_state is not None: model.load_state_dict(best_state)
torch.save(model.state_dict(), os.path.join(OUTPATH, "gnn_model_stgcn.pt"))
print("Saved model to", os.path.join(OUTPATH, "gnn_model_stgcn.pt"))


Beginning training: train size 49049
Epoch 01 loss=2521.4459 val_mae=64.7627 time=180.3s
 New best val MAE: 64.76268585821096
Epoch 02 loss=2084.3268 val_mae=55.3948 time=174.4s
 New best val MAE: 55.39478492775702
Epoch 03 loss=1991.1981 val_mae=63.1224 time=174.1s
Epoch 04 loss=1885.2894 val_mae=57.2578 time=173.7s
Epoch 05 loss=1911.7801 val_mae=52.8507 time=173.3s
 New best val MAE: 52.85072097254214
Epoch 06 loss=1885.5876 val_mae=54.1477 time=173.5s
Epoch 07 loss=1898.4580 val_mae=49.7597 time=173.5s
 New best val MAE: 49.75967053340369
Epoch 08 loss=1903.1978 val_mae=46.6492 time=173.2s
 New best val MAE: 46.64918510164349
Epoch 09 loss=2047.0667 val_mae=56.4196 time=173.2s
Epoch 10 loss=1879.1197 val_mae=51.1835 time=174.9s
Epoch 11 loss=1991.3431 val_mae=60.6739 time=175.3s
Epoch 12 loss=1862.9183 val_mae=56.0993 time=173.1s
Epoch 13 loss=1913.5823 val_mae=51.6606 time=174.2s
Epoch 14 loss=1746.4311 val_mae=52.2579 time=173.2s
Epoch 15 loss=1787.1309 val_mae=47.7654 time=174.5

In [14]:
# === CELL 13: EVALUATE ON TEST SET & SAVE PREDICTIONS ===
import pandas as pd
rows = []
model.eval()
with torch.no_grad():
    for idx, d in enumerate(test_data):
        t_from = time_from_list[test_idx[idx]]; t_to = time_to_list[test_idx[idx]]
        d = d.to(DEVICE)
        out = model(d).squeeze(1).cpu().numpy()
        true = d.y.squeeze(1).cpu().numpy()
        mask = d.mask.cpu().numpy()
        for node in range(len(zone_list)):
            if not mask[node]: continue
            rows.append({
                "zone": zone_list[node],
                "time_from": str(t_from),
                "time_to": str(t_to),
                "pred_bookings": float(out[node]),
                "true_bookings": float(true[node])
            })
preds_df = pd.DataFrame(rows)
preds_path = os.path.join(OUTPATH, "gnn_predictions_stgcn.csv")
preds_df.to_csv(preds_path, index=False)
print("Saved predictions to", preds_path)
if len(preds_df):
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    print("Test MAE:", mean_absolute_error(preds_df["true_bookings"], preds_df["pred_bookings"]))
    print("Test RMSE:", mean_squared_error(preds_df["true_bookings"], preds_df["pred_bookings"], squared=False))
display(preds_df.head(20))


Saved predictions to /kaggle/working/gnn_predictions_stgcn.csv
Test MAE: 47.5944700421437
Test RMSE: 76.95262202453314


Unnamed: 0,zone,time_from,time_to,pred_bookings,true_bookings
0,8986d880d77ffff,2025-08-31 05:15:00,2025-08-31 05:30:00,415.294952,476.181854
1,8986d882c0bffff,2025-08-31 05:15:00,2025-08-31 05:30:00,282.574677,322.527435
2,8986d882e3bffff,2025-08-31 05:15:00,2025-08-31 05:30:00,258.475403,313.196167
3,8986d88412bffff,2025-08-31 05:15:00,2025-08-31 05:30:00,224.293365,264.367249
4,8986d88412fffff,2025-08-31 05:15:00,2025-08-31 05:30:00,172.463684,199.273544
5,8986d88440bffff,2025-08-31 05:15:00,2025-08-31 05:30:00,548.115967,608.123413
6,8986d88449bffff,2025-08-31 05:15:00,2025-08-31 05:30:00,387.760956,529.688965
7,8986d884537ffff,2025-08-31 05:15:00,2025-08-31 05:30:00,142.34819,152.436981
8,8986d88454bffff,2025-08-31 05:15:00,2025-08-31 05:30:00,253.062668,254.863693
9,8986d88454fffff,2025-08-31 05:15:00,2025-08-31 05:30:00,388.059326,461.934906


In [15]:
# === CELL 14: FINAL DIAGNOSTICS ===
print("Files saved in", OUTPATH)
for fn in ["graph_edges.csv","gnn_model_stgcn.pt","gnn_predictions_stgcn.csv"]:
    print("-", os.path.join(OUTPATH, fn), "exists:", os.path.exists(os.path.join(OUTPATH, fn)))
print("Zones:", len(zone_list), "Features:", feature_cols, "L:", L)


Files saved in /kaggle/working/
- /kaggle/working/graph_edges.csv exists: True
- /kaggle/working/gnn_model_stgcn.pt exists: True
- /kaggle/working/gnn_predictions_stgcn.csv exists: True
Zones: 67 Features: ['Completed Trips', 'Traffic Volume', 'Average Speed', 'Congestion Level', 'Temperature', "Drivers' Earnings", 'Distance Travelled (km)', 'event importance', 'event type', 'pred_bookings'] L: 4
