In [2]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem; the data files read by the
# later cells live under this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
import h5py

# Quick look at the top-level keys stored in the HDF5 file before loading it.
input_path = "/content/drive/MyDrive/traffic_data/pems-bay-dcrnn.h5"

with h5py.File(input_path, mode="r") as f:
    print("Keys:", [key for key in f.keys()])


Keys: ['speed']


In [4]:
import numpy as np
import h5py

# ============================================================
# 1. Load dataset — ONLY SPEED
# ============================================================
data_dir = "/content/drive/MyDrive/traffic_data"
input_path = data_dir + "/pems-bay-dcrnn.h5"

# Read the whole "speed" dataset into memory while the file is open.
# It is indexed below as a 3-D array; the original note gives its shape as
# (T, 325, 3).
with h5py.File(input_path, mode="r") as f:
    full = f["speed"][:]

# Keep only channel 0 of the last axis (the speed values) → (T, 325).
speed = full[:, :, 0]

print("Loaded speed-only shape:", speed.shape)
# Expected: (52116, 325)

# ============================================================
# 2. 70/10/20 temporal split — boundaries first
# ============================================================
# The split points are computed BEFORE any statistics so that the imputation
# means and the z-score parameters below are fitted on the training rows only.
# Fitting them on the full series would leak validation/test information into
# the training data and into the saved normalization stats.
T = speed.shape[0]

train_end = int(T * 0.7)
val_end   = int(T * 0.8)

if train_end == 0:
    raise ValueError("Not enough time steps to build a training split")

# ============================================================
# 3. Replace NaNs with per-sensor TRAINING mean (per-column)
# ============================================================
nan_mask = np.isnan(speed)
print("Total NaNs before cleaning:", np.sum(nan_mask))

# Mean of the valid (non-NaN) training readings of each sensor → (n_sensors,)
means = np.nanmean(speed[:train_end], axis=0)

# A sensor with no valid training reading gets a NaN mean (NumPy also emits a
# RuntimeWarning); fall back to the mean of all valid training readings so the
# imputation below cannot leave NaNs behind for that sensor.
means = np.where(np.isnan(means), np.nanmean(speed[:train_end]), means)

# `means` broadcasts along the time axis: each NaN is replaced by the
# training mean of its own sensor (column).
speed_clean = np.where(nan_mask, means, speed)

print("Total NaNs after cleaning:", np.sum(np.isnan(speed_clean)))

# ============================================================
# 4. Z-score normalization (sensor-wise, training statistics)
# ============================================================
mu = speed_clean[:train_end].mean(axis=0)     # (n_sensors,)
sigma = speed_clean[:train_end].std(axis=0)   # (n_sensors,)
sigma[sigma == 0] = 1.0                       # avoid division by zero

# The same training-derived mu/sigma are applied to every time step, so the
# val/test rows are transformed exactly like the training rows.
speed_norm = (speed_clean - mu) / sigma

print("Normalized speed shape:", speed_norm.shape)
# Should be (T, 325)

# Slice the normalized series chronologically into the three splits.
train_data = speed_norm[:train_end]          # (70%, 325)
val_data   = speed_norm[train_end:val_end]   # (10%, 325)
test_data  = speed_norm[val_end:]            # (20%, 325)

print("Train shape:", train_data.shape)
print("Val shape:",   val_data.shape)
print("Test shape:",  test_data.shape)

# ============================================================
# 5. SAVE normalized splits (speed ONLY)
# ============================================================
# Each split is written to its own compressed .npz archive under the key
# "data"; the file names are listed once and reused for the summary below.
split_files = [
    ("train_speed_norm.npz", train_data),
    ("val_speed_norm.npz",   val_data),
    ("test_speed_norm.npz",  test_data),
]

for file_name, split_array in split_files:
    np.savez_compressed(f"{data_dir}/{file_name}", data=split_array)

# Save normalization stats (keys "mu" and "sigma") so the z-scores can be
# mapped back to the original scale later.
np.savez_compressed(f"{data_dir}/speed_norm_stats.npz", mu=mu, sigma=sigma)

print("\n✅ Speed-only preprocessing complete!")
print("Saved files:")
for file_name, _ in split_files:
    print(f"{data_dir}/{file_name}")
print("Normalization stats saved as speed_norm_stats.npz")


Loaded speed-only shape: (52116, 325)
Total NaNs before cleaning: 0
Total NaNs after cleaning: 0
Normalized speed shape: (52116, 325)
Train shape: (36481, 325)
Val shape: (5211, 325)
Test shape: (10424, 325)

✅ Speed-only preprocessing complete!
Saved files:
/content/drive/MyDrive/traffic_data/train_speed_norm.npz
/content/drive/MyDrive/traffic_data/val_speed_norm.npz
/content/drive/MyDrive/traffic_data/test_speed_norm.npz
Normalization stats saved as speed_norm_stats.npz
