<a href="https://colab.research.google.com/github/elyannmarih/TrafficPrediction/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive

In [1]:
from google.colab import drive

# Mount Google Drive into the runtime's filesystem; the dataset paths used
# by the following cells all live underneath this mount point.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Verify that the file is in Drive

In [2]:
import os

# List the dataset folder: this both confirms that the Drive mount is
# visible and shows which files are available to load.
traffic_dir_entries = os.listdir("/content/drive/MyDrive/traffic_data")
print(traffic_dir_entries)


['METR-LA.h5']


# Open and clean the dataset

## Open dataset

In [7]:
import pandas as pd

# Read the table stored under the 'df' key of the HDF5 file.
file_path = "/content/drive/MyDrive/traffic_data/METR-LA.h5"
df = pd.read_hdf(file_path, key='df')

# Quick sanity check: dimensions first, then the first few rows.
print(df.shape, df.head(), sep="\n")


(34272, 207)
                        773869     767541     767542     717447     717446  \
2012-03-01 00:00:00  64.375000  67.625000  67.125000  61.500000  66.875000   
2012-03-01 00:05:00  62.666667  68.555556  65.444444  62.444444  64.444444   
2012-03-01 00:10:00  64.000000  63.750000  60.000000  59.000000  66.500000   
2012-03-01 00:15:00   0.000000   0.000000   0.000000   0.000000   0.000000   
2012-03-01 00:20:00   0.000000   0.000000   0.000000   0.000000   0.000000   

                        717445  773062  767620     737529     717816  ...  \
2012-03-01 00:00:00  68.750000  65.125  67.125  59.625000  62.750000  ...   
2012-03-01 00:05:00  68.111111  65.000  65.000  57.444444  63.333333  ...   
2012-03-01 00:10:00  66.250000  64.500  64.250  63.875000  65.375000  ...   
2012-03-01 00:15:00   0.000000   0.000   0.000   0.000000   0.000000  ...   
2012-03-01 00:20:00   0.000000   0.000   0.000   0.000000   0.000000  ...   

                        772167  769372     774204     7

## Convert Kaggle METR-LA to DCRNN format

In [9]:
import numpy as np
import pandas as pd
import h5py

# --- Load -------------------------------------------------------------
# Read the speed table stored under the 'df' key of the source HDF5 file.
file_path = "/content/drive/MyDrive/traffic_data/METR-LA.h5"
df = pd.read_hdf(file_path, key='df')
print("Original shape:", df.shape)

# --- Speed matrix -----------------------------------------------------
# Rows follow the frame's datetime index, columns are the sensors.
timestamps = df.index
speed = df.to_numpy(dtype=np.float32)
num_timesteps, num_sensors = speed.shape

# --- Calendar features ------------------------------------------------
# Time of day as a fraction of the day and weekday as a fraction of the
# week, so both features fall in [0, 1).
minutes_per_day = 24 * 60
time_of_day = (timestamps.hour * 60 + timestamps.minute) / minutes_per_day
day_of_week = timestamps.dayofweek / 7

# Each feature has one value per timestep; copy it into one column per
# sensor so it lines up element-wise with the speed matrix.
tod_expanded = np.repeat(time_of_day.values[:, None], num_sensors, axis=1)
dow_expanded = np.repeat(day_of_week.values[:, None], num_sensors, axis=1)

# --- Assemble [time, sensors, 3] --------------------------------------
# Feature order on the last axis: speed, time-of-day, day-of-week.
data = np.stack((speed, tod_expanded, dow_expanded), axis=2)
print("New data shape:", data.shape)

# --- Save -------------------------------------------------------------
# Three datasets are written: the stacked feature array (under the key
# "speed"), the timestamps rendered as strings, and the integer weekday.
out_path = "/content/drive/MyDrive/traffic_data/metr-la-dcrnn.h5"
with h5py.File(out_path, 'w') as h5_out:
    h5_out.create_dataset("speed", data=data)
    h5_out.create_dataset("time", data=timestamps.astype(str).values)
    h5_out.create_dataset("dayofweek", data=timestamps.dayofweek.values)

print(f"Saved converted dataset to {out_path}")

Original shape: (34272, 207)
New data shape: (34272, 207, 3)
Saved converted dataset to /content/drive/MyDrive/traffic_data/metr-la-dcrnn.h5


## Verify the Converted Dataset

In [10]:
import h5py

# Path to the converted file written by the conversion cell.
out_path = "/content/drive/MyDrive/traffic_data/metr-la-dcrnn.h5"

# Load every dataset fully into memory while the file is open. The results
# get their own `saved_*` names so this check does not rebind `speed`
# (the 2-D speed matrix built by the conversion cell) to a 3-D array.
with h5py.File(out_path, 'r') as f:
    print("Keys in file:", list(f.keys()))
    saved_features = f['speed'][:]
    saved_time = f['time'][:]
    saved_dow = f['dayofweek'][:]

# Enforce the expected layout instead of only describing it in comments:
# a [time, sensors, 3] feature array plus one timestamp and one weekday
# entry per timestep.
num_saved_steps = saved_features.shape[0]
assert saved_features.ndim == 3 and saved_features.shape[-1] == 3, saved_features.shape
assert saved_time.shape == (num_saved_steps,), saved_time.shape
assert saved_dow.shape == (num_saved_steps,), saved_dow.shape

print("Speed shape:", saved_features.shape)
print("Time shape:", saved_time.shape)
print("Day-of-week shape:", saved_dow.shape)

# Peek at the first timestep: all three features of sensor 0, plus the
# matching timestamp and weekday entries.
print("First speed row (sensor 0):", saved_features[0, 0, :])
print("First timestamp:", saved_time[0])
print("First day-of-week:", saved_dow[0])


Keys in file: ['dayofweek', 'speed', 'time']
Speed shape: (34272, 207, 3)
Time shape: (34272,)
Day-of-week shape: (34272,)
First speed row (sensor 0): [64.375       0.          0.42857143]
First timestamp: b'2012-03-01 00:00:00'
First day-of-week: 3


## Replace zeros with NaN

In [12]:
# A speed of exactly 0.0 is treated as "no reading" in this dataset, so
# those entries are marked as NaN and handled as missing values below.
import numpy as np

# Feature 0 on the last axis of `data` is the speed channel.
raw_speeds = data[:, :, 0]
no_reading = raw_speeds == 0.0

# Build a new array: NaN where there was no reading, the speed otherwise.
speeds = np.where(no_reading, np.nan, raw_speeds)
print("After replacing 0.0:", np.count_nonzero(np.isnan(speeds)), "missing values")


After replacing 0.0: 575302 missing values


## Forward-fill per sensor

In [13]:
# Fill each NaN with the last valid reading of the same sensor, moving
# forward along the time axis.
import pandas as pd

# Wrap the [time, sensors] matrix in a DataFrame (one column per sensor)
# and forward fill down the rows. `DataFrame.ffill()` replaces the
# deprecated `fillna(method='ffill')` spelling, which emits a FutureWarning.
df_speeds = pd.DataFrame(speeds).ffill()

# NaNs at the very start of a column have no earlier value to copy, so
# they are still missing here and are handled in the next step.
print("Remaining NaNs after forward fill:", df_speeds.isna().sum().sum())


Remaining NaNs after forward fill: 66


  df_speeds = df_speeds.fillna(method='ffill')


## Fill leading NaNs (start of dataset)

In [14]:
# One median per sensor column; missing entries are left out of the
# median computation.
# NOTE(review): the medians are taken over the full series, which includes
# the span later used for validation/test - confirm this is acceptable.
col_medians = df_speeds.median(axis=0)

# Whatever is still NaN after the forward fill gets its column's median.
df_speeds = df_speeds.fillna(value=col_medians)

# Back to a plain float32 array for the remaining numpy steps.
speeds_clean = df_speeds.to_numpy(dtype=np.float32)
print("Remaining NaNs after fixing:", np.count_nonzero(np.isnan(speeds_clean)))


Remaining NaNs after fixing: 0


## Rebuild the 3-feature array

In [15]:
# Reassemble the feature array: the cleaned speeds take the place of
# feature 0, while features 1 and 2 are taken unchanged from `data`.
time_of_day_feature = data[:, :, 1]
day_of_week_feature = data[:, :, 2]
data_clean = np.stack(
    (speeds_clean, time_of_day_feature, day_of_week_feature),
    axis=2,
)
print("Final cleaned data shape:", data_clean.shape)


Final cleaned data shape: (34272, 207, 3)


# Z-score normalization

## Split chronologically

In [16]:
# Split boundaries along the time axis: the first 70% of timesteps are for
# training, the next 10% for validation and the remaining 20% for testing.
T = len(data_clean)
train_end = int(T * 0.7)
val_end = int(T * 0.8)

# Cutting at the two boundaries keeps the chronological order, so no
# timestep of a later split precedes one of an earlier split.
train_data, val_data, test_data = np.split(data_clean, [train_end, val_end])

print("Train:", train_data.shape, "Val:", val_data.shape, "Test:", test_data.shape)


Train: (23990, 207, 3) Val: (3427, 207, 3) Test: (6855, 207, 3)


## Compute mean and std from training only

In [17]:
# Normalization statistics come from the training split only and cover
# just the speed feature (index 0), with one value per sensor.
train_speeds = train_data[..., 0]   # [train timesteps, sensors]

# Reduce over the time axis; keepdims leaves a leading axis of length 1 so
# the statistics broadcast against a [time, sensors] speed matrix.
mean_per_sensor = np.mean(train_speeds, axis=0, keepdims=True)
std_per_sensor = np.std(train_speeds, axis=0, keepdims=True)

print("Mean shape:", mean_per_sensor.shape, "Std shape:", std_per_sensor.shape)


Mean shape: (1, 207) Std shape: (1, 207)


## Apply z-score normalization

In [18]:
#Apply (x - mean) / std to train, val, test.
def normalize(dataset, mean, std):
    """Z-score the speed feature of ``dataset`` and keep the other two features.

    Args:
        dataset: array indexed as ``[time, sensor, feature]`` with at least
            three features; feature 0 is the one that gets normalized.
        mean: value(s) subtracted from feature 0; must broadcast against
            ``dataset[:, :, 0]``.
        std: value(s) feature 0 is divided by, after adding ``1e-6``; must
            broadcast against ``dataset[:, :, 0]``.

    Returns:
        A new array whose last axis holds three features: the normalized
        feature 0 followed by features 1 and 2 of ``dataset`` unchanged.
    """
    raw_speed = dataset[:, :, 0]
    time_of_day = dataset[:, :, 1]
    day_of_week = dataset[:, :, 2]

    # The small constant keeps the division finite where std is zero.
    scaled_speed = (raw_speed - mean) / (std + 1e-6)

    return np.stack((scaled_speed, time_of_day, day_of_week), axis=-1)

# Every split is normalized with the same training-set statistics.
train_norm, val_norm, test_norm = (
    normalize(split, mean_per_sensor, std_per_sensor)
    for split in (train_data, val_data, test_data)
)

print("Normalized train shape:", train_norm.shape)


Normalized train shape: (23990, 207, 3)


## Save mean/std for later (denormalization) and the normalized splits

In [21]:
# Persist the per-sensor training mean/std so they are available for
# denormalization later.
np.savez(
    "/content/drive/MyDrive/traffic_data/normalization_stats.npz",
    mean=mean_per_sensor,
    std=std_per_sensor,
)

# Each normalized split goes into its own archive, stored under the key "data".
for split_path, split_array in (
    ("/content/drive/MyDrive/traffic_data/train_norm.npz", train_norm),
    ("/content/drive/MyDrive/traffic_data/val_norm.npz", val_norm),
    ("/content/drive/MyDrive/traffic_data/test_norm.npz", test_norm),
):
    np.savez(split_path, data=split_array)

print("Saved normalization stats and train/val/test normalized splits.")



Saved normalization stats and train/val/test normalized splits.
