In [2]:
import numpy as np
import pandas as pd
import h5py
import os

# --- Step 1: Load the raw data from the .h5 file ---
file_path = os.path.join('..', 'data', 'raw', 'pems-bay.h5')
print(f"Loading data from: {file_path}")

try:
    with h5py.File(file_path, 'r') as hf:
        traffic_data = hf['speed']['block0_values'][:]
        # The [:] loads the entire HDF5 dataset into memory as a NumPy array
        print(f"Loaded traffic data with shape: {traffic_data.shape}")

except Exception as e:
    print(f"An error occurred during data loading: {e}")
    traffic_data = None # Set to None so the rest of the code doesn't fail

if traffic_data is not None:
    # --- Step 2: Create a time index for the DataFrame ---
    # The data starts on Jan 1, 2017, at 5-minute intervals.
    # This is a key piece of information we need to know about the dataset.
    start_date = '2017-01-01 00:00:00'
    num_intervals = traffic_data.shape[0]

    # Use pandas to create a datetime index
    date_range = pd.date_range(start=start_date, periods=num_intervals, freq='5min')
    print(f"Created a date range from {date_range[0]} to {date_range[-1]}")

    # --- Step 3: Create a Pandas DataFrame ---
    # Each column in the DataFrame corresponds to a sensor.
    num_sensors = traffic_data.shape[1]
    sensor_columns = [f'sensor_{i}' for i in range(num_sensors)]

    df = pd.DataFrame(traffic_data, index=date_range, columns=sensor_columns)

    print("\nDataFrame created successfully:")
    print(df.head())
    print(f"\nDataFrame shape: {df.shape}")
    print(f"DataFrame info:")
    df.info()

Loading data from: ..\data\raw\pems-bay.h5
Loaded traffic data with shape: (52116, 325)
Created a date range from 2017-01-01 00:00:00 to 2017-06-30 22:55:00

DataFrame created successfully:
                     sensor_0  sensor_1  sensor_2  sensor_3  sensor_4  \
2017-01-01 00:00:00      71.4      67.8      70.5      67.4      68.8   
2017-01-01 00:05:00      71.6      67.5      70.6      67.5      68.7   
2017-01-01 00:10:00      71.6      67.6      70.2      67.4      68.7   
2017-01-01 00:15:00      71.1      67.5      70.3      68.0      68.5   
2017-01-01 00:20:00      71.7      67.8      70.2      68.1      68.4   

                     sensor_5  sensor_6  sensor_7  sensor_8  sensor_9  ...  \
2017-01-01 00:00:00      66.6      66.8      68.0      66.8      69.0  ...   
2017-01-01 00:05:00      66.6      66.8      67.8      66.5      68.2  ...   
2017-01-01 00:10:00      66.1      66.8      67.8      66.2      67.8  ...   
2017-01-01 00:15:00      66.7      66.6      67.7      65.9

In [3]:
# --- Step 4: Check for missing values ---
print("Checking for missing values...")
missing_values = df.isnull().sum().sum()
print(f"Total number of missing values: {missing_values}")

# Note: For this specific dataset, the raw data is often clean, but in a real-world project,
# you would handle missing values here (e.g., with interpolation, mean imputation, etc.).
# Since the sum is likely 0, we can proceed.


# --- Step 5: Add time-based features ---
print("\nAdding time-based features...")

# Extracting features from the DataFrame's index
df['dayofweek'] = df.index.dayofweek # Monday is 0, Sunday is 6
df['hour'] = df.index.hour
df['minute'] = df.index.minute
df['dayofyear'] = df.index.dayofyear
df['weekofyear'] = df.index.isocalendar().week # Use isocalendar for ISO week number

print("New features added. Here's the updated DataFrame with new columns:")
print(df.head())
print("\nDataFrame columns:")
print(df.columns)

Checking for missing values...
Total number of missing values: 0

Adding time-based features...
New features added. Here's the updated DataFrame with new columns:
                     sensor_0  sensor_1  sensor_2  sensor_3  sensor_4  \
2017-01-01 00:00:00      71.4      67.8      70.5      67.4      68.8   
2017-01-01 00:05:00      71.6      67.5      70.6      67.5      68.7   
2017-01-01 00:10:00      71.6      67.6      70.2      67.4      68.7   
2017-01-01 00:15:00      71.1      67.5      70.3      68.0      68.5   
2017-01-01 00:20:00      71.7      67.8      70.2      68.1      68.4   

                     sensor_5  sensor_6  sensor_7  sensor_8  sensor_9  ...  \
2017-01-01 00:00:00      66.6      66.8      68.0      66.8      69.0  ...   
2017-01-01 00:05:00      66.6      66.8      67.8      66.5      68.2  ...   
2017-01-01 00:10:00      66.1      66.8      67.8      66.2      67.8  ...   
2017-01-01 00:15:00      66.7      66.6      67.7      65.9      67.8  ...   
2017-01-

In [4]:
# --- Step 6: Install pyarrow if not already installed ---
# This is needed to save a DataFrame to a Parquet file.
# If you get a ModuleNotFoundError, uncomment the line below and run it.
# Then, comment it again and rerun the cell.
# !pip install pyarrow fastparquet

# --- Step 7: Save the processed DataFrame to a file ---
print("\nSaving the processed DataFrame...")

# Define the output file path in the 'data/processed' folder
output_file_path = os.path.join('..', 'data', 'processed', 'traffic_data_processed.parquet')

try:
    # Save the DataFrame to a Parquet file
    df.to_parquet(output_file_path, index=True)
    print(f"DataFrame successfully saved to {output_file_path}")

    # Optional: Verify by loading it back
    loaded_df = pd.read_parquet(output_file_path)
    print("Verification successful. Loaded DataFrame head:")
    print(loaded_df.head())

except Exception as e:
    print(f"An error occurred while saving or loading the Parquet file: {e}")


Saving the processed DataFrame...
DataFrame successfully saved to ..\data\processed\traffic_data_processed.parquet
Verification successful. Loaded DataFrame head:
                     sensor_0  sensor_1  sensor_2  sensor_3  sensor_4  \
2017-01-01 00:00:00      71.4      67.8      70.5      67.4      68.8   
2017-01-01 00:05:00      71.6      67.5      70.6      67.5      68.7   
2017-01-01 00:10:00      71.6      67.6      70.2      67.4      68.7   
2017-01-01 00:15:00      71.1      67.5      70.3      68.0      68.5   
2017-01-01 00:20:00      71.7      67.8      70.2      68.1      68.4   

                     sensor_5  sensor_6  sensor_7  sensor_8  sensor_9  ...  \
2017-01-01 00:00:00      66.6      66.8      68.0      66.8      69.0  ...   
2017-01-01 00:05:00      66.6      66.8      67.8      66.5      68.2  ...   
2017-01-01 00:10:00      66.1      66.8      67.8      66.2      67.8  ...   
2017-01-01 00:15:00      66.7      66.6      67.7      65.9      67.8  ...   
2017-01

In [5]:
import numpy as np
import pandas as pd
import os

# --- Step 8: Load the processed data ---
print("Loading the processed DataFrame...")
processed_file_path = os.path.join('..', 'data', 'processed', 'traffic_data_processed.parquet')
df = pd.read_parquet(processed_file_path)
print(f"DataFrame loaded with shape: {df.shape}")

# Separate the target sensors (traffic speeds) from the exogenous features (time-based features)
target_columns = [col for col in df.columns if col.startswith('sensor_')]
feature_columns = ['dayofweek', 'hour', 'minute', 'dayofyear', 'weekofyear']

target_data = df[target_columns].values
feature_data = df[feature_columns].values

# --- Step 9: Split the data into training and validation sets ---
# We will use the first 80% of the data for training and the last 20% for validation.
# It's important to keep the time order for time-series data.
train_split = 0.8
num_samples = df.shape[0]
num_train_samples = int(num_samples * train_split)
num_val_samples = num_samples - num_train_samples

print(f"Total samples: {num_samples}")
print(f"Training samples: {num_train_samples}")
print(f"Validation samples: {num_val_samples}")

X_train_targets = target_data[:num_train_samples]
X_train_features = feature_data[:num_train_samples]

X_val_targets = target_data[num_train_samples:]
X_val_features = feature_data[num_train_samples:]

# We will create sequences in a later step. For now, this split is a good first step.

print("\nData splitting complete.")
print(f"X_train_targets shape: {X_train_targets.shape}")
print(f"X_val_targets shape: {X_val_targets.shape}")

Loading the processed DataFrame...
DataFrame loaded with shape: (52116, 330)
Total samples: 52116
Training samples: 41692
Validation samples: 10424

Data splitting complete.
X_train_targets shape: (41692, 325)
X_val_targets shape: (10424, 325)


In [6]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# --- Step 10: Normalize the target data using MinMaxScaler ---
print("Normalizing the target data...")

# We fit the scaler ONLY on the training data to prevent data leakage.
scaler = MinMaxScaler()
scaler.fit(X_train_targets)

# Now we transform both the training and validation data.
X_train_targets_scaled = scaler.transform(X_train_targets)
X_val_targets_scaled = scaler.transform(X_val_targets)

print("Data successfully normalized.")
print(f"X_train_targets_scaled mean: {X_train_targets_scaled.mean():.4f}, std: {X_train_targets_scaled.std():.4f}")
print(f"X_val_targets_scaled mean: {X_val_targets_scaled.mean():.4f}, std: {X_val_targets_scaled.std():.4f}")

# For the features (day of week, etc.), they are already small integer values
# and we can handle them with embeddings in our model, so we won't scale them for now.

# --- Step 11: Create the sequential data format for the Transformer ---
def create_sequences(data, input_steps, output_steps):
    X, y = [], []
    for i in range(len(data) - input_steps - output_steps):
        X.append(data[i:(i + input_steps)])
        y.append(data[(i + input_steps):(i + input_steps + output_steps)])
    return np.array(X), np.array(y)

# Let's define the sequence lengths for our Transformer
input_sequence_length = 12 # 1 hour of history (12 * 5-minute intervals)
output_sequence_length = 6 # 30 minutes to predict (6 * 5-minute intervals)

# We will use the scaled targets to create our sequences
X_train_seq, y_train_seq = create_sequences(X_train_targets_scaled, input_sequence_length, output_sequence_length)
X_val_seq, y_val_seq = create_sequences(X_val_targets_scaled, input_sequence_length, output_sequence_length)

print("\nSequential data creation complete.")
print(f"Training input sequences shape: {X_train_seq.shape}")
print(f"Training output sequences shape: {y_train_seq.shape}")
print(f"Validation input sequences shape: {X_val_seq.shape}")
print(f"Validation output sequences shape: {y_val_seq.shape}")

Normalizing the target data...
Data successfully normalized.
X_train_targets_scaled mean: 0.8229, std: 0.1384
X_val_targets_scaled mean: 0.8198, std: 0.1405

Sequential data creation complete.
Training input sequences shape: (41674, 12, 325)
Training output sequences shape: (41674, 6, 325)
Validation input sequences shape: (10406, 12, 325)
Validation output sequences shape: (10406, 6, 325)
