In [89]:
%pip install -U torch scikit-learn pandas tensorflow keras

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [90]:
# setting up seed and device
from typing import Any
import numpy as np
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

seed = 100
torch.manual_seed(seed)
np.random.seed(seed)

First, we'll read the relevant data.

In [91]:
# read the data, fill missing data with 0
import pandas as pd
train_df = pd.read_csv("./data/train.csv", encoding="utf-8").fillna(0)
test_df = pd.read_csv("./data/test.csv", encoding="utf-8").fillna(0)



I just need to make sense of this transformation. Apparently, it removes the last column...

In [92]:
import numpy as np
# remove the last column
A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

print(f"A = {A}")
print(f"A[:, :-1] = {A[:, :-1]}")

A = [[1 2 3]
 [4 5 6]
 [7 8 9]]
A[:, :-1] = [[1 2]
 [4 5]
 [7 8]]


Then, we'll process it.

In [93]:
# basic data preview
train_df.head(5)

Unnamed: 0,5 Minutes,Lane 1 Flow (Veh/5 Minutes),# Lane Points,% Observed
0,04/01/2016 0:00,12,1,100
1,04/01/2016 0:05,13,1,100
2,04/01/2016 0:10,11,1,100
3,04/01/2016 0:15,13,1,100
4,04/01/2016 0:20,10,1,100


In [94]:
# hyperparameter

lag = 12

In [95]:
import torch
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

train_col: str = "Lane 1 Flow (Veh/5 Minutes)"

# fit scaler on training column

# FIX: Fit scaler on TRAINING data, not test data
scaler = MinMaxScaler(feature_range=(0, 1)).fit(
    train_df[train_col].values.reshape(-1, 1)  # ✅ Use train_df
)

train_flow = scaler.transform(train_df[train_col].values.reshape(-1, 1)).reshape(-1, 1)
test_flow = scaler.transform(test_df[train_col].values.reshape(-1, 1)).reshape(-1, 1)

train = []
test = []

for i in range(lag, len(train_flow)):
    train.append(train_flow[i - lag : i + 1])
for i in range(lag, len(test_flow)):
    test.append(test_flow[i - lag : i + 1])
    
train_array = np.array(train, )
test_array = np.array(test,)
np.random.shuffle(train_array)

# train: all values other than last column
# test: last column
X_train = train_array[:, :-1]
y_train = train_array[:, -1]
X_test = test_array[:, :-1]
y_test = test_array[:, -1]





In [96]:
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (7764, 12, 1)
Shape of X_test: (4308, 12, 1)
Shape of y_train: (7764, 1)
Shape of y_test: (4308, 1)


We then process the dataset to a suitable form of training.

In [97]:
from torch.utils.data import TensorDataset, DataLoader

batch = 256
epochs = 10 #test
lr = 0.001
val = 0.05 # validation ratio

X_train_tensor = torch.from_numpy(X_train).to(torch.float32) # Add feature dimension
y_train_tensor = torch.from_numpy(y_train).to(torch.float32)
X_test_tensor = torch.from_numpy(X_test).to(torch.float32)
y_test_tensor = torch.from_numpy(y_test).to(torch.float32)

val_size = int(len(X_train_tensor) * 0.05)
X_val = X_train_tensor[-val_size:]
y_val = y_train_tensor[-val_size:]
X_train_split = X_train_tensor[:-val_size]
y_train_split = y_train_tensor[:-val_size]

train_dataset = TensorDataset(X_train_split, y_train_split)
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)

We implement the custom loss used in the original training: MAPE

In [98]:
import torch
from torch import nn, Tensor, optim

class MAPELoss(nn.Module):
    def __init__(self, eps: float = 1e-7):
        super().__init__()
        self.eps = eps  # avoid division by zero
        
    def forward(self, input: Tensor, target: Tensor) -> Tensor:
        # Formula: mean( |(target - input) / clip(|target|, eps, inf)| ) * 100
        denom = torch.clamp(torch.abs(target), min=self.eps)
        loss = torch.mean(torch.abs((target - input) / denom)) * 100
        return loss

criterion = nn.MSELoss()

In [99]:

class LSTM(nn.Module):
    def __init__(self, units: list[int], *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.lstm1 = nn.LSTM(input_size=1, hidden_size=units[1], batch_first=True, num_layers=1)
        self.lstm2 = nn.LSTM(input_size=units[1], hidden_size=units[2], batch_first=True, num_layers=1)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(in_features = units[2], out_features=units[3])
        
        
    def forward(self, x: Tensor) -> Tensor:
        # lstm layer 1
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        
        # keeps every sequence in the batch (:) but selects only the last 
        # time-step (-1) from the sequence dimension, leaving a tensor of shape 
        # (batch_size, hidden_size) for the final linear layer.
        x = x[:, -1, :]
        x = self.dropout(x)
        return self.linear(x)

lstm = LSTM([12, 64, 64, 1])


Optimize the LSTM model


Note that Criterion here is MSE.

In [100]:
optimizer = optim.AdamW(lstm.parameters(), lr=lr)

mse_metric = criterion
mape_metric = MAPELoss()

history = {'mse': [], 'val_mse': [], 'mape': [], 'val_mape': []}

epochs = 600

for epoch in range(epochs):
    lstm.train()
    train_mse: float = 0
    train_mape: float = 0
    
    # evaluate error over batch
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        
        optimizer.zero_grad()
        outputs = lstm(batch_X)
        mse = mse_metric(outputs, batch_y)
        mape = mape_metric(outputs, batch_y)
        
        mse.backward()
        optimizer.step()
        
        train_mse += mse.item()
        train_mape += mape.item()
        
    train_mse /= len(train_loader)
    train_mape /= len(train_loader)
    
    # validation
    lstm.eval()
    with torch.no_grad():
        X_val_device = X_val.to(device)
        y_val_device = y_val.to(device)
        val_outputs = lstm(X_val_device)
        val_mse = mse_metric(val_outputs, y_val_device).item()
        val_mape = mape_metric(val_outputs, y_val_device).item()
        
    history['mse'].append(train_mse)
    history['val_mse'].append(val_mse)
    history['mape'].append(train_mape)
    history['val_mape'].append(val_mape)
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], '
              f'Loss: {train_mse:.6f}, Val Loss: {val_mse:.6f}, '
              f'MAPE: {train_mape:.2f}%, Val MAPE: {val_mape:.2f}%')
        

Epoch [10/600], Loss: 0.003918, Val Loss: 0.003348, MAPE: 27237.13%, Val MAPE: 20.30%
Epoch [20/600], Loss: 0.003432, Val Loss: 0.002891, MAPE: 22354.42%, Val MAPE: 18.60%
Epoch [30/600], Loss: 0.003397, Val Loss: 0.002735, MAPE: 19836.77%, Val MAPE: 19.47%
Epoch [40/600], Loss: 0.003186, Val Loss: 0.003009, MAPE: 21085.36%, Val MAPE: 17.01%
Epoch [50/600], Loss: 0.003158, Val Loss: 0.002965, MAPE: 14432.30%, Val MAPE: 16.50%
Epoch [60/600], Loss: 0.003034, Val Loss: 0.002649, MAPE: 26060.27%, Val MAPE: 16.60%
Epoch [70/600], Loss: 0.002983, Val Loss: 0.002683, MAPE: 27737.96%, Val MAPE: 17.98%
Epoch [80/600], Loss: 0.002900, Val Loss: 0.002630, MAPE: 26245.36%, Val MAPE: 21.35%
Epoch [90/600], Loss: 0.002897, Val Loss: 0.002561, MAPE: 23123.21%, Val MAPE: 17.70%
Epoch [100/600], Loss: 0.002891, Val Loss: 0.002461, MAPE: 17680.54%, Val MAPE: 16.97%
Epoch [110/600], Loss: 0.002808, Val Loss: 0.002437, MAPE: 27124.40%, Val MAPE: 18.12%
Epoch [120/600], Loss: 0.002752, Val Loss: 0.002684,

In [101]:

optimizer = optim.AdamW(lstm.parameters(), lr=0.001)
lstm.to(device)

epochs = 10
for _ in range(epochs):
    lstm.train()
    


In [102]:
class GRU(nn.Module):
    def __init__(self, units: list[int], *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.gru1 = nn.GRU(input_size=1, hidden_size=units[1], batch_first=True, num_layers=1)
        self.gru2 = nn.GRU(input_size=units[1], hidden_size=units[2], batch_first=True, num_layers=1)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(in_features = units[2], out_features=units[3])
        # print(len(self.lstm1))
        # print(len(self.lstm2))
        # print(len(self.dropout))
        
    def forward(self, x: Tensor) -> Tensor:
        # lstm layer 1
        x, _ = self.gru1(x)
        x, _ = self.gru2(x)
        
        # keeps every sequence in the batch (:) but selects only the last 
        # time-step (-1) from the sequence dimension, leaving a tensor of shape 
        # (batch_size, hidden_size) for the final linear layer.
        x = x[:, -1, :]
        x = self.dropout(x)
        return self.linear(x)
    
gru = GRU([12, 64, 64, 1])
gru(
    torch.from_numpy(X_train).to(dtype=torch.float32)
    )

for epoch in range(10):
    gru.train()

with torch.no_grad():
    out = gru(torch.from_numpy(X_train[:10]).float())

print(out.min(), out.max(), out.mean())



tensor(-0.0987) tensor(-0.0317) tensor(-0.0537)
