In [1]:
import math
import random
import pickle

import numpy as np
import pandas as pd
import geopandas as gpd
import tqdm
import wandb

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as T
from torch.utils.data import DataLoader, Dataset, Subset

import torch_geometric
from torch_geometric.data import Data, Batch
from torch_geometric.transforms import LineGraph

from shapely.geometry import LineString

# Abstract

This is the current working version.
The steps are the following:

1. Load data
2. Pick a loss function
3. Split into train and test data
4. Training loop

In [2]:
# Define parameters
num_epochs = 10
batch_size = 50
lr = 0.001
wandb_name = 'gnn_6'
train_ratio = 0.7

## 1. Load data and create the dataset

In [3]:
with open('../results/results_pop_1pm_first_1400.pkl', 'rb') as f:
    results_dict = pickle.load(f)
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33menatterer[0m ([33mtum-traffic-engineering[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
class GnnModel(nn.Module):
    def __init__(self):
        super().__init__()
        torch.manual_seed(12345)
        self.conv1 = torch_geometric.nn.GCNConv(1, 16)
        self.conv2 = torch_geometric.nn.GCNConv(16, 1)
        
    def forward(self, x):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return x

def validate_model(model, valid_dl, loss_func, device):
    model.eval()
    val_loss = 0
    with torch.inference_mode():
        for idx, data in enumerate(valid_dl):
            data.x.to(device)
            targets = data.y.to(device)
            # input_node_feats, targets = data.x.to(device), data.y.to(device)
            predicted = model(data)
            val_loss += loss_func(predicted, targets)*targets.size(0)
    return val_loss 

def create_dataloader(is_train, batch_size, dataset):
    dataset_length = len(dataset)
    print(f"Total dataset length: {dataset_length}")

    # Calculate split index for training and validation
    split_idx = int(dataset_length * train_ratio)
    
    # Calculate the maximum number of samples that fit into complete batches for training and validation
    train_samples = (split_idx // batch_size) * batch_size
    valid_samples = ((dataset_length - split_idx) // batch_size) * batch_size

    if is_train:
        indices = range(0, train_samples)
    else:
        indices = range(split_idx, split_idx + valid_samples)
    
    sub_dataset = Subset(dataset, indices)
    print(f"{'Training' if is_train else 'Validation'} subset length: {len(sub_dataset)}")
    return DataLoader(dataset=sub_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

class MyGeometricDataset(Dataset):
    def __init__(self, data_list):
        self.data_list = data_list
        
    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        return self.data_list[idx]
    
def collate_fn(data_list):
    return Batch.from_data_list(data_list)

In [5]:
# Create your data objects
datalist = []
counter = 0
linegraph_transformation = LineGraph()

for key, df in results_dict.items():
    counter += 1
    if isinstance(df, pd.DataFrame):
        gdf = gpd.GeoDataFrame(df, geometry='geometry')
        gdf.crs = "EPSG:2154"  # Assuming the original CRS is EPSG:2154
        gdf.to_crs("EPSG:4326", inplace=True)
        
        # Create dictionaries for nodes and edges
        nodes = pd.concat([gdf['from_node'], gdf['to_node']]).unique()
        node_to_idx = {node: idx for idx, node in enumerate(nodes)}
        
        gdf['from_idx'] = gdf['from_node'].map(node_to_idx)
        gdf['to_idx'] = gdf['to_node'].map(node_to_idx)
        
        edges = gdf[['from_idx', 'to_idx']].values
        edge_car_volumes = gdf['vol_car'].values
        capacities = gdf['capacity'].values
        edge_positions = np.array([((geom.coords[0][0] + geom.coords[-1][0]) / 2, 
                                    (geom.coords[0][1] + geom.coords[-1][1]) / 2) 
                                   for geom in gdf.geometry])

        # Convert lists to tensors
        edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
        edge_positions_tensor = torch.tensor(edge_positions, dtype=torch.float)
        x = torch.zeros((len(nodes), 1), dtype=torch.float)
        
        # Create Data object
        target_values = torch.tensor(edge_car_volumes, dtype=torch.float).unsqueeze(1)
        data = Data(edge_index=edge_index, x=x, pos=edge_positions_tensor)
        
        # Transform to line graph
        linegraph_data = linegraph_transformation(data)
        
        # Prepare the x for line graph: index and capacity
        linegraph_x = torch.tensor(capacities, dtype=torch.float).unsqueeze(1)
        linegraph_data.x = linegraph_x
        
        # Target tensor for car volumes
        linegraph_data.y = target_values
        
        if linegraph_data.validate(raise_on_error=True):
            datalist.append(linegraph_data)
        else:
            print("Invalid line graph data")
            
dataset = MyGeometricDataset(datalist)

In [None]:
len(dataset)

1382

## Train and test the model

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
wandb.init(
        project=wandb_name,
        config={
            "epochs": num_epochs,
            "batch_size": batch_size,
            "lr": lr,
            "dropout": random.uniform(0.01, 0.80),
            })
config = wandb.config

model = GnnModel().to(device)

# Define loss and optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
loss_fct = torch.nn.MSELoss()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▇▇▆▆▅▅▄▄▄▃▃▄▂▂▂▂▃▃▂▂▂▂▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁

0,1
train_loss,215.16132
val_loss,1665498880.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011157198611181229, max=1.0…

In [None]:
train_dl = create_dataloader(dataset=dataset, is_train=True, batch_size=config.batch_size)
valid_dl = create_dataloader(dataset=dataset, is_train=False, batch_size=config.batch_size)
n_steps_per_epoch = math.ceil(len(train_dl.dataset) / config.batch_size)

Total dataset length: 1382
Training subset length: 1100
Total dataset length: 1382
Validation subset length: 250


In [None]:
# Train the model
for epoch in range(config.epochs):
    model.train()
    for step, data in enumerate(train_dl):
        input_node_features, targets = data.x.to(device), data.y.to(device)
        predicted = model(data)
        train_loss = loss_fct(predicted, targets)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        wandb.log({"train_loss": train_loss.item()})
        # print(f"epoch: {epoch}, step: {step}, loss: {train_loss.item()}")
        
    val_loss = validate_model(model, valid_dl, loss_fct, device)
    wandb.log({"val_loss": val_loss})
    print(f"epoch: {epoch}, val_loss: {val_loss}")
        
wandb.summary["val_loss"] = val_loss
wandb.finish()

epoch: 0, step: 0, loss: 92403.8203125
epoch: 0, step: 1, loss: 81413.0
epoch: 0, step: 2, loss: 70463.890625
epoch: 0, step: 3, loss: 61922.67578125
epoch: 0, step: 4, loss: 53736.08203125
epoch: 0, step: 5, loss: 46164.12890625
epoch: 0, step: 6, loss: 39773.41796875
epoch: 0, step: 7, loss: 33934.484375
epoch: 0, step: 8, loss: 29176.30078125
epoch: 0, step: 9, loss: 24949.392578125
epoch: 0, step: 10, loss: 21355.7265625
epoch: 0, step: 11, loss: 18376.630859375
epoch: 0, step: 12, loss: 15542.197265625
epoch: 0, step: 13, loss: 13142.541015625
epoch: 0, step: 14, loss: 11058.7919921875
epoch: 0, step: 15, loss: 9408.4609375
epoch: 0, step: 16, loss: 7937.03076171875
epoch: 0, step: 17, loss: 6720.37255859375
epoch: 0, step: 18, loss: 5666.1591796875
epoch: 0, step: 19, loss: 4838.82861328125
epoch: 0, step: 20, loss: 4096.30615234375
epoch: 0, step: 21, loss: 3446.8291015625
epoch: 0, val_loss: 1918638592.0
epoch: 1, step: 0, loss: 2932.89794921875
epoch: 1, step: 1, loss: 2502.73

: 

: 

## Analysing the model

In [None]:
# Evaluate the model
# model.eval()
# with torch.no_grad():
#     pred = model(data).cpu()
#     target = data.y.view(-1, 1).cpu()
#     mse = F.mse_loss(pred, target).item()
#     rmse = torch.sqrt(torch.tensor(mse)).item()
#     print(f'Mean Squared Error: {mse:.4f}')
#     print(f'Root Mean Squared Error: {rmse:.4f}')

# # Calculate target value statistics for comparison
# target_values = target.numpy()
# mean_target = target_values.mean()
# std_target = target_values.std()
# min_target = target_values.min()
# max_target = target_values.max()

# print(f'Mean of target values: {mean_target:.4f}')
# print(f'Standard deviation of target values: {std_target:.4f}')
# print(f'Minimum target value: {min_target:.4f}')
# print(f'Maximum target value: {max_target:.4f}')