In [10]:
import math
import random
import pickle

import numpy as np
import pandas as pd
import geopandas as gpd
import tqdm
import wandb

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as T
from torch.utils.data import DataLoader, Dataset, Subset

import torch_geometric
from torch_geometric.data import Data, Batch
from torch_geometric.transforms import LineGraph

from shapely.geometry import LineString

from sklearn.preprocessing import MinMaxScaler, StandardScaler

def create_dataloader(is_train, batch_size, dataset, train_ratio):
    """Build a shuffled DataLoader over the training or validation part of ``dataset``.

    The first ``int(len(dataset) * train_ratio)`` samples form the training
    pool and the remaining samples the validation pool.  The selected pool is
    truncated to a whole number of batches, so trailing samples that would
    not fill a complete batch are left out.

    Args:
        is_train: ``True`` selects the training pool, ``False`` the validation pool.
        batch_size: Number of samples per batch.
        dataset: Indexable dataset that supports ``len()``.
        train_ratio: Fraction of the dataset assigned to the training pool.

    Returns:
        A ``DataLoader`` over the selected subset, created with
        ``shuffle=True`` and the module-level ``collate_fn``.
    """
    total = len(dataset)
    print(f"Total dataset length: {total}")

    # Index of the first validation sample.
    boundary = int(total * train_ratio)

    if is_train:
        start, pool_size = 0, boundary
    else:
        start, pool_size = boundary, total - boundary

    # Drop the remainder so that every batch is complete.
    usable = pool_size - pool_size % batch_size
    subset = Subset(dataset, range(start, start + usable))
    print(f"{'Training' if is_train else 'Validation'} subset length: {len(subset)}")
    return DataLoader(dataset=subset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

class MyGeometricDataset(Dataset):
    """Minimal map-style dataset backed by an in-memory list of samples."""

    def __init__(self, data_list):
        # Keep a reference to the caller's list; no copy is made.
        self.data_list = data_list

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data_list)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        sample = self.data_list[idx]
        return sample
    
def collate_fn(data_list):
    """Merge a list of graph samples into one ``Batch`` via ``Batch.from_data_list``."""
    merged = Batch.from_data_list(data_list)
    return merged

def normalize_data(dataset):
    """Min-max scale the node features (``data.x``) of every sample in place.

    A single ``MinMaxScaler`` is fitted on the node features of all samples
    concatenated along dim 0 and is then applied to each sample.  Every
    ``data.x`` is replaced by a new float tensor.

    Args:
        dataset: Iterable of samples that expose an ``x`` tensor.

    Returns:
        The same ``dataset`` object that was passed in.
    """
    # One scaler for the whole dataset, fitted on every node of every sample.
    stacked_x = torch.cat([sample.x for sample in dataset], dim=0)
    feature_scaler = MinMaxScaler().fit(stacked_x)

    for sample in dataset:
        scaled = feature_scaler.transform(sample.x)
        sample.x = torch.tensor(scaled, dtype=torch.float)

    return dataset

def normalize_positional_features(dataset):
    """Standardize the positional features (``data.pos``) of every sample in place.

    A single ``StandardScaler`` is fitted on the positions of all samples
    concatenated along dim 0 and is then applied to each sample.  Every
    ``data.pos`` is replaced by a new float tensor.

    Args:
        dataset: Iterable of samples that expose a ``pos`` tensor.

    Returns:
        The same ``dataset`` object that was passed in.
    """
    # Gather the positions of all samples so the scaler sees the full dataset.
    stacked_pos = torch.cat([sample.pos for sample in dataset], dim=0)
    pos_scaler = StandardScaler().fit(stacked_pos)

    for sample in dataset:
        standardized = pos_scaler.transform(sample.pos)
        sample.pos = torch.tensor(standardized, dtype=torch.float)

    return dataset

def normalize_dataset(dataset):
    """Normalize the node features, then the positional features, of ``dataset``.

    Chains ``normalize_data`` and ``normalize_positional_features`` and
    returns whatever the latter returns.
    """
    return normalize_positional_features(normalize_data(dataset))

# Abstract

This notebook is the current working version of the GNN training pipeline. It consists of the following steps:

1. Load data
2. Load model and loss function
3. Split into training and validation data
4. Training loop

In [11]:
# Define parameters
# Run configuration; these names are read by the setup, data-split and
# training cells further down.
# Number of training epochs (logged to wandb as "epochs").
num_epochs = 20
# Samples per batch (logged to wandb as "batch_size").
batch_size = 20
# Learning rate handed to the Adam optimizer.
lr = 0.01
# Used as the wandb project name.
wandb_name = 'gnn_8'
# Fraction of the dataset assigned to the training split.
train_ratio = 0.7
# Authenticate with Weights & Biases before a run is started.
wandb.login()



True

## 1. Load data and create the dataset

In [26]:
class GnnModel(nn.Module):
    """Graph network stacking GAT -> GCN -> GAT -> GAT -> GCN layers.

    Each of the first four layers is followed by ReLU and dropout; the final
    GCN layer produces ``out_channels`` values per node without activation.

    Args:
        in_channels: Number of input features per node (default 1).
        out_channels: Number of output values per node (default 1).
        dropout: Dropout probability applied after each hidden layer.  The
            default of 0.5 matches the implicit default of ``F.dropout``.

    Raises:
        ValueError: If ``dropout`` is outside ``[0, 1]``.
    """

    def __init__(self, in_channels=1, out_channels=1, dropout=0.5):
        super().__init__()
        if not 0.0 <= dropout <= 1.0:
            raise ValueError(f"dropout must be in [0, 1], got {dropout}")

        # Seed before creating the layers so weight initialization is
        # reproducible.  Note that this reseeds the global torch RNG.
        torch.manual_seed(12345)

        self.dropout = dropout

        # Attribute names and creation order are kept stable so that
        # state_dict keys and seeded initialization stay the same.
        self.conv1 = torch_geometric.nn.GATConv(in_channels, 64)
        self.conv2 = torch_geometric.nn.GCNConv(64, 16)
        self.conv3 = torch_geometric.nn.GATConv(16, 16)
        self.gat1 = torch_geometric.nn.GATConv(16, 16)
        self.conv4 = torch_geometric.nn.GCNConv(16, out_channels)

    def forward(self, data):
        """Run the network on a graph (or batch of graphs).

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            The output of the last GCN layer for every node.
        """
        x, edge_index = data.x, data.edge_index
        for layer in (self.conv1, self.conv2, self.conv3, self.gat1):
            x = layer(x, edge_index)
            x = F.relu(x)
            # Dropout is only active while the module is in training mode.
            x = F.dropout(x, p=self.dropout, training=self.training)
        return self.conv4(x, edge_index)

def validate_model(model, valid_dl, loss_func, device):
    """Compute the mean per-batch loss of ``model`` over ``valid_dl``.

    The model is switched to eval mode (and left in it) and evaluated under
    ``torch.inference_mode()``.  Every batch is moved to ``device`` as a
    whole before the forward pass, so the model's inputs and the targets
    live on the same device as the model.

    Args:
        model: Callable module that takes a batch and returns predictions.
        valid_dl: Iterable of batches exposing ``y`` and a ``to(device)`` method.
        loss_func: Callable ``(predicted, targets) -> scalar tensor``.
        device: Device on which the evaluation is run.

    Returns:
        The average of the per-batch losses, or 0 if ``valid_dl`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.inference_mode():
        for batch in valid_dl:
            # Move the entire batch, not only the targets, to the model's device.
            batch = batch.to(device)
            predicted = model(batch)
            total_loss += loss_func(predicted, batch.y).item()
            num_batches += 1
    return total_loss / num_batches if num_batches > 0 else 0

In [27]:
# Load the list of dictionaries
# NOTE(review): torch.load relies on pickle for deserialization, so this file
# should only come from a trusted source.
data_dict_list = torch.load('dataset_1pm_0-1382.pt')

# Reconstruct the Data objects
# Every dictionary must provide the keys 'x', 'edge_index', 'pos' and 'y'.
datalist = [Data(x=d['x'], edge_index=d['edge_index'], pos=d['pos'], y=d['y']) for d in data_dict_list]

# Recreate the dataset
dataset = MyGeometricDataset(datalist)

# Apply normalization to your dataset
# NOTE: normalize_dataset rewrites data.x and data.pos on the samples in place
# and returns the object it was given, so `dataset_normalized` and `dataset`
# refer to the same, already-normalized dataset.
# NOTE(review): the scalers are fitted on all samples, i.e. before the
# train/validation split performed in a later cell.
dataset_normalized = normalize_dataset(dataset)

In [28]:
# Inspect the positional features of the first sample after normalization.
dataset_normalized.data_list[0].pos

tensor([[-0.0564, -0.2530],
        [-0.0531, -0.2299],
        [-0.0531, -0.2299],
        ...,
        [-0.5742,  1.4291],
        [-1.4948, -0.8421],
        [-1.4128, -0.8876]])

In [29]:
# Number of samples held by the dataset.
len(dataset_normalized.data_list)

1382

## 2. Load model and loss function

In [30]:
# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters recorded with the Weights & Biases run.
# NOTE(review): "dropout" is only logged here; GnnModel() below is built
# without arguments, so this value is not passed to the model.
run_hyperparams = {
    "epochs": num_epochs,
    "batch_size": batch_size,
    "lr": lr,
    "dropout": 0.15,
}
wandb.init(project=wandb_name, config=run_hyperparams)
config = wandb.config

# Build the network and move its parameters to the selected device.
model = GnnModel()
model = model.to(device)

# Adam with weight decay as the optimizer, mean-squared error as the loss.
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
loss_fct = torch.nn.MSELoss()

## 3. Split into training and validation sets

In [31]:
# Both loaders share every argument except the split selector.
split_args = dict(dataset=dataset_normalized, batch_size=config.batch_size, train_ratio=train_ratio)
train_dl = create_dataloader(is_train=True, **split_args)
valid_dl = create_dataloader(is_train=False, **split_args)

# Number of batches in one pass over the training subset.
n_steps_per_epoch = math.ceil(len(train_dl.dataset) / config.batch_size)
print(n_steps_per_epoch)

Total dataset length: 1382
Training subset length: 960
Total dataset length: 1382
Validation subset length: 400
48


## 4. Train the model

In [32]:
# Train the model
# Every epoch makes one full pass over the training loader and then evaluates
# on the validation loader.
for epoch in range(config.epochs):
    model.train()
    for idx, data in enumerate(train_dl):
        # Move the whole batch to the model's device so that the features,
        # the edge index and the targets all live where the parameters do.
        data = data.to(device)
        predicted = model(data)
        train_loss = loss_fct(predicted, data.y)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        wandb.log({"train_loss": train_loss.item(), "epoch": epoch, "step": idx})
        print(f"epoch: {epoch}, step: {idx}, loss: {train_loss.item()}")

    val_loss = validate_model(model, valid_dl, loss_fct, device)
    wandb.log({"val_loss": val_loss})
    print(f"epoch: {epoch}, val_loss: {val_loss}")

# Record the validation loss of the last epoch as the run summary.
wandb.summary["val_loss"] = val_loss
wandb.finish()

RuntimeError: mat1 and mat2 shapes cannot be multiplied (624320x1 and 16x16)