In [None]:
import math
import random
import pickle

import numpy as np
import pandas as pd
import geopandas as gpd
import tqdm
import wandb

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as T
from torch.utils.data import DataLoader, Dataset, Subset
import torch_geometric
from torch_geometric.data import Data, Batch
from torch_geometric.transforms import LineGraph
from shapely.geometry import LineString
import gnn_io as gio

from sklearn.preprocessing import MinMaxScaler, StandardScaler

ModuleNotFoundError: No module named 'torch'

# Abstract

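This notebook loads a graph dataset, min-max normalizes its node features, positions and targets, computes a mean-prediction baseline error, and then trains a small GCN on a single batch as a sanity check.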

In [None]:
# Define parameters
# Hyperparameters and run settings shared by the cells below.
num_epochs = 40  # logged to wandb as "epochs"; upper bound for the training loop
batch_size = 20  # logged to wandb and used when building the data loaders
lr = 0.001  # learning rate passed to the Adam optimizer
project_name = 'multiple_features'  # wandb project the run is recorded under
train_ratio = 0.8  # passed to gio.create_dataloader for the train/validation split
# Authenticate with Weights & Biases before any run is initialised.
wandb.login()



True

## 1. Load data and create the dataset

In [None]:
class GnnModel(nn.Module):
    """Two-layer graph convolutional network emitting one value per node.

    The model reads columns 0 and 2 of ``data.x``, multiplies the first of
    those two columns by ``weight_first_dim`` and then applies
    GCNConv(2 -> 16) -> ReLU -> dropout -> GCNConv(16 -> 1).
    """

    def __init__(self):
        super().__init__()
        # Seed the global torch RNG before the layers are created.
        torch.manual_seed(12345)
        gcn_layer = torch_geometric.nn.GCNConv
        self.conv1 = gcn_layer(2, 16)
        self.conv3 = gcn_layer(16, 1)
        # Multiplier applied to the first selected input feature in forward().
        self.weight_first_dim = 2.0

    def forward(self, data):
        """Run the network on ``data`` and return the output of ``conv3``.

        ``data`` must expose ``x`` (node features with at least three columns)
        and ``edge_index``.
        """
        edge_index = data.edge_index
        # Indexing with a list produces a new tensor, so the in-place scaling
        # below does not modify data.x.
        features = data.x[:, [0, 2]]
        features[:, 0] *= self.weight_first_dim
        hidden = F.relu(self.conv1(features, edge_index))
        hidden = F.dropout(hidden, training=self.training)
        return self.conv3(hidden, edge_index)

In [None]:
def normalize_positional_features(dataset):
    """Attach min-max scaled positions to every graph as ``normalized_pos``.

    A single ``MinMaxScaler`` is fitted on the ``pos`` tensors of all graphs
    concatenated along dim 0, then applied to each graph separately. The
    graphs are modified in place and the same ``dataset`` object is returned.
    """
    stacked_positions = torch.cat([graph.pos for graph in dataset], dim=0)
    position_scaler = MinMaxScaler().fit(stacked_positions)

    for graph in dataset:
        scaled = position_scaler.transform(graph.pos)
        graph.normalized_pos = torch.tensor(scaled, dtype=torch.float)
    return dataset

def normalize_y_values(dataset):
    """Attach min-max scaled targets to every graph as ``normalized_y``.

    A single ``MinMaxScaler`` is fitted on the ``y`` tensors of all graphs
    concatenated along dim 0, then applied to each graph separately. The
    graphs are modified in place and the same ``dataset`` object is returned.
    """
    stacked_targets = torch.cat([graph.y for graph in dataset], dim=0)
    target_scaler = MinMaxScaler().fit(stacked_targets)

    for graph in dataset:
        # transform() returns a 2D array, so normalized_y keeps the 2D shape.
        scaled = target_scaler.transform(graph.y)
        graph.normalized_y = torch.tensor(scaled, dtype=torch.float)
    return dataset

def normalize_dataset(dataset):
    """Run the three normalization passes over ``dataset`` and return it.

    The passes are applied in this order: node features (``normalize_data``),
    positions (``normalize_positional_features``), targets
    (``normalize_y_values``). Each pass receives the result of the previous one.
    """
    normalization_steps = (
        normalize_data,
        normalize_positional_features,
        normalize_y_values,
    )
    for apply_step in normalization_steps:
        dataset = apply_step(dataset)
    return dataset


def replace_invalid_values(tensor):
    """Overwrite every non-finite entry of ``tensor`` in place and return it.

    Non-finite entries (``inf``, ``-inf`` and ``NaN``) are replaced by the
    largest finite value found in ``tensor``; when no entry is finite they are
    replaced by 0.0. The input tensor itself is modified.
    """
    is_finite = torch.isfinite(tensor)
    fill_value = tensor[is_finite].max() if is_finite.any() else torch.tensor(0.0)
    # NaN is non-finite too, so this single assignment also removes every NaN.
    tensor[torch.logical_not(is_finite)] = fill_value
    return tensor

def normalize_data(dataset):
    """Attach min-max scaled node features to every graph as ``normalized_x``.

    The number of feature columns is taken from the first graph.

    * One column: a single ``MinMaxScaler`` is fitted on all ``x`` tensors
      concatenated along dim 0 and applied to each graph. Values are passed to
      the scaler as they are (no non-finite handling in this branch).
    * Several columns: every column gets its own ``MinMaxScaler``. Before
      fitting, non-finite entries of the concatenated column are replaced via
      ``replace_invalid_values``; before transforming, the same replacement is
      applied per graph. The scaled columns are concatenated along dim 1.

    Side effects: graphs are modified in place and the same ``dataset`` object
    is returned. In the multi-column branch ``replace_invalid_values`` writes
    into the column slice of ``data.x``; when that slice is a view (the usual
    case for a plain 2D tensor) the non-finite entries of ``data.x`` itself
    are overwritten as well.

    Raises:
        IndexError: if ``dataset`` is empty (``dataset[0]`` is accessed).
    """
    shape_of_x = dataset[0].x.shape[1]

    if shape_of_x == 1:
        all_node_features = torch.cat([data.x for data in dataset], dim=0)
        scaler = MinMaxScaler()
        scaler.fit(all_node_features)
        for data in dataset:
            data.normalized_x = torch.tensor(scaler.transform(data.x), dtype=torch.float)
        return dataset

    # One list of scaled columns per graph; concatenated once at the end
    # instead of re-concatenating normalized_x for every feature.
    columns_per_graph = [[] for _ in dataset]

    for i in range(shape_of_x):
        # torch.cat allocates a new tensor, so cleaning it does not touch data.x.
        column_all_graphs = torch.cat([data.x[:, i] for data in dataset], dim=0).reshape(-1, 1)
        # Same replacement rule as used per graph below; unlike a bare
        # ``.max()`` on the finite entries, this also copes with a column that
        # has no finite value at all (it falls back to 0.0).
        column_all_graphs = replace_invalid_values(column_all_graphs)

        scaler = MinMaxScaler()
        scaler.fit(column_all_graphs.numpy())

        for columns, data in zip(columns_per_graph, dataset):
            # NOTE(review): non-finite entries are replaced with this graph's
            # own maximum, not the dataset-wide maximum used for fitting -
            # confirm this is intended.
            cleaned_column = replace_invalid_values(data.x[:, i].reshape(-1, 1))
            columns.append(
                torch.tensor(scaler.transform(cleaned_column.numpy()), dtype=torch.float)
            )

    for columns, data in zip(columns_per_graph, dataset):
        data.normalized_x = torch.cat(columns, dim=1)
    return dataset

In [None]:
# Load the list of dictionaries
# NOTE: torch.load unpickles the file, which can execute arbitrary code -
# only load dataset files from a trusted source.
data_dict_list = torch.load('../data/dataset_1pm_0-1382_with_more_infos.pt')

# Reconstruct the Data objects
# Each dictionary must provide the keys 'x', 'edge_index', 'pos' and 'y'.
datalist = [Data(x=d['x'], edge_index=d['edge_index'], pos=d['pos'], y=d['y']) for d in data_dict_list]

# Apply normalization to the dataset.
# normalize_dataset adds normalized_x / normalized_pos / normalized_y to the
# Data objects in place and returns the list it was given, so
# dataset_normalized and datalist refer to the same objects.
dataset_normalized = normalize_dataset(datalist)

### Baseline error: MSE of always predicting the mean target

In [None]:
# Gather the normalized targets of every graph into one array.
y_values_normalized = np.concatenate([data.normalized_y for data in dataset_normalized])

# Compute the mean and standard deviation
mean_y_normalized = np.mean(y_values_normalized)
std_y_normalized = np.std(y_values_normalized)

print(f"Mean of y: {mean_y_normalized}")
print(f"Standard Deviation of y: {std_y_normalized}")

# Convert numpy arrays to torch tensors
y_values_normalized_tensor = torch.tensor(y_values_normalized, dtype=torch.float32)
mean_y_normalized_tensor = torch.tensor(mean_y_normalized, dtype=torch.float32)

# Create the target tensor with the same shape as y_values_normalized_tensor
# (a constant predictor that outputs the mean for every node).
target_tensor = mean_y_normalized_tensor * torch.ones_like(y_values_normalized_tensor)

# Instantiate the MSELoss function
mse_loss = torch.nn.MSELoss()

# Compute the MSE of the constant-mean predictor. This is the variance of the
# normalized targets (std squared), i.e. the error a trained model has to beat.
mse = mse_loss(y_values_normalized_tensor, target_tensor)

# Print the MSE value
print("Baseline error is: " + str(mse.item()))

Mean of y: 0.030937498435378075
Standard Deviation of y: 0.074600949883461
Baseline error is: 0.005565311759710312


## 2. Load model and loss function

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Start a wandb run and record the hyperparameters defined in the parameters cell.
wandb.init(
        project=project_name,
        config={
            "epochs": num_epochs,
            "batch_size": batch_size,
            "lr": lr,
            'early_stopping_patience': 10,
            # "dropout": 0.15,
            })
# Later cells read hyperparameters from this config object (e.g. config.epochs,
# config.batch_size).
config = wandb.config

# Build the model and place its parameters on the selected device.
model = GnnModel().to(device)

# Define loss and optimizer
# optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fct = torch.nn.MSELoss()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167903711116702, max=1.0…

## 3. Split into train and validation set

In [None]:
# Build the training and validation loaders from the same normalized dataset.
# gio.create_dataloader comes from the local gnn_io module (not shown here);
# presumably is_train selects which side of the train_ratio split is returned -
# verify against gnn_io.
train_dl = gio.create_dataloader(dataset=dataset_normalized, is_train=True, batch_size=config.batch_size, train_ratio=train_ratio)
valid_dl = gio.create_dataloader(dataset=dataset_normalized, is_train=False, batch_size=config.batch_size, train_ratio=train_ratio)
# Number of batches needed to see every training sample once.
n_steps_per_epoch = math.ceil(len(train_dl.dataset) / config.batch_size)
print(n_steps_per_epoch)

Total dataset length: 1382
Training subset length: 1100
Total dataset length: 1382
Validation subset length: 260
55


## 4. Train the model

We first check that the model can fit a single training batch before training on the full dataset.

In [None]:
# Sanity check: optimise the model repeatedly on ONE fixed training batch and
# evaluate on the validation loader after every optimisation step.

# The patience is read from the wandb config so that the value recorded with
# the run is the value actually used.
early_stopping = gio.EarlyStopping(patience=config.early_stopping_patience, verbose=True)

# Fetch the batch once, outside the loop: calling next(iter(train_dl)) every
# epoch builds a new iterator each time and may return a different batch if
# the loader shuffles. The batch is moved to the model's device so that the
# forward pass also works on CUDA.
# assumes the loader yields torch_geometric Data/Batch objects, which provide .to(device) - TODO confirm
data = next(iter(train_dl)).to(device)
# NOTE(review): GnnModel.forward reads data.x, so input_node_features is not
# what the model consumes - confirm whether normalized_x was intended as input.
input_node_features, targets = data.normalized_x, data.normalized_y

# Stays None when config.epochs is 0, in which case no summary is written.
val_loss = None
try:
    for epoch in range(config.epochs):
        # Re-enable training mode each epoch in case validation switched it off.
        model.train()
        predicted = model(data)
        train_loss = loss_fct(predicted, targets)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        val_loss = gio.validate_model(model, valid_dl, loss_fct, device)
        wandb.log({"val_loss": val_loss})
        print(f"epoch: {epoch}, val_loss: {val_loss}")
        early_stopping(val_loss)
        if early_stopping.early_stop:
            print("Early stopping triggered. Stopping training.")
            break

    if val_loss is not None:
        wandb.summary["val_loss"] = val_loss
finally:
    # Always close the wandb run, even if training fails or is interrupted.
    wandb.finish()

NameError: name 'gio' is not defined