<a href="https://colab.research.google.com/github/ge28yen/Complex-Physics-with-GNNs/blob/main/Copy_of_GNN_Physics_my_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Import dependencies, set up configuration



In [3]:
import torch.nn as nn
import torch
from typing import *
import logging
from typing import *
import numpy as np
import math
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import wandb

# Install torch geometric
!pip install torch-cluster -f https://data.pyg.org/whl/torch-2.5.1+cu121.html
!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.5.1+cu121.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-2.5.1+cu121.html
!pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu121.html
Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu121.html
Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu121.html


In [5]:
import torch_geometric as pyg
import torch_scatter
from google.colab import drive
drive.mount('/content/drive') # WARNING: the dataset is currenlty only available on the

Mounted at /content/drive


In [None]:
# Configure the logging level and format
logging.basicConfig(
    level=logging.ERROR,  #set to logging.INFO if you want the debugging messages shown, logging.ERROR otherwise
    format="%(asctime)s - %(levelname)s - %(message)s",
    force=True
)

METADATA = {
    'epochs': 1,
    'learning_rate': 0.005,
    'batch_size': 4, #even this batch size overloads the available GPU sometimes
    'connectivity_radius': 0.02,
    'borders_x' : [0.1, 0.9],
    'borders_y' : [0.1, 0.9],
    'timestep' : 1, #time lag duration
    'embedding_dimension': 16,
    'hidden_dimension': 128
}

In [None]:
wandb.login(key="")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# 1. Define the Dataset

## Data preprocessing
The data in the form of needs to be preprocessed. The input graph to to the model should have the following attributes:
- graph.x : an array of ints indicating particle types
- graph.y : an array of n_dim - dimensional

In [6]:
def velocities_from_positions(positions_seq, timestep):
    #Calculation: velocity_i = position_i - position_i-1 / timestep

    velocities = positions_seq[:, 1:, :] - positions_seq [:, :-1, :]
    logging.info(f'velocities shape: {velocities.shape}')

    return velocities

def recalculate_positions(target_positions, x_boundaries, y_boundaries):
    #Calculate distances to boundaries from particle positions

    x_target_positions = target_positions[:, 0, 0].squeeze()
    y_target_positions = target_positions[:, 0, 1].squeeze()
    x_boundaries = torch.stack((x_target_positions-x_boundaries[0], x_boundaries[1]-x_target_positions), dim=1) # shape = (400, 2),
    y_boundaries = torch.stack((y_target_positions-y_boundaries[0], y_boundaries[1]-y_target_positions), dim=1)
    logging.info(f'x_boundaries shape: {x_boundaries.shape}')
    logging.info(f'y_boundaries shape: {y_boundaries.shape}')

    return x_boundaries, y_boundaries

def acceleration_from_velocities(velocities, timestep):
    #Calculate ast acceleration from velocities

    acceleration = (velocities[:, 1:] - velocities[:, :-1])/timestep
    logging.info(f'acceleration shape, {acceleration.shape}')
    last_acceleration = acceleration[:,-1] # TODO: this can be done more efficient
    logging.info(f'last acceleration, {last_acceleration[:5]}')

    return last_acceleration
    # Calculate accelerations from velocities

def get_edge_features(edge_indexes, target_positions):
    #Calculate relative displacements and absolute distance of the edges

    transposed_edge_indexes = torch.t(edge_indexes)
    edge_features = []
    for two_nodes in transposed_edge_indexes.numpy():
      node1 = two_nodes[0]
      node2 = two_nodes[1]
      position_node1 = target_positions.squeeze().numpy()[node1]
      position_node2 = target_positions.squeeze().numpy()[node2]
      relative_position = [position_node1[0] - position_node1[0], position_node2[1] - position_node1[0]]
      distance = math.sqrt(sum(x**2 for x in relative_position))
      relative_position.append(distance)
      edge_features.append(relative_position)
    edge_features = torch.tensor(edge_features)
    logging.info(f'edge_features_shape, {edge_features.shape}')

    return edge_features

def preprocess(particle_type, positions_seq ,metadata): # tensors of shape (n_particles), (n_particles, n_timesteps, dim),

  #0. preprocess the postion_sequence:
  target_positions = positions_seq[:, -1: :]
  previous_positions = positions_seq[:, :-1]

  #1.Calculate velocities from previous_positons
  timestep = metadata['timestep']
  velocities = velocities_from_positions(positions_seq, timestep)

  #2.Recalculate positons as given boundaries
  borders_x = metadata['borders_x']
  borders_y = metadata['borders_y']
  x_boundaries, y_boundaries  = recalculate_positions(borders_x, borders_y)

  #3. Calculate the edge indexes:
  target_positions.squeeze()
  edge_indexes = pyg.nn.radius_graph(target_positions.squeeze(), metadata['connectivity_radius'])
  logging.info('edge_indexes shape, {edge_indexes.shape}')       # Should have shape (2, n_edges)


  #4. Calculate the edge features:
  edge_features = get_edge_features(edge_indexes, target_positions)

  #5. Calculate the accelerations:
  acceleration = acceleration_from_velocities(velocities, timestep)

  ## 5. Sum it all up in a graph:
  flattened_velocities = velocities.view(velocities.shape[0], -1)

  graph = pyg.data.Data(
      x = particle_type,
      edge_index = edge_indexes,
      node_features =torch.cat((x_boundaries, y_boundaries, flattened_velocities), dim =-1 ),#torch.cat(None, dim = -1)
      edge_features =edge_features,
      y = acceleration
  )

  return graph

## Define the Datasets

In [7]:
|#check the file loading
base_path = '/content/drive/MyDrive/GGN_for_physics_DATA/'
valid_pth_path = base_path +"/valid_dataset.pth"
valid_json_path = base_path +"/valid_offsets.json"
test_pth_path = base_path + "/test_dataset.pth"
test_json_path = base_path + "/test_offsets.json"


In [8]:
import json

class ShortDataset(torch.utils.data.Dataset):
    def __init__(self, pth_path, offsets_path):
        super().__init__()
        self.dataset = torch.load(pth_path)
        with open(offsets_path, 'rb') as f:
          self.offsets = json.load(f)

        logging.info(f'self.offsets, {self.offsets}')
        self.length = int(list(self.offsets.keys())[-1]) #the length will be the number of the last

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        offset_id = self.offsets[str(idx)]
        particle_type_offset = offset_id['particle_type']['offset']
        position_offset = offset_id['position']['offset']

        shape =offset_id['position']['shape']
        n_particles = shape[0]
        positions = self.dataset['position'][position_offset:position_offset+n_particles]
        particle_types = self.dataset['particle_type'][particle_type_offset:particle_type_offset +n_particles]
        graph  = preprocess(particle_types, positions, METADATA)
        return graph


In [9]:
#test the Short Dataset
valid_short_dataset = ShortDataset(valid_pth_path, valid_json_path)
test_short_dataset = ShortDataset(test_pth_path, test_json_path)

print(len(valid_short_dataset))
graph = test_short_dataset[1278]


  self.dataset = torch.load(pth_path)


29849


NameError: name 'METADATA' is not defined

## Visualize the graph

In [None]:
# Visualize a datapoint:
if False:
  import numpy as np
  import matplotlib.pyplot as plt
  from matplotlib.animation import FuncAnimation
  from matplotlib import animation
  from IPython.display import HTML

  new_positions = position.transpose(0,1)

  # Create example data: Random walk with (timesteps, n, 2)
  n = 100  # Number of points
  timesteps = 50  # Number of frames
  data = np.cumsum(np.random.randn(timesteps, n, 2), axis=0)  # Random walk data

  data = new_positions.numpy()
  # Set up the figure and axis
  fig, ax = plt.subplots()
  ax.set_xlim(np.min(data[:, :, 0]), np.max(data[:, :, 0]))
  ax.set_ylim(np.min(data[:, :, 1]), np.max(data[:, :, 1]))
  sc = ax.scatter([], [], s=10)

  # Initialization function
  def init():
      sc.set_offsets(np.empty((0, 2)))  # Empty 2D array for initialization
      return sc,

  # Update function
  def update(frame):
      offsets = data[frame]  # Extract frame data of shape (n, 2)
      sc.set_offsets(offsets)
      return sc,

  # Create the animation
  ani = FuncAnimation(
      fig, update, frames=6, init_func=init, blit=True, interval=200
  )

  HTML(ani.to_html5_video())

In [None]:
# Implement this later
class LongDataset(torch.utils.data.Dataset):
  def __init__(self, pth_path, json_path):
    super().__init__()
    with open(pth_path, 'rb') as f:
      self.dataset = f
    with open(json_path) as f:
      self.offsets = json.load(f)
  def len(self):
    return None
  def get(self, idx):
    return None

# 2. Define the GNN model

In [None]:
class MLP(nn.Module):
  def __init__(self, input_size, hidden_size, output_size, n_layers):
    assert n_layers>=2
    super().__init__()
    self.MLP=nn.ModuleList()
    self.MLP.append(nn.Linear(input_size, hidden_size))
    self.MLP.append(nn.ReLU())
    for i in range(1, n_layers-1):
      if i == n_layers-2:
        self.MLP.append(nn.Linear(hidden_size, output_size))
      else:
        self.MLP.append(nn.Linear(hidden_size, hidden_size))
        self.MLP.append(nn.ReLU())

    # idk what this reset_parameters is
    # they also do LayerNorm
    self.reset_parameters()

  def reset_parameters(self):
    pass

  def forward(self, x):
    for layer in self.MLP:
      x= layer(x)
    return x

In [None]:
class Processor(pyg.nn.MessagePassing):
  def __init__(self, hidden_size, n_layers):
    super().__init__()
    self.lin_node = MLP(hidden_size*2, hidden_size, hidden_size, n_layers)
    self.lin_edge = MLP(hidden_size*3, hidden_size, hidden_size, n_layers)

  def forward(self, x, edge_index, edge_feature):
      edge_out, aggr = self.propagate(edge_index, x=(x, x), edge_feature=edge_feature)
      node_out = self.lin_node(torch.cat((x, aggr), dim=-1))
      edge_out = edge_feature + edge_out
      node_out = x + node_out
      return node_out, edge_out

  def message(self, x_i, x_j, edge_feature):
    input = torch.cat((x_i, x_j, edge_feature), dim = -1)
    output = self.lin_edge(input)
    return output

  def aggregate(self, inputs, index, dim_size = None):
    logging.info(f'aggregate inputs shape, {inputs.shape}') # n_edges, n_edge features  <-128
    logging.info(f'aggregate index {index}') # n_edges
    logging.info(f'aggregate index {index.shape}')
    out = torch_scatter.scatter(inputs, index, dim=self.node_dim, dim_size=dim_size, reduce="sum") # this I need to understand still
    return (inputs, out)

In [None]:
import torch.nn as nn

class LearnedSimulator(nn.Module):
  def __init__(self,
               n_particle_types,
               embedding_dim,
               hidden_dimensions,
               n_layers = 3,
               n_mp_layers = 3,
               window_size = 6,
               dim = 2
               ):
    super().__init__()
    self.type_embeds = nn.Embedding(n_particle_types, embedding_dim)
    self.node_preprocess = MLP(embedding_dim + dim * (window_size -1 + 2), hidden_dimensions, hidden_dimensions, n_layers)
    self.edge_preprocess = MLP(dim+1, hidden_dimensions, hidden_dimensions, n_layers)
    self.node_postprocess = MLP(hidden_dimensions, hidden_dimensions, dim, n_layers)
    self.n_mp_layers = n_mp_layers
    self.layers = torch.nn.ModuleList()
    for _ in range (self.n_mp_layers):
      self.layers.append(Processor(hidden_dimensions, hidden_dimensions))

  ## Reminder: graph.x -> size = (n_nodes), graph.pos -> size = ((n_nodes,14)), graph.

  def forward(self, graph):
    type_embedded = self.type_embeds(graph.x)
    node_inputs= torch.cat((type_embedded, graph.node_features), dim = -1)
    logging.info(f'Shape of node input, {node_inputs.shape}')
    node_processed = self.node_preprocess(node_inputs)
    edge_processed = self.edge_preprocess(graph.edge_features)
    logging.info(f'node_processed, {node_processed.shape}')
    logging.info(f'edge_processed,{edge_processed.shape}')
    logging.info(f'index, {graph.edge_index}')
    for processor_layer in self.layers:
      node_processed, edge_processed = processor_layer(node_processed, graph.edge_index, edge_processed)
    node_decoded = self.node_postprocess(node_processed)
    return node_decoded

In [None]:
simulator = LearnedSimulator(9, METADATA['embedding_dimension'],METADATA['hidden_dimension'])

wandb.init(project="GNN_for_physics")

In [None]:
## Try forwarding the data through the simulator once:
logging.info(f'{graph.x}')
out=simulator(graph)
logging.info(f'this is out shape: {out.shape}')

# 3. Perform Training

In [None]:
# Define the DataLoader
import torch.optim as optim
from tqdm import tqdm

simulator = simulator.cuda()
DataLoader = pyg.data.DataLoader
first_dataloader = DataLoader(dataset = valid_short_dataset, batch_size = METADATA['batch_size'], shuffle = False)

valid_dataloader= DataLoader(dataset = valid_short_dataset, batch_size = METADATA['batch_size'])
test_dataloader= DataLoader(dataset = test_short_dataset, batch_size = METADATA['batch_size'], shuffle = False)

loss_function = nn.MSELoss()
optimizer = optim.Adam(params = simulator.parameters(),lr = METADATA['learning_rate'])

for epoch in range(METADATA['epochs']):
  simulator.train()
  progress_bar = tqdm(first_dataloader, desc=f"Epoch {epoch+1}", unit="batch", total=len(first_dataloader))
  for i, data in enumerate(progress_bar):
    if i == 100:
      break
    optimizer.zero_grad()
    n_particles = len(data.x)
    if n_particles >2000:
      continue
    data = data.cuda()
    out = simulator(data)
    print(out)
    print(data.y)
    loss = loss_function(out, data.y)
    memory_allocated = torch.cuda.memory_allocated(device=None)
    memory_reserved = torch.cuda.memory_reserved(device=None)

    if i%100 == 0:
      simulator.eval()
      losses = []
      for i, data in enumerate(test_dataloader):
        print('I, DATA', i, data)
        break
        if i == 25:
          break
        n_particles = len(data.x)
        if n_particles >2000:
          continue
        data = data.cuda()
        out = simulator(data)
        loss = loss_function(out, data.y)
        losses.append(loss.item())
      avg_loss = sum(losses)/len(losses)
      simulator.train()
    loss.backward()
    optimizer.step()
    progress_bar.set_postfix({"loss": loss.item(),'n_particles': n_particles, "memory allocated": memory_allocated, "memory reserved": memory_reserved, "test avg loss": avg_loss})


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
#testing the test dataloader
print(len(test_short_dataset))


In [None]:
torch.cuda.empty_cache()
del simulator, first_dataloader

In [None]:
import gc

gc.collect()  # Collect unreferenced objects
torch.cuda.empty_cache()

In [None]:
from google.colab import drive

drive.mount('/content/drive')