# MPNN Model Evaluation

In [1]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from rdkit import Chem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
from preprocessing.featurisation import get_atom_features, get_bond_features
from preprocessing.fetch_smiles import resolve_smiles_by_cas_interactive
from preprocessing.smiles_to_graph import batch_from_csv

from models.mpnn_model import MPNNModel

## Importing the Data

In [5]:
graph_list = batch_from_csv("data/processed/input.csv")
graph_list[:5]

[Data(x=[8, 79], edge_index=[2, 16], edge_attr=[16, 10], y=[1]),
 Data(x=[11, 79], edge_index=[2, 24], edge_attr=[24, 10], y=[1]),
 Data(x=[10, 79], edge_index=[2, 22], edge_attr=[22, 10], y=[1]),
 Data(x=[5, 79], edge_index=[2, 10], edge_attr=[10, 10], y=[1]),
 Data(x=[10, 79], edge_index=[2, 18], edge_attr=[18, 10], y=[1])]

In [6]:
num_node_features = graph_list[0].num_node_features
num_edge_features = graph_list[0].num_edge_features
avg_degree = 2*(graph_list[0].num_edges) / graph_list[0].num_nodes
print(f"Number of node features: {num_node_features}")
print(f"Average degree of the graph: {avg_degree:.2f}") # From the avg. degree we can use at least 4 GCNConv layers
print(f"Number of edge features: {num_edge_features}")


Number of node features: 79
Average degree of the graph: 4.00
Number of edge features: 10


In [7]:
from torch_geometric.loader import DataLoader
batch = DataLoader(graph_list, batch_size=32, shuffle=True)

#### Set Random Seed for Reproducibility

In [9]:
import torch
import random
import numpy as np

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(123)


In [10]:
from torch.nn import MSELoss
from torch_geometric.loader import DataLoader

def train_mpnn_model(dataloader, model, lr=1e-3, epochs=300):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = MSELoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()
            out = model(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
            loss = loss_fn(out.squeeze(), batch.y.squeeze())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
    
    return model

train_loader = DataLoader(graph_list, batch_size=32, shuffle=True)

In [12]:
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error

def plot_predictions(model, loader):
    """
    Function to plot predictions vs actual values without scaling.
    """
    all_preds, all_targets = [], []

    model.eval()
    with torch.no_grad():
        for batch in loader:
            preds = model(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
            all_preds.append(preds.squeeze())
            all_targets.append(batch.y.squeeze())

    all_preds = torch.cat(all_preds).cpu().numpy()
    all_targets = torch.cat(all_targets).cpu().numpy()

    r2 = r2_score(all_targets, all_preds)
    rmse = np.sqrt(mean_squared_error(all_targets, all_preds))

    print(f"RÂ² score: {r2:.3f}")
    print(f"RMSE: {rmse:.3f}")

    plt.figure(figsize=(6, 6))
    plt.scatter(all_targets, all_preds, alpha=0.7)
    plt.plot([all_targets.min(), all_targets.max()], 
             [all_targets.min(), all_targets.max()], 'r--')
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title("Predicted vs. Actual")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [13]:
model = MPNNModel(in_channels=num_node_features, edge_dim=num_edge_features, hidden_dim=64, out_dim=1, dropout_rate=0.2)
print(model)

MPNNModel(
  (edge_nn1): Sequential(
    (0): Linear(in_features=10, out_features=5056, bias=True)
    (1): ReLU()
    (2): Linear(in_features=5056, out_features=5056, bias=True)
  )
  (conv1): NNConv(79, 64, aggr=add, nn=Sequential(
    (0): Linear(in_features=10, out_features=5056, bias=True)
    (1): ReLU()
    (2): Linear(in_features=5056, out_features=5056, bias=True)
  ))
  (ffnn): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)


In [14]:
train_mpnn_model(train_loader, model, lr=1e-3, epochs=300)

Epoch 1, Loss: 107.4863
Epoch 2, Loss: 126.3471
Epoch 3, Loss: 92.2640
Epoch 4, Loss: 84.7127
Epoch 5, Loss: 93.4396
Epoch 6, Loss: 106.2781
Epoch 7, Loss: 85.5798
Epoch 8, Loss: 84.8889
Epoch 9, Loss: 89.5376
Epoch 10, Loss: 84.3240
Epoch 11, Loss: 80.9187
Epoch 12, Loss: 71.3736
Epoch 13, Loss: 77.8855
Epoch 14, Loss: 97.8183
Epoch 15, Loss: 91.7695
Epoch 16, Loss: 85.2427
Epoch 17, Loss: 82.3048
Epoch 18, Loss: 66.6919
Epoch 19, Loss: 58.1515
Epoch 20, Loss: 69.1456
Epoch 21, Loss: 58.9860
Epoch 22, Loss: 47.9104
Epoch 23, Loss: 64.5834
Epoch 24, Loss: 87.6238
Epoch 25, Loss: 42.6985
Epoch 26, Loss: 87.0997
Epoch 27, Loss: 47.8797
Epoch 28, Loss: 77.2109
Epoch 29, Loss: 79.9216
Epoch 30, Loss: 72.7383
Epoch 31, Loss: 87.5580
Epoch 32, Loss: 54.6522
Epoch 33, Loss: 56.7840
Epoch 34, Loss: 62.1017
Epoch 35, Loss: 40.4292
Epoch 36, Loss: 36.3215
Epoch 37, Loss: 64.7751
Epoch 38, Loss: 43.8276
Epoch 39, Loss: 41.3344
Epoch 40, Loss: 48.8835
Epoch 41, Loss: 40.6066
Epoch 42, Loss: 81.873

MPNNModel(
  (edge_nn1): Sequential(
    (0): Linear(in_features=10, out_features=5056, bias=True)
    (1): ReLU()
    (2): Linear(in_features=5056, out_features=5056, bias=True)
  )
  (conv1): NNConv(79, 64, aggr=add, nn=Sequential(
    (0): Linear(in_features=10, out_features=5056, bias=True)
    (1): ReLU()
    (2): Linear(in_features=5056, out_features=5056, bias=True)
  ))
  (ffnn): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)