In [19]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import pubchempy as pcp
import sys
import os
import rdkit
import sklearn
import torch
import torch_geometric
import tqdm
import warnings
import time
warnings.filterwarnings("ignore")

In [20]:
from torch_geometric.datasets import MoleculeNet
from rdkit import Chem
from rdkit.Chem import Draw
import pubchempy as pcp

# Load the MoleculeNet "lipo" dataset, using the current directory as its root.
data = MoleculeNet(root=".", name="lipo")
print(f"Number of molecules in Lipophilicity dataset: {len(data)}")

# List every public attribute of the dataset object whose value is not callable.
print("Dataset attributes:")
for attr in dir(data):
    # Names starting with an underscore are treated as private and skipped.
    if attr.startswith("_"):
        continue
    value = getattr(data, attr)
    # Methods and other callables are not data attributes; skip them too.
    if callable(value):
        continue
    print(f"{attr}: {value}")



Number of molecules in Lipophilicity dataset: 4200
Dataset attributes:
force_reload: False
has_download: True
has_process: True
log: True
name: lipo
names: {'esol': ('ESOL', 'delaney-processed.csv', 'delaney-processed', -1, -2), 'freesolv': ('FreeSolv', 'SAMPL.csv', 'SAMPL', 1, 2), 'lipo': ('Lipophilicity', 'Lipophilicity.csv', 'Lipophilicity', 2, 1), 'pcba': ('PCBA', 'pcba.csv.gz', 'pcba', -1, slice(0, 128, None)), 'muv': ('MUV', 'muv.csv.gz', 'muv', -1, slice(0, 17, None)), 'hiv': ('HIV', 'HIV.csv', 'HIV', 0, -1), 'bace': ('BACE', 'bace.csv', 'bace', 0, 2), 'bbbp': ('BBBP', 'BBBP.csv', 'BBBP', -1, -2), 'tox21': ('Tox21', 'tox21.csv.gz', 'tox21', -1, slice(0, 12, None)), 'toxcast': ('ToxCast', 'toxcast_data.csv.gz', 'toxcast_data', 0, slice(1, 618, None)), 'sider': ('SIDER', 'sider.csv.gz', 'sider', 0, slice(1, 28, None)), 'clintox': ('ClinTox', 'clintox.csv.gz', 'clintox', 0, slice(1, 3, None))}
num_classes: 553
num_edge_features: 3
num_features: 9
num_node_features: 9
pre_filter: No

In [21]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, global_mean_pool
from torch_geometric.data import DataLoader
from torch_geometric.datasets import MoleculeNet
from torch_geometric.transforms import NormalizeFeatures
from sklearn.metrics import r2_score



# Split dataset into training and validation sets (80% / 20%).
# NOTE: only these two splits are created; there is no held-out test set.
torch.manual_seed(42)  # seed torch's RNG before shuffling so the split is repeatable
dataset = data.shuffle()

# No dataset-level float cast is done here: GraphSAGE.forward already converts
# node features to float32. Assigning to `dataset.x` would only attach an
# attribute to the dataset object itself (presumably leaving the samples the
# loaders yield unchanged -- see PyG InMemoryDataset attribute access).

train_size = int(0.8 * len(dataset))
# Validation takes the remainder so integer truncation can never drop a sample.
val_size = len(dataset) - train_size

train_dataset = dataset[:train_size]
val_dataset = dataset[train_size:train_size + val_size]

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define the GraphSAGE Model
class GraphSAGE(torch.nn.Module):
    """Six-layer GraphSAGE network producing one output vector per graph.

    From the second convolution onward, each layer receives the sum of the two
    preceding node representations; for the second layer, one of those is a
    linear projection of the raw input features. The output of the last
    convolution is mean-pooled per graph and passed through two fully connected
    layers.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Width of every convolution and of the first FC layer.
        out_channels: Size of the per-graph output.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()

        # Message-passing stack: the first layer maps input -> hidden width,
        # the remaining five keep the hidden width.
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.conv4 = SAGEConv(hidden_channels, hidden_channels)
        self.conv5 = SAGEConv(hidden_channels, hidden_channels)
        self.conv6 = SAGEConv(hidden_channels, hidden_channels)

        # Lifts the raw input features to the hidden width so they can be added
        # to the first convolution's output.
        self.project = torch.nn.Linear(in_channels, hidden_channels)

        # Prediction head applied to the pooled graph embedding.
        self.fc1 = torch.nn.Linear(hidden_channels, hidden_channels)
        self.fc2 = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, data):
        """Compute per-graph outputs for a batch.

        Args:
            data: Object exposing ``x`` (node features), ``edge_index`` and
                ``batch`` (the index passed to ``global_mean_pool``).

        Returns:
            The output of the final linear layer applied to the pooled
            node embeddings.
        """
        node_feats = data.x.to(torch.float32)
        edge_index = data.edge_index
        graph_ids = data.batch

        # `skip` always holds the representation from two steps back, `hidden`
        # the one from the previous step; their sum feeds the next convolution.
        skip = self.project(node_feats)
        hidden = F.relu(self.conv1(node_feats, edge_index))
        for conv in (self.conv2, self.conv3, self.conv4, self.conv5, self.conv6):
            updated = F.relu(conv(hidden + skip, edge_index))
            skip, hidden = hidden, updated

        # Average node embeddings within each graph, then apply the FC head.
        pooled = global_mean_pool(hidden, graph_ids)
        return self.fc2(F.relu(self.fc1(pooled)))

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

model = GraphSAGE(
    # Read the node-feature width from the dataset instead of hard-coding 9,
    # so the model stays in sync with the data it is trained on.
    in_channels=dataset.num_node_features,
    hidden_channels=128,  # Hidden layer size
    out_channels=1  # Output size (1 for regression task)
).to(device)

# Adam with weight decay; mean-squared error as the training objective.
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-4)
criterion = torch.nn.MSELoss()

cpu


In [24]:
# Define the Training Loop
def train(model, loader):
    """Run one optimisation epoch over ``loader`` and report loss and R².

    Relies on the module-level ``device``, ``optimizer`` and ``criterion``.

    Args:
        model: Network called as ``model(batch)``; its parameters are updated
            through ``optimizer``.
        loader: Iterable of batches that support ``.to(device)`` and expose
            the targets as ``.y``.

    Returns:
        Tuple ``(mean_loss, r2)``. ``mean_loss`` is the criterion averaged over
        all samples seen in the epoch; ``r2`` is the R² score of the predictions
        collected while training (i.e. made with the weights as they were at
        each step, not with the end-of-epoch weights).

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.train()
    loss_sum = 0.0
    sample_count = 0
    all_true = []
    all_pred = []
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch)
        # Give the target the prediction's shape so a (B,) vs (B, 1) mismatch
        # raises or is fixed here instead of broadcasting inside the loss.
        target = batch.y.view_as(out)
        loss = criterion(out, target)
        loss.backward()
        optimizer.step()

        # The criterion returns a per-batch mean; weight it by the batch size so
        # a shorter final batch does not count as much as a full one.
        n_samples = out.size(0)
        loss_sum += loss.item() * n_samples
        sample_count += n_samples

        # Collect true and predicted values for R² score calculation
        all_true.append(target.detach().cpu().numpy())
        all_pred.append(out.detach().cpu().numpy())

    if sample_count == 0:
        raise ValueError("loader produced no batches")

    # Flatten the per-batch arrays and compute the R² score
    r2 = r2_score(np.concatenate(all_true), np.concatenate(all_pred))

    return loss_sum / sample_count, r2

# Define the Validation and Testing Loop
@torch.no_grad()
def evaluate(model, loader):
    """Score ``model`` on ``loader`` without updating it.

    Relies on the module-level ``device`` and ``criterion``. Gradient tracking
    is disabled by the decorator and the model is switched to eval mode.

    Args:
        model: Network called as ``model(batch)``.
        loader: Iterable of batches that support ``.to(device)`` and expose
            the targets as ``.y``.

    Returns:
        Tuple ``(mean_loss, r2)``: the criterion averaged over all samples in
        ``loader`` and the R² score of the predictions.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.eval()
    loss_sum = 0.0
    sample_count = 0
    all_true = []
    all_pred = []
    for batch in loader:
        batch = batch.to(device)
        out = model(batch)
        # Give the target the prediction's shape so a (B,) vs (B, 1) mismatch
        # raises or is fixed here instead of broadcasting inside the loss.
        target = batch.y.view_as(out)
        loss = criterion(out, target)

        # The criterion returns a per-batch mean; weight it by the batch size so
        # a shorter final batch does not count as much as a full one.
        n_samples = out.size(0)
        loss_sum += loss.item() * n_samples
        sample_count += n_samples

        # Collect true and predicted values for R² score calculation
        all_true.append(target.cpu().numpy())
        all_pred.append(out.cpu().numpy())

    if sample_count == 0:
        raise ValueError("loader produced no batches")

    # Flatten the per-batch arrays and compute the R² score
    r2 = r2_score(np.concatenate(all_true), np.concatenate(all_pred))

    return loss_sum / sample_count, r2

# Train the Model
epochs = 200
for epoch in range(1, epochs + 1):
    train_loss, train_r2 = train(model, train_loader)
    val_loss, val_r2 = evaluate(model, val_loader)
    print(f"Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}, Train R²: {train_r2:.4f}, Validation Loss: {val_loss:.4f}, Validation R²: {val_r2:.4f}")

# Final evaluation of the trained model.
# This runs on `val_loader` because no separate test loader exists, so the
# numbers are validation metrics and are labelled as such instead of being
# reported as a test result. The `test_*` variable names are kept in case
# later cells read them.
test_loss, test_r2 = evaluate(model, val_loader)
print(f"Final Validation Loss: {test_loss:.4f}, Final Validation R²: {test_r2:.4f}")

Epoch: 001, Train Loss: 0.9851, Train R²: 0.3284, Validation Loss: 0.9847, Validation R²: 0.2684
Epoch: 002, Train Loss: 0.9484, Train R²: 0.3534, Validation Loss: 0.9935, Validation R²: 0.2625
Epoch: 003, Train Loss: 0.9027, Train R²: 0.3845, Validation Loss: 0.9063, Validation R²: 0.3296
Epoch: 004, Train Loss: 0.8606, Train R²: 0.4133, Validation Loss: 0.9065, Validation R²: 0.3307
Epoch: 005, Train Loss: 0.8295, Train R²: 0.4345, Validation Loss: 0.8457, Validation R²: 0.3851
Epoch: 006, Train Loss: 0.7904, Train R²: 0.4611, Validation Loss: 0.9330, Validation R²: 0.3299
Epoch: 007, Train Loss: 0.7712, Train R²: 0.4742, Validation Loss: 0.7770, Validation R²: 0.4292
Epoch: 008, Train Loss: 0.7540, Train R²: 0.4860, Validation Loss: 0.7674, Validation R²: 0.4372
Epoch: 009, Train Loss: 0.7615, Train R²: 0.4808, Validation Loss: 0.8526, Validation R²: 0.3644
Epoch: 010, Train Loss: 0.7253, Train R²: 0.5055, Validation Loss: 0.7330, Validation R²: 0.4606
Epoch: 011, Train Loss: 0.6727