In [19]:
import os

import numpy as np
import pandas as pd
import torch

from rdkit import Chem
from rdkit.Chem import rdmolops, rdDistGeom
from torch.utils.data import Dataset



In [12]:
# Create a class that transforms a SMILES string into a graph

class Graph():
    """Molecular graph (node features + weighted adjacency) built from SMILES.

    On construction the SMILES string is parsed with RDKit and explicit
    hydrogens are added. If parsing succeeds, these attributes are populated:

    * ``node_mat``    -- (n_atoms, node_vec_len) array; row ``i`` is a one-hot
      vector whose set column is the atomic number of atom ``i``. Rows beyond
      the real atoms (padding) stay all-zero.
    * ``adj_mat``     -- (n_atoms, n_atoms) array; bonded pairs carry the
      inverse of their estimated bond length, and an identity matrix of the
      full padded size is added so every row has a 1 on the diagonal.
    * ``std_adj_mat`` -- unpadded, unweighted adjacency matrix from RDKit.

    ``n_atoms`` is ``max_atoms`` when given, otherwise the atom count of the
    molecule (hydrogens included).

    If RDKit cannot parse the SMILES, ``mol`` is ``None`` and the matrix
    attributes above are not created.

    Args:
        molecule_smiles: SMILES string of the molecule.
        node_vec_len: Length of each one-hot node vector; must be larger than
            the largest atomic number present in the molecule.
        max_atoms: Size the matrices are padded to. ``None`` means "use the
            molecule's own atom count".

    Raises:
        IndexError: If the molecule has more atoms than ``max_atoms`` or
            contains an atomic number that does not fit in ``node_vec_len``.
    """

    def __init__(self,
                molecule_smiles: str,
                node_vec_len: int,
                max_atoms: int = None,
                ) -> None:

        # Store properties
        self.smiles = molecule_smiles
        self.node_vec_len = node_vec_len
        self.max_atoms = max_atoms

        # Convert SMILES to RDKit mol
        self.smiles_to_mol()

        # Only build the graph when RDKit produced a molecule
        if self.mol is not None:
            self.smiles_to_graph()

    def smiles_to_mol(self):
        """Parse ``self.smiles`` into ``self.mol`` (``None`` if parsing fails)."""

        mol = Chem.MolFromSmiles(self.smiles)

        if mol is None:
            self.mol = None
            return

        # Hydrogens are made explicit so that they become graph nodes too
        self.mol = Chem.AddHs(mol)

    def smiles_to_graph(self):
        """Build ``node_mat``, ``adj_mat`` and ``std_adj_mat`` from ``self.mol``."""

        # Number of real atoms vs. number of rows the matrices will have
        mol_atom_count = self.mol.GetNumAtoms()
        if self.max_atoms is None:
            n_atoms = mol_atom_count
        else:
            n_atoms = self.max_atoms

        # Fail early with a readable message instead of a raw numpy error
        if mol_atom_count > n_atoms:
            raise IndexError(
                f"Molecule '{self.smiles}' has {mol_atom_count} atoms "
                f"(including hydrogens), which exceeds max_atoms={n_atoms}"
            )

        # Create empty node matrix
        node_mat = np.zeros((n_atoms, self.node_vec_len))

        # Iterate over atoms and add each node to the matrix
        for atom in self.mol.GetAtoms():
            # Get atom index and atomic number
            atom_index = atom.GetIdx()
            atom_no = atom.GetAtomicNum()

            if atom_no >= self.node_vec_len:
                raise IndexError(
                    f"Atomic number {atom_no} in '{self.smiles}' does not fit "
                    f"in a node vector of length {self.node_vec_len}"
                )

            # One-hot encode the atomic number
            node_mat[atom_index, atom_no] = 1

        # Get adjacency matrix using RDKit
        adj_mat = rdmolops.GetAdjacencyMatrix(self.mol)
        self.std_adj_mat = np.copy(adj_mat)

        # RDKit returns a *bounds* matrix: upper distance bounds sit in the
        # upper triangle and lower bounds in the lower triangle. Averaging it
        # with its transpose gives one symmetric distance estimate per pair,
        # so that entries (i, j) and (j, i) of the weighted adjacency agree.
        bounds_mat = rdDistGeom.GetMoleculeBoundsMatrix(self.mol)
        dist_mat = (bounds_mat + bounds_mat.T) / 2
        dist_mat[dist_mat == 0.] = 1 # Avoids division by 0

        # Get modified adjacency matrix with inverse bond lengths
        adj_mat = adj_mat * (1 / dist_mat)

        # Pad the adjacency matrix with 0s up to (n_atoms, n_atoms)
        dim_add = n_atoms - adj_mat.shape[0]
        adj_mat = np.pad(
            adj_mat,
            pad_width=((0, dim_add), (0, dim_add)),
            mode='constant',
        )

        # Add an identity matrix to adjacency matrix,
        # this makes an atom its own neighbor
        adj_mat = adj_mat + np.eye(n_atoms)

        # Save adjacency and node matrices
        self.node_mat = node_mat
        self.adj_mat = adj_mat

In [20]:
# Create PyTorch dataset class

class GraphDataset(Dataset):
    """PyTorch dataset that builds a molecular graph per sample on demand.

    The CSV at ``dataset_path`` is read once at construction and must have a
    ``smiles`` column and a ``labels`` column. Graphs are not cached: every
    ``__getitem__`` call constructs a new ``Graph`` from the stored SMILES.

    Args:
        dataset_path: Path to the CSV file.
        node_vec_len: Node vector length forwarded to ``Graph``.
        max_atoms: Padding size forwarded to ``Graph``.
    """

    def __init__(self,
                 dataset_path: str,
                 node_vec_len: int,
                 max_atoms: int) -> None:
        # Graph construction settings, reused for every sample
        self.node_vec_len = node_vec_len
        self.max_atoms = max_atoms

        # Load the table and keep only plain Python lists from it
        frame = pd.read_csv(dataset_path)
        self.indices = frame.index.tolist()
        self.smiles = frame['smiles'].tolist()
        self.outputs = frame['labels'].tolist()

    def __len__(self):
        """Number of rows read from the CSV."""
        return len(self.indices)

    def __getitem__(self, i: int):
        """Return ``((node_mat, adj_mat), output, smiles)`` for row ``i``.

        ``node_mat`` and ``adj_mat`` are tensors made from the ``Graph``
        matrices, ``output`` is a one-element tensor holding the label and
        ``smiles`` is the raw SMILES string. A SMILES that ``Graph`` could not
        turn into matrices surfaces here as an ``AttributeError``.
        """
        smiles_string = self.smiles[i]

        # Build the graph for this molecule
        graph = Graph(smiles_string, self.node_vec_len, self.max_atoms)

        # Convert the numpy matrices and the label to tensors
        features = (torch.Tensor(graph.node_mat), torch.Tensor(graph.adj_mat))
        target = torch.Tensor([self.outputs[i]])

        return features, target, smiles_string

In [21]:
# Create custom collate function for DataLoader
def collate_graph_dataset(dataset: Dataset):
    """Combine graph samples into batch tensors.

    ``dataset`` is indexed from ``0`` to ``len(dataset) - 1``; every item must
    have the form ``((node_mat, adj_mat), output, smiles)``.

    Returns:
        ``((node_mats, adj_mats), outputs, smiles)`` where the node and
        adjacency matrices are concatenated along dim 0 (no new batch axis is
        introduced), the outputs are stacked along a new dim 0, and
        ``smiles`` is a plain list in sample order.
    """
    # Fetch every sample by position first, then regroup field by field
    samples = [dataset[idx] for idx in range(len(dataset))]

    node_mats = [nodes for (nodes, _adj), _out, _smi in samples]
    adj_mats = [adj for (_nodes, adj), _out, _smi in samples]
    outputs = [out for (_nodes, _adj), out, _smi in samples]
    smiles = [smi for (_nodes, _adj), _out, smi in samples]

    # Matrices are joined row-wise; outputs gain a leading batch dimension
    batched_graphs = (torch.cat(node_mats, dim=0), torch.cat(adj_mats, dim=0))
    batched_outputs = torch.stack(outputs, dim=0)

    return batched_graphs, batched_outputs, smiles