## Dataset creation and loading

In [78]:
import pandas as pd
# DATA_PATH = "data/raw/HIv.csv"
train_data_path = "https://raw.githubusercontent.com/deepfindr/gnn-project/main/data/raw/HIV_train.csv"
test_data_path  = "https://raw.githubusercontent.com/deepfindr/gnn-project/main/data/raw/HIV_test.csv"
train_data = pd.read_csv(train_data_path)
len(train_data)
train_data.head(100)


Unnamed: 0,index,smiles,activity,HIV_active
0,3999,CC1c2nc(N)nc(N)c2CN1C(=O)c1ccccc1,CI,0
1,4000,Cc1nc(N)c2c(n1)C(C)N(C(=O)c1ccccc1)C2,CI,0
2,4001,NC(=S)NN=Cc1ccc(O)cn1,CI,0
3,4002,COC1C(OC(=O)c2ccc(C)[nH]2)C(O)C(Oc2ccc3c(O)c(N...,CM,1
4,4003,O=C1C=C2C=CC3CC2(O1)C1CCCCN31,CI,0
...,...,...,...,...
95,4094,C=C=C[PH](c1ccccc1)(c1ccccc1)c1ccccc1,CI,0
96,4095,CC(O)C[PH](c1ccccc1)(c1ccccc1)c1ccccc1,CI,0
97,4096,c1ccc([PH](Cc2ccco2)(c2ccccc2)c2ccccc2)cc1,CI,0
98,4097,c1ccc([PH](CC2CCCO2)(c2ccccc2)c2ccccc2)cc1,CI,0


In [79]:
print(train_data.shape)
print(train_data["HIV_active"].value_counts()) ### 1278 values are HIV inhibitors, rest are 0

(37128, 4)
HIV_active
0    35850
1     1278
Name: count, dtype: int64


In [80]:
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw

In [81]:
sample_smiles= train_data["smiles"][4:30].values

samples_mols = [Chem.MolFromSmiles(sample_smiles)]
print("sample molecules : ", samples_mols)

TypeError: No registered converter was able to produce a C++ rvalue of type std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > from this Python object of type numpy.ndarray

In [85]:
from torch_geometric.data import Dataset, Data
import numpy as np
import os
from rdkit.Chem import rdmolops
from tqdm import tqdm


In [92]:

import torch

class MoleculeDataset(Dataset):
    def __init__(self, root, filename, test=False, transform=None, pre_transform=None):
        """
        root = Where the dataset should be stored. This folder is split
        into raw_dir (downloaded dataset) and processed_dir (processed data). 
        """
        self.test = test
        self.filename = filename
        super(MoleculeDataset, self).__init__(root, transform, pre_transform)
        
    @property
    def raw_file_names(self):
        """ If this file exists in raw_dir, the download is not triggered.
            (The download func. is not implemented here)  
        """
        return self.filename

    @property
    def processed_file_names(self):
        """ If these files are found in raw_dir, processing is skipped"""
        self.data = pd.read_csv(self.raw_paths[0]).reset_index()

        if self.test:
            return [f'data_test_{i}.pt' for i in list(self.data.index)]
        else:
            return [f'data_{i}.pt' for i in list(self.data.index)]

    def download(self):
        pass

    def process(self):
        self.data = pd.read_csv(self.raw_paths[0])
        for index, mol in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            mol_obj = Chem.MolFromSmiles(mol["smiles"])
            # Get node features
            node_feats = self._get_node_features(mol_obj)
            # Get edge features
            edge_feats = self._get_edge_features(mol_obj)
            # Get adjacency info
            edge_index = self._get_adjacency_info(mol_obj)
            # Get labels info
            label = self._get_labels(mol["HIV_active"])

            # Create data object
            data = Data(x=node_feats, 
                        edge_index=edge_index,
                        edge_attr=edge_feats,
                        y=label,
                        smiles=mol["smiles"]
                        ) 
            if self.test:
                torch.save(data, 
                    os.path.join(self.processed_dir, 
                                 f'data_test_{index}.pt'))
            else:
                torch.save(data, 
                    os.path.join(self.processed_dir, 
                                 f'data_{index}.pt'))

    def _get_node_features(self, mol):
        """ 
        This will return a matrix / 2d array of the shape
        [Number of Nodes, Node Feature size]
        """
        all_node_feats = []

        for atom in mol.GetAtoms():
            node_feats = []
            # Feature 1: Atomic number        
            node_feats.append(atom.GetAtomicNum())
            # Feature 2: Atom degree
            node_feats.append(atom.GetDegree())
            # Feature 3: Formal charge
            node_feats.append(atom.GetFormalCharge())
            # Feature 4: Hybridization
            node_feats.append(atom.GetHybridization())
            # Feature 5: Aromaticity
            node_feats.append(atom.GetIsAromatic())
            # Feature 6: Total Num Hs
            node_feats.append(atom.GetTotalNumHs())
            # Feature 7: Radical Electrons
            node_feats.append(atom.GetNumRadicalElectrons())
            # Feature 8: In Ring
            node_feats.append(atom.IsInRing())
            # Feature 9: Chirality
            node_feats.append(atom.GetChiralTag())

            # Append node features to matrix
            all_node_feats.append(node_feats)

        all_node_feats = np.asarray(all_node_feats)
        return torch.tensor(all_node_feats, dtype=torch.float)

    def _get_edge_features(self, mol):
        """ 
        This will return a matrix / 2d array of the shape
        [Number of edges, Edge Feature size]
        """
        all_edge_feats = []

        for bond in mol.GetBonds():
            edge_feats = []
            # Feature 1: Bond type (as double)
            edge_feats.append(bond.GetBondTypeAsDouble())
            # Feature 2: Rings
            edge_feats.append(bond.IsInRing())
            # Append node features to matrix (twice, per direction)
            all_edge_feats += [edge_feats, edge_feats]

        all_edge_feats = np.asarray(all_edge_feats)
        return torch.tensor(all_edge_feats, dtype=torch.float)

    def _get_adjacency_info(self, mol):
        """
        We could also use rdmolops.GetAdjacencyMatrix(mol)
        but we want to be sure that the order of the indices
        matches the order of the edge features
        """
        edge_indices = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_indices += [[i, j], [j, i]]

        edge_indices = torch.tensor(edge_indices)
        edge_indices = edge_indices.t().to(torch.long).view(2, -1)
        return edge_indices

    def _get_labels(self, label):
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)

    def len(self):
        return self.data.shape[0]

    def get(self, idx):
        """ - Equivalent to __getitem__ in pytorch
            - Is not needed for PyG's InMemoryDataset
        """
        if self.test:
            data = torch.load(os.path.join(self.processed_dir, 
                                 f'data_test_{idx}.pt'))
        else:
            data = torch.load(os.path.join(self.processed_dir, 
                                 f'data_{idx}.pt'))   
        return data

In [95]:
dataset = MoleculeDataset(root = "data/", filename="HIV_train.csv")

Processing...
  0%|          | 0/37128 [00:00<?, ?it/s]


AttributeError: 'Mol' object has no attribute 'GetAtoms'

#### Model

In [97]:
import torch
import torch.nn.functional as F
from torch.nn import Sequential, Linear, BatchNorm1d, ReLU
from torch_geometric.nn import TransformerConv, GATConv, TopKPooling, BatchNorm
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
from torch_geometric.nn.conv.x_conv import XConv

In [98]:
torch.manual_seed(42)

<torch._C.Generator at 0x76bdc373fa10>

In [100]:
class GNN(torch.nn.Module):
    def __init__(self, feature_size):
        num_classes = 2
        embedding_size = 1024

        ## GNN layers

        self.conv1 = GATConv(feature_size, embedding_size, heads = 3, dropout = 0.3)  ## Numnber of heads = 3,m therefore output shape if embedding_size * 3
        self.head_transform1 = Linear(embedding_size * 3, embedding_size)    ## Here we change the embedding size back to normal: embedding_size * 3 ---> embedding_size
        self.pool1 = TopKPooling(embedding_size, ratop = 0.8)

        self.conv2 = GATConv(embedding_size, embedding_size, heads = 3, dropout= 0.3)
        self.head_transform2 = Linear(embedding_size * 3, embedding_size)
        self.pool2 = TopKPooling(embedding_size, ratio = 0.5)
        
        
        self.conv3 = GATConv(embedding_size * 3, embedding_size)
        self.head_transform3 = Linear(embedding_size * 3, embedding_size)
        self.pool3 =  TopKPooling(embedding_size, ratio = 0.2)

        ### Linear Layers
        self.linear1 = Linear(embedding_size * 2, 1024)
        self.liner2  = Linear(1024, num_classes)

    def forward(self, x, edge_attr, edge_index, batch_index):
        ## First block
        x = self.conv1(x, edge_index)
        x = self.head_transform1(x)

        x, edge_index, edge_attr, batch_index, _, _ = self.pool1(x, 
                                                                 edge_index, 
                                                                 None, 
                                                                 batch_index)
        

        x1 = torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim = 1)   ### Intermediate results

        ## Second block

        x = self.conv2(x, edge_index)
        x = self.head_transform2(x)

        x, edge_index, edge_attr, batch_index, _, _ = self.pool2(x, 
                                                                 edge_index, 
                                                                 None, 
                                                                 batch_index)
        

        x2 = torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim = 1)

        ### Third block

        x = self.conv3(x, edge_index)
        x = self.head_transform3(x)

        x, edge_index, edge_attr, batch_index, _, _ = self.pool3(x, 
                                                                 edge_index, 
                                                                 None, 
                                                                 batch_index)
        

        x3 = torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim = 1)


        ### Concat pooled vectors
        x  = x1 + x2 + x3     

        ## Output block

        x = self.linear(x).relu()
        x = F.dropout(x, p = 0.5, training = self.training)
        x = self.linear2(x)

        return x