# PyTorch Geometric GNN for MoleculeNet HIV Dataset

# create dataset

In [2]:
import os
import requests
from pathlib import Path

In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [4]:
import torch
import torch_geometric
from torch_geometric.data import Dataset, Data

In [5]:
# Report the installed library versions and whether CUDA is usable,
# so the recorded notebook outputs can be tied to a specific environment.
print(f'torch version: {torch.__version__}')
print(f'cuda available: {torch.cuda.is_available()}')
print(f'torch geometric version: {torch_geometric.__version__}')

torch version: 2.0.1
cuda available: False
torch geometric version: 2.3.1


In [6]:
from rdkit import Chem

In [27]:
class MoleculeDataset(Dataset):
    """PyG dataset that turns a CSV of SMILES strings into per-molecule graph files.

    Every row of the raw CSV (the columns ``smiles`` and ``HIV_active`` are
    read) is featurized into one graph and saved as its own ``.pt`` file in
    ``processed_dir``; ``get`` loads a single file back from disk.
    """

    def __init__(self, root, filename, test=False, transform=None, pre_transform=None):
        """
        root = Where the dataset should be stored. This folder is split
        into raw_dir (downloaded dataset) and processed_dir (processed data).
        filename = Name of the CSV file expected inside raw_dir.
        test = If True, processed files use the 'data_test_' prefix
        instead of 'data_'.
        transform, pre_transform = Passed through unchanged to the base Dataset.
        """
        # These attributes are read by the properties below, so they must
        # exist before the base initializer runs (it is expected to query
        # raw_file_names / processed_file_names).
        self.test = test
        self.filename = filename
        # Zero-argument super() does not look the class up by its global
        # name, so it keeps working if that name is rebound (e.g. the class
        # is defined a second time).
        super().__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        """ If this file exists in raw_dir, the download is not triggered.
            (The download func. is not implemented here)
        """
        return self.filename

    @property
    def processed_file_names(self):
        """ If these files are found in processed_dir, processing is skipped.

            Side effect: (re)loads the raw CSV into self.data, which len()
            relies on.
        """
        self.data = pd.read_csv(self.raw_paths[0]).reset_index()
        return [self._processed_name(i) for i in self.data.index]

    def _processed_name(self, idx):
        """ Return the processed file name for sample `idx`; test splits get
            their own prefix so both can share one processed_dir.
        """
        prefix = 'data_test' if self.test else 'data'
        return f'{prefix}_{idx}.pt'

    def download(self):
        """ Nothing to download: the raw CSV must already be in raw_dir. """
        pass

    def process(self):
        """ Featurize every CSV row into a graph and save one file per row.

            Raises ValueError if a SMILES string cannot be parsed by RDKit.
        """
        # Imported here so the class does not depend on a module-level
        # `dc` name being defined by some other cell.
        import deepchem as dc

        self.data = pd.read_csv(self.raw_paths[0]).reset_index()
        featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
        for index, row in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            smiles = row["smiles"]
            # Featurize molecule
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # RDKit signals an unparseable SMILES by returning None;
                # fail with a clear message instead of deep in the featurizer.
                raise ValueError(
                    f"Could not parse SMILES at row {index}: {smiles!r}")
            f = featurizer._featurize(mol)
            data = f.to_pyg_graph()
            data.y = self._get_label(row["HIV_active"])
            data.smiles = smiles
            torch.save(data,
                       os.path.join(self.processed_dir,
                                    self._processed_name(index)))

    def _get_label(self, label):
        """ Wrap a scalar label into an int64 tensor holding one element. """
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)

    def len(self):
        """ Number of rows in the CSV loaded by processed_file_names/process. """
        return self.data.shape[0]

    def get(self, idx):
        """ - Equivalent to __getitem__ in pytorch
            - Is not needed for PyG's InMemoryDataset
        """
        return torch.load(os.path.join(self.processed_dir,
                                       self._processed_name(idx)))

# featurize dataset

In [1]:
import pandas as pd
import torch
import torch_geometric
from torch_geometric.data import Dataset
import numpy as np
import os
from tqdm import tqdm
import deepchem as dc
from rdkit import Chem

In [3]:
# Report the installed library versions and whether CUDA is usable,
# so the recorded notebook outputs can be tied to a specific environment.
print(f"Torch version: {torch.__version__}")
print(f"Cuda available: {torch.cuda.is_available()}")
print(f"Torch geometric version: {torch_geometric.__version__}")

Torch version: 2.0.1
Cuda available: False
Torch geometric version: 2.3.1


In [2]:
class MoleculeDataset(Dataset):
    """pyg dataset that turns a csv of smiles strings into per-molecule graph files

    every row of the raw csv (the columns ``smiles`` and ``HIV_active`` are
    read) is featurized into one graph and saved as its own ``.pt`` file in
    ``processed_dir``; ``get`` loads a single file back from disk
    """

    def __init__(self, root, filename, test=False, transform=None, pre_transform=None):
        """
        root = where the dataset should be stored; this folder is split
        into raw_dir (downloaded dataset) and processed_dir (processed data)
        filename = name of the csv file expected inside raw_dir
        test = if True, processed files use the 'data_test_' prefix
        instead of 'data_'
        transform, pre_transform = passed through unchanged to the base Dataset
        """
        # these attributes are read by the properties below, so they must
        # exist before the base initializer runs (it is expected to query
        # raw_file_names / processed_file_names)
        self.test = test
        self.filename = filename
        # zero-argument super() does not look the class up by its global
        # name, so it keeps working if that name is rebound (e.g. the class
        # is defined more than once)
        super().__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        """ if this file exists in raw_dir, the download is not triggered
            (the download func. is not implemented here)
        """
        return self.filename

    @property
    def processed_file_names(self):
        """ if these files are found in processed_dir, processing is skipped

            side effect: (re)loads the raw csv into self.data, which len()
            relies on
        """
        self.data = pd.read_csv(self.raw_paths[0]).reset_index()
        return [self._processed_name(i) for i in self.data.index]

    def _processed_name(self, idx):
        """ return the processed file name for sample `idx`; test splits get
            their own prefix so both can share one processed_dir
        """
        prefix = 'data_test' if self.test else 'data'
        return f'{prefix}_{idx}.pt'

    def download(self):
        """ nothing to download: the raw csv must already be in raw_dir """
        pass

    def process(self):
        """ featurize every csv row into a graph and save one file per row

            raises ValueError if a smiles string cannot be parsed by rdkit
        """
        self.data = pd.read_csv(self.raw_paths[0]).reset_index()
        featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
        for index, row in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            smiles = row["smiles"]
            # featurize molecule
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # rdkit signals an unparseable smiles by returning None;
                # fail with a clear message instead of deep in the featurizer
                raise ValueError(
                    f"Could not parse SMILES at row {index}: {smiles!r}")
            f = featurizer._featurize(mol)
            data = f.to_pyg_graph()
            data.y = self._get_label(row["HIV_active"])
            data.smiles = smiles
            torch.save(data,
                       os.path.join(self.processed_dir,
                                    self._processed_name(index)))

    def _get_label(self, label):
        """ wrap a scalar label into an int64 tensor holding one element """
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)

    def len(self):
        """ number of rows in the csv loaded by processed_file_names/process """
        return self.data.shape[0]

    def get(self, idx):
        """ - equivalent to __getitem__ in pytorch
            - is not needed for PyG's InMemoryDataset
        """
        return torch.load(os.path.join(self.processed_dir,
                                       self._processed_name(idx)))

# oversample data

In [2]:
import pandas as pd

In [11]:
# load the training split and use its "index" column as the row labels,
# keeping the column itself in the frame as well
raw_frame = pd.read_csv("data/raw/HIV_train.csv")
data = raw_frame.set_index("index", drop=False)
data.head()

Unnamed: 0_level_0,index,smiles,activity,HIV_active
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3999,3999,CC1c2nc(N)nc(N)c2CN1C(=O)c1ccccc1,CI,0
4000,4000,Cc1nc(N)c2c(n1)C(C)N(C(=O)c1ccccc1)C2,CI,0
4001,4001,NC(=S)NN=Cc1ccc(O)cn1,CI,0
4002,4002,COC1C(OC(=O)c2ccc(C)[nH]2)C(O)C(Oc2ccc3c(O)c(N...,CM,1
4003,4003,O=C1C=C2C=CC3CC2(O1)C1CCCCN31,CI,0


In [13]:
# class balance: number of rows for each value of the HIV_active label
data["HIV_active"].value_counts()

0    35850
1     1278
Name: HIV_active, dtype: int64

In [12]:
# remember the "index" value of the first row; the IDs re-assigned after
# oversampling start counting from this value
start_index = data.iloc[0]["index"]

In [14]:
# check how many additional samples are needed:
# ratio of negative to positive rows, truncated to a whole number
label_counts = data["HIV_active"].value_counts()
neg_class = label_counts[0]
pos_class = label_counts[1]
multipler = int(neg_class / pos_class)
multipler

28

In [19]:
# replicate dataset for positive class:
# a list holding the positive-only frame `multipler` times
positive_rows = data[data["HIV_active"] == 1]
replicated_pos = [positive_rows] * multipler

In [20]:
# append replicated data
# DataFrame.append was removed in pandas 2.0; pd.concat over the original
# frame followed by the replicated frames gives the same result, and
# ignore_index=True renumbers the rows 0..n-1 as before
data = pd.concat([data, *replicated_pos], ignore_index=True)
data.shape

(72912, 4)

In [21]:
# shuffle dataset: draw every row once in random order (no seed is fixed),
# then replace the old row labels with a fresh 0..n-1 index
shuffled = data.sample(frac=1)
data = shuffled.reset_index(drop=True)

In [22]:
# re-assign index (this is our ID later):
# consecutive IDs beginning at start_index, used both as the row labels
# and as the "index" column
n_rows = len(data)
index = range(start_index, start_index + n_rows)
data["index"] = index
data.index = index
data.head()

Unnamed: 0,index,smiles,activity,HIV_active
3999,3999,CC(C)(C)OC(=O)Nc1nnc(S(N)(=O)=O)s1,CM,1
4000,4000,CCCc1cc(=O)oc2c3c(cc(OC(C)C)c12)OC(C)C(C)C3=O,CM,1
4001,4001,CSC(SC)=C(C#N)C(=O)Nc1ccc(Cl)cc1,CM,1
4002,4002,N#CC(C#N)=C1NC(=O)C(c2ccccc2)C(=O)N1,CI,0
4003,4003,Cc1cc2nc3n(c2cc1C)C(c1c(F)cccc1F)SC3,CM,1


In [23]:
# save the oversampled, shuffled frame next to the original raw file
# NOTE(review): to_csv writes the row index by default, so the file gets a
# leading unnamed column in addition to the "index" column - confirm that
# downstream readers are fine with it
data.to_csv("data/raw/HIV_train_oversampled.csv")