# PyTorch Geometric Dataset for MoleculeNet HIV Dataset

## Setup

### import

In [14]:
import os
import requests
from pathlib import Path

OS version: {os.__version__}


In [42]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [10]:
import torch
import torch_geometric
from torch_geometric.data import Dataset, Data

In [20]:
print(f'torch version: {torch.__version__}')
print(f'cuda available: {torch.cuda.is_available()}')
print(f'torch geometric version: {torch_geometric.__version__}')

torch version: 2.0.1
cuda available: False
torch geometric version: 2.3.1


In [21]:
from rdkit import Chem

### data

In [6]:
directory_path = Path('data')
if not directory_path.exists():
    directory_path.mkdir(parents=True)
    print(f'Directory {directory_path} created.')

In [9]:
file_url = 'https://github.com/deepchem/deepchem/raw/master/examples/hiv/HIV.csv'
file_path = directory_path / 'HIV.csv'
if not file_path.exists():
    response = requests.get(file_url)
    with open(file_path, 'wb') as file:
        file.write(response.content)
    print(f"File '{file_path}' download.")

## Dataset

In [51]:
class MoleculeDataset(Dataset):
    def __init__(self, root, filename, test=False, transform=None, pre_transform=None):
        self.test = test
        self.filename = filename
        super(MoleculeDataset, self).__init__(root, transform, pre_transform)
    
    @property
    def raw_file_names(self):
        return self.filename

    @property
    def processed_file_names(self):
        self.data = pd.read_csv(self.raw_paths[0]).reset_index()
        
        if self.test:
            return [f'data_test_{i}.pt' for i in list(self.data.index)]
        else:
            return [f'data_{i}.pt' for i in list(self.data.index)]
    
    def download(self):
        pass

    def process(self):
        self.data = pd.read_csv(self.raw_paths[0])
        for index, mol in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            mol_obj = Chem.MolFromSmiles(mol["smiles"])
            # Get node features
            node_feats = self._get_node_features(mol_obj)
            # Get edge features
            edge_feats = self._get_edge_features(mol_obj)
            # Get adjacency info
            edge_index = self._get_adjacency_info(mol_obj)
            # Get label info
            label = self._get_label(mol["HIV_active"]
            
            # Create data object
            data = Data(x=node_feats,
                        edge_index=edge_index,
                        edge_attr=edge_feats,
                        y=label,
                        smiles=mol["smiles"]
                        )
            # Save to torch files
            if self.test:
                torch.save(data, os.path.join(self.processed_dir,
                                              f'data_test_{index}.pt'))
            else:
                torch.save(data, os.path.join(self.processed_dir,
                                              f'data_{index}.pt'))
    
    def _get_node_features(self, mol):
        all_node_feats = []

        for atom in mol.GetAtoms():
            node_feats = []
            # feature 1: atomic number
            node_feats.append(atom.GetAtomicNum())
            # feature 2: atom degree
            node_feats.append(atom.GetDegree())
            # feature 3: formal charge
            node_feats.append(atom.GetFormalCharge())
            # feature 4: hydridization
            node_feats.append(atom.GetHybridization())
            # feature 5: aromaticity
            node_feats.append(atom.GetIsAromatic())
            # feature 6: total num hs
            node_feats.append(atom.GetTotalNumHs(includeNeighbors=True))
            # feature 7: radical electrons
            node_feats.append(atom.GetNumRadicalElectrons())
            # feature 8: in ring
            node_feats.append(atom.IsInRing())
            # feature 9: chirality
            node_feats.append(atom.GetChiralTag())

            # append node features to matrix
            all_node_feats.append(node_feats)
        
        all_node_feats = np.array(all_node_feats)
        return torch.tensor(all_node_feats, dtype=torch.float)

    def _get_edge_features(self, mol):
        all_edge_feats = []

        for bond in mol.GetBonds():
            edge_feats = []
            # feature 1: bond type
            edge_feats.append(bond.GetBondTypeAsDouble())
            # feature 2: rings
            edge_feats.append(bond.IsInRing())
            # append node features to matrix(twice per direction)
            all_edge_feat += [edge_feats, edge_feats]
        
        all_edge_feats = np.asarray(all_edge_feats)
        return torch.tensor(all_edge_feats, dtype=torch.float)
    
    def _get_adjecency_info(self, mol):
        edge_indices = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_indices += [[i, j], [j, i]]
        
        edge_index = torch.tensor(edge_indices)
        edge_indices = edge_indices.t().to(torch.long).view(2, -1)
        return edge_indices

    def _get_labels(self, label):
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)
    
    def len(self):
        return self.data.shape[0]

    def get(self, idx):
        if self.test:
            data = torch.load(os.path.join(self.processed_dir,
                                           f'data_test_{idx}.pt'))
        else:
            data = torch.load(os.path.join(self.processed_dir,
                                           f'data_{idx}.pt'))
        return data

In [50]:
MoleculeDataset(root=directory_path, filename='HIV.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/HIV.csv'