In [1]:
# Convert SMILES strings to DGL bigraphs and use the energies as labels.
# Save the graphs and energies to disk, then load them back as inputs (X) and labels (y).

In [25]:
import torch
from rdkit import Chem
from dgllife.utils import smiles_to_bigraph
# from dgllife.utils import mol_to_bigraph
from dgl.data.utils import load_graphs
from dgl.data.utils import save_graphs
from dgl.dataloading import GraphDataLoader as gdl
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [26]:
# class MyDataset():
#     def __init__(self, df:str):
        
#         self.df = pd.read_csv(df)
#         self.name = os.path.basename(df)
        
#         self.bigraph = [] # X
#         self.labels =  np.ndarray([]) # y 
        

#     def featurize_atoms(self, mol):
#     # featurize atoms
#         feats = []
#         for atom in mol.GetAtoms():
#             feats.append(atom.GetAtomicNum())
#         return {'atomic': torch.tensor(feats).reshape(-1, 1).float()}
    
#     def featurize_bonds(self, mol):
#     # featurize bonds
#         feats = []
#         bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
#                     Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
#         for bond in mol.GetBonds():
#             btype = bond_types.index(bond.GetBondType())
#             feats.extend([btype, btype])
#         return {'type': torch.tensor(feats).reshape(-1, 1).float()}
    
#     def process(self):
#     # make bigraphs and labels (energy)
#         self.bigraph = list(self.df['Smiles'].apply(lambda x: smiles_to_bigraph(str(x), node_featurizer=self.featurize_atoms, edge_featurizer=self.featurize_bonds,explicit_hydrogens=True)))
#         self.labels = np.array(self.df["Energy"])
    
#     def __getitem__(self, idx):
#     # get one example by index
#         return self.graphs[idx], self.label[idx]

#     def __len__(self):
#     # number of graphs in dataset
#         return len(self.graphs)
    
#     def save(self, path):
#     # save graphs and labels
#         graph_path = os.path.join(path, self.name + '_dgl_graph.bin')
#         save_graphs(graph_path, self.bigraph, {'labels': torch.tensor(self.labels)})
        

In [27]:
from dgl.data import DGLDataset

In [34]:
class MyDataset(DGLDataset):
    """Molecular-graph dataset built from a CSV with ``Smiles`` and ``Energy`` columns.

    Each SMILES string is converted to a DGL bigraph (inputs, ``self.bigraph``)
    and the matching energy is used as the regression label (``self.labels``).

    Parameters
    ----------
    df : str
        Path to the CSV file. Its base name is also used as the dataset name
        and as the prefix of the saved graph file.
    url, raw_dir, save_dir, force_reload, verbose
        Forwarded unchanged to ``DGLDataset.__init__``.

    Notes
    -----
    The CSV is read *before* the base-class constructor is called because,
    per the DGL documentation, ``DGLDataset.__init__`` drives the
    load pipeline and invokes ``process()`` (and then ``save()``) itself.
    """

    # Bond types understood by ``featurize_bonds``; the position in this tuple
    # is the numeric feature value stored on the edge.
    BOND_TYPES = (Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
                  Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC)

    def __init__(self,
                 df: str,
                 url=None,
                 raw_dir=None,
                 save_dir=None,
                 force_reload=False,
                 verbose=False):

        self.df = pd.read_csv(df)

        self._name = os.path.basename(df)

        self.bigraph = []  # X: list of DGL graphs, filled by process()
        self.labels = np.empty(0)  # y: energies, filled by process()

        super().__init__(name=os.path.basename(df),
                         url=url,
                         raw_dir=raw_dir,
                         save_dir=save_dir,
                         force_reload=force_reload,
                         verbose=verbose)

    def _graph_path(self):
        """Return the file name used by ``save`` and ``load`` (relative to the cwd)."""
        return self._name + '_dgl_graph.bin'

    def featurize_atoms(self, mol):
        """Return node features: the atomic number of every atom, shape (n_atoms, 1)."""
        feats = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
        return {'atomic': torch.tensor(feats).reshape(-1, 1).float()}

    def featurize_bonds(self, mol):
        """Return edge features: the bond-type index, repeated twice per bond.

        Each bond contributes two consecutive entries because a bigraph stores
        one edge per direction.

        Raises
        ------
        ValueError
            If a bond has a type that is not listed in ``BOND_TYPES``.
        """
        feats = []
        for bond in mol.GetBonds():
            bond_type = bond.GetBondType()
            try:
                btype = self.BOND_TYPES.index(bond_type)
            except ValueError:
                raise ValueError(f"unsupported bond type: {bond_type}") from None
            feats.extend([btype, btype])
        return {'type': torch.tensor(feats).reshape(-1, 1).float()}

    def process(self):
        """Build the bigraphs from the ``Smiles`` column and read the ``Energy`` labels.

        Raises
        ------
        ValueError
            If any SMILES string could not be turned into a graph; continuing
            would leave ``None`` entries that cannot be saved or batched.
        """
        smiles = [str(s) for s in self.df['Smiles']]
        graphs = [
            smiles_to_bigraph(s,
                              node_featurizer=self.featurize_atoms,
                              edge_featurizer=self.featurize_bonds,
                              explicit_hydrogens=True)
            for s in smiles
        ]
        failed = [s for s, g in zip(smiles, graphs) if g is None]
        if failed:
            raise ValueError(
                f"could not build a graph for {len(failed)} SMILES, e.g. {failed[:5]}")
        self.bigraph = graphs
        self.labels = np.array(self.df["Energy"])

    def __getitem__(self, idx):
        """Return the ``(graph, label)`` pair at position ``idx``."""
        return self.bigraph[idx], self.labels[idx]

    def __len__(self):
        """Return the number of graphs in the dataset."""
        return len(self.bigraph)

    def save(self):
        """Write the graphs and their labels to ``<csv name>_dgl_graph.bin``."""
        save_graphs(self._graph_path(), self.bigraph, {'labels': torch.tensor(self.labels)})

    def load(self):
        """Read back the file written by ``save`` and return ``(graphs, labels)``.

        The loaded values are returned to the caller; the instance attributes
        are left untouched.
        """
        graphs, label_dict = load_graphs(self._graph_path())
        labels = label_dict['labels']
        return graphs, labels


In [35]:
# Build the dataset from the CSV at this relative path (resolved against the
# notebook's working directory). MyDataset.__init__ reads the file with pandas.
m = MyDataset( '../../Data/Solubility/EnergyDataset-B-Copy1.csv') # file path to csv

In [36]:
# Convert every SMILES string into a bigraph and collect the energy labels.
# NOTE(review): DGLDataset.__init__ appears to call process() already, so this
# probably repeats the featurization — confirm and drop if redundant.
m.process()

In [37]:
# Write the graphs and labels to '<csv file name>_dgl_graph.bin' in the current
# working directory.
# NOTE(review): DGLDataset.__init__ appears to call save() already — confirm
# whether this explicit call is still needed.
m.save()

In [51]:
# m.bigraph

[Graph(num_nodes=20, num_edges=38,
       ndata_schemes={'atomic': Scheme(shape=(1,), dtype=torch.float32)}
       edata_schemes={'type': Scheme(shape=(1,), dtype=torch.float32)}),
 Graph(num_nodes=15, num_edges=30,
       ndata_schemes={'atomic': Scheme(shape=(1,), dtype=torch.float32)}
       edata_schemes={'type': Scheme(shape=(1,), dtype=torch.float32)}),
 Graph(num_nodes=30, num_edges=60,
       ndata_schemes={'atomic': Scheme(shape=(1,), dtype=torch.float32)}
       edata_schemes={'type': Scheme(shape=(1,), dtype=torch.float32)}),
 Graph(num_nodes=12, num_edges=22,
       ndata_schemes={'atomic': Scheme(shape=(1,), dtype=torch.float32)}
       edata_schemes={'type': Scheme(shape=(1,), dtype=torch.float32)}),
 Graph(num_nodes=44, num_edges=94,
       ndata_schemes={'atomic': Scheme(shape=(1,), dtype=torch.float32)}
       edata_schemes={'type': Scheme(shape=(1,), dtype=torch.float32)}),
 Graph(num_nodes=15, num_edges=30,
       ndata_schemes={'atomic': Scheme(shape=(1,), dtype=tor

In [38]:
# def load(name, path):
#     # load graphs and labels
#     graph_path = os.path.join(path, name + '_dgl_graph.bin')
#     graphs, label_dict = load_graphs(graph_path)
#     labels = label_dict['labels']
#     return graphs, labels

In [39]:
# load X (list of graphs) and y (label tensor) back from disk
X, y = m.load() # takes no arguments: reads '<csv file name>_dgl_graph.bin' from the current working directory

In [40]:
# X

In [41]:
# y

In [42]:
# Fixed seed so the partition is identical on every Restart & Run All;
# without it train_test_split shuffles differently on each execution.
SPLIT_SEED = 42
# split into train and test (sklearn's default holds out 25% for test)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SPLIT_SEED)
# split train into train and validation (again 25% of what is left)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=SPLIT_SEED)

In [43]:
# Pair every graph with its label so that each dataset item is one
# (graph, label) sample. A bare `X_train, y_train` tuple has length 2
# (the whole graph list and the whole label tensor), which a DataLoader
# would treat as a two-sample dataset.
train_dataset = list(zip(X_train, y_train))
val_dataset = list(zip(X_val, y_val))
test_dataset = list(zip(X_test, y_test))

In [44]:
# One GraphDataLoader per split; all three share exactly the same settings.
loader_options = dict(batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
train_dataloader = gdl(train_dataset, **loader_options)
val_dataloader = gdl(val_dataset, **loader_options)
test_dataloader = gdl(test_dataset, **loader_options)

In [45]:
# type(X_train)

list