## Generate a dataset of voxels (on-the-fly) for training deep learning models

In [1]:
import sys
sys.path.append('../')

In [2]:
from docktgrid import VoxelDataset, VoxelGrid
from docktgrid.view import BasicView, VolumeView
from docktgrid.transforms import RandomRotation

In [3]:
# Build the voxel grid. The views are listed on their own because several can
# be combined; they are executed in the order given.
grid_views = [VolumeView(), BasicView()]
voxel = VoxelGrid(
    views=grid_views,
    vox_size=1.0,                 # size of the voxel (in Angstrom)
    box_dims=[24.0, 24.0, 24.0],  # dimensions of the box (in Angstrom)
)
voxel.shape  # last expression: shown as the cell output

(21, 24, 24, 24)

In [4]:
# Describe the dataset by PDB id; the protein and ligand file names are
# derived from each id.
pdbs = ["1xap", "2weg", "4bb9", "4qsu", "6std"]
protein_paths = [f"{pdb}_protein.pdb" for pdb in pdbs]
ligand_paths = [f"{pdb}_ligand.pdb" for pdb in pdbs]

data = VoxelDataset(
    protein_files=protein_paths,
    ligand_files=ligand_paths,
    labels=range(len(pdbs)),       # labels are just the sample indices 0..len(pdbs)-1
    voxel=voxel,
    transform=[RandomRotation()],  # use None if you don't want to apply any transformation
    root_dir="../tests/data/dataset",
)

len(data)

5

In [16]:
def iterate():
    """Print the voxel tensor shape and the label of every sample in `data`."""
    for voxels, label in data:
        print(voxels.shape, label)


# walk through the whole dataset once
iterate()

torch.Size([21, 24, 24, 24]) tensor(0.)
torch.Size([21, 24, 24, 24]) tensor(1.)
torch.Size([21, 24, 24, 24]) tensor(2.)
torch.Size([21, 24, 24, 24]) tensor(3.)
torch.Size([21, 24, 24, 24]) tensor(4.)


## Loading files beforehand (optional)

To avoid reading the molecular files every time, we can use the `scripts.preprocess_dataset` script to serialize molecular objects for faster loading. For more information on how to use the script, run `python -m scripts.preprocess_dataset --help`.

In [17]:
import os
import pickle

# preprocess the dataset
# python -m docktgrid.scripts.preprocess_dataset --pattern '*.pdb' --dir tests/data/dataset

In [18]:
# List everything in the processed directory and split it into protein and
# ligand file names. Sorting both lists the same way lines them up by PDB id.
files = os.listdir("../data/processed/")
protein_files = sorted(name for name in files if "protein" in name)
ligand_files = sorted(name for name in files if "ligand" in name)

print(protein_files)
print(ligand_files)

['1xap_protein.pdb.pkl', '2weg_protein.pdb.pkl', '4bb9_protein.pdb.pkl', '4qsu_protein.pdb.pkl', '6std_protein.pdb.pkl']
['1xap_ligand.pdb.pkl', '2weg_ligand.pdb.pkl', '4bb9_ligand.pdb.pkl', '4qsu_ligand.pdb.pkl', '6std_ligand.pdb.pkl']


Load the data into memory first:

In [19]:
def _load_pickle(path):
    """Deserialize the object stored at `path`, closing the file afterwards.

    NOTE: unpickling can execute arbitrary code; only load files that you
    generated yourself (e.g. with the preprocessing script).
    """
    with open(path, "rb") as fh:  # context manager guarantees the handle is closed
        return pickle.load(fh)


protein_mols = [_load_pickle(f"../data/processed/{f}") for f in protein_files]
ligand_mols = [_load_pickle(f"../data/processed/{f}") for f in ligand_files]

We can also exclude protein atoms that fall outside a sphere enclosing the voxel grid, centered on the ligand (optional):

In [20]:
from docktgrid.molparser import extract_binding_pocket
import numpy as np

# Radius of the sphere that contains the voxel grid: half the space diagonal of
# a cube whose side is the largest grid dimension. It does not depend on the
# protein, so it is computed once instead of on every iteration.
# NOTE(review): voxel.shape[1:] counts voxels, not Angstrom; the two coincide
# here only because vox_size is 1.0 -- confirm before changing vox_size.
radius = np.ceil(np.sqrt(3) * max(voxel.shape[1:]) / 2)

# Proteins and ligands are paired by position, so the lists must line up.
assert len(protein_mols) == len(ligand_mols), "protein/ligand lists differ in length"

for ptn, lig in zip(protein_mols, ligand_mols):
    # center of the pocket: mean of the ligand coordinates over dim 1
    pocket_center = lig.coords.mean(dim=1)
    inside_atoms_idx = extract_binding_pocket(ptn.coords, pocket_center, radius)

    # keep only the atoms inside the binding pocket, rewrite the MolecularData attributes
    ptn.coords = ptn.coords[:, inside_atoms_idx]
    ptn.element_symbols = ptn.element_symbols[inside_atoms_idx]

In [21]:
# Rebuild the dataset from the molecules already held in memory; each label is
# simply the position of the complex in the list.
sample_labels = range(len(protein_files))
data = VoxelDataset(
    protein_files=protein_mols,
    ligand_files=ligand_mols,
    labels=sample_labels,
    voxel=voxel,
    transform=[RandomRotation()],  # use None if you don't want to apply any transformation
    root_dir="../data/processed/",
)

## Iterating over the dataset

In [23]:
# walk through the dataset; each item unpacks into a (voxel tensor, label) pair
for voxels, label in data:
    print(voxels.shape, label)
    # your training code here...

torch.Size([21, 24, 24, 24]) tensor(0.)
torch.Size([21, 24, 24, 24]) tensor(1.)
torch.Size([21, 24, 24, 24]) tensor(2.)
torch.Size([21, 24, 24, 24]) tensor(3.)
torch.Size([21, 24, 24, 24]) tensor(4.)
