DeepChem tutorial at https://deepchem.io/docs/notebooks/protein_ligand_complex_notebook.html

Step 0: import packages, dependencies, etc.

In [None]:
#Run this cell if running on Google Colab. Downloads Miniconda, DeepChem and dependencies, and nglview, mdtraj packages.
# The cell mixes IPython magics (%), shell escapes (!) and plain Python, and the
# install commands below depend on running in exactly this order.

# Reload edited modules automatically and keep the post-mortem debugger switched off.
%load_ext autoreload
%autoreload 2
%pdb off
# set DISPLAY = True when running tutorial
# (DISPLAY is not referenced anywhere else in the visible part of this notebook.)
DISPLAY = False
# set PARALLELIZE to true if you want to use ipyparallel
# (PARALLELIZE is not referenced anywhere else in the visible part of this notebook.)
PARALLELIZE = False
import warnings
# Silences every Python warning for the rest of the session, deprecation notices included.
# NOTE(review): consider narrowing this filter so real problems stay visible.
warnings.filterwarnings('ignore')

#install conda, rdkit, pytorch
# Fetch the Miniconda installer (-c resumes a partial download) and install it under /usr/local.
# NOTE(review): "Miniconda3-latest" is not pinned, while the sys.path entry at the end of this
# cell hard-codes python3.7 -- confirm the installer still ships that Python version.
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
# NOTE(review): rdkit is requested here and again two lines below; one of the two looks redundant.
!conda install -q -y -c conda-forge rdkit
!conda install pytorch torchvision cudatoolkit=9.0 -c pytorch -y
!conda install -y --prefix /usr/local -c conda-forge rdkit joblib simdna


# Disabled alternative: build DeepChem from a GitHub checkout instead of the conda package.
#!git clone https://github.com/deepchem/deepchem.git      # Clone deepchem source code from GitHub
#!cd deepchem && python setup.py install
#!ls -la /usr/local/lib/python3.7/site-packages/deepchem
# Install the pinned DeepChem 2.1.0 conda package.
!conda install -c deepchem -c rdkit -c conda-forge -c omnia deepchem=2.1.0 -y

# Visualisation dependencies (nglview widget, mdtraj trajectories), installed into /usr/local.
#!conda install nglview -y -c conda-forge
!conda install nglview -y -c conda-forge --prefix /usr/local
!conda install mdtraj -y -c conda-forge --prefix /usr/local
import sys
# Make the packages conda placed under /usr/local importable from the running kernel.
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [1]:
import deepchem as dc
from deepchem.utils import download_url

from deepchem.utils.evaluate import Evaluator
from urllib.request import urlretrieve  # Python 3
import tempfile
import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import mdtraj as md
import nglview
print("All modules imported.")

Step 1: retrieve and visualize dataset

In [3]:
# Remote location and local file name of the gzipped CSV used in this tutorial.
DATASET_URL = "https://s3-us-west-1.amazonaws.com/deepchem.io/datasets/pdbbind_core_df.csv.gz"
DATASET_NAME = "pdbbind_core_df.csv.gz"

# Resolve the DeepChem data directory once and use it for both the download
# target and the load path, so the two can never disagree.
data_dir = dc.utils.get_data_dir()
dataset_file = os.path.join(data_dir, DATASET_NAME)

# Only hit the network when no local copy exists; this keeps "Restart & Run All"
# cheap. Delete the file by hand to force a fresh download (e.g. after an
# interrupted transfer left a truncated archive behind).
if not os.path.exists(dataset_file):
  download_url(DATASET_URL, dest_dir=data_dir)

raw_dataset = dc.utils.save.load_from_disk(dataset_file)

In [4]:
# Sanity-check what was loaded: container type, the first five rows, overall shape.
print("Type of dataset is: {}".format(type(raw_dataset)))
print(raw_dataset.head(5))
print("Shape of dataset is: {}".format(raw_dataset.shape))

Type of dataset is: <class 'pandas.core.frame.DataFrame'>
  pdb_id                                             smiles  \
0   2d3u        CC1CCCCC1S(O)(O)NC1CC(C2CCC(CN)CC2)SC1C(O)O   
1   3cyx  CC(C)(C)NC(O)C1CC2CCCCC2C[NH+]1CC(O)C(CC1CCCCC...   
2   3uo4        OC(O)C1CCC(NC2NCCC(NC3CCCCC3C3CCCCC3)N2)CC1   
3   1p1q                         CC1ONC(O)C1CC([NH3+])C(O)O   
4   3ag9  NC(O)C(CCC[NH2+]C([NH3+])[NH3+])NC(O)C(CCC[NH2...   

                                          complex_id  \
0    2d3uCC1CCCCC1S(O)(O)NC1CC(C2CCC(CN)CC2)SC1C(O)O   
1  3cyxCC(C)(C)NC(O)C1CC2CCCCC2C[NH+]1CC(O)C(CC1C...   
2    3uo4OC(O)C1CCC(NC2NCCC(NC3CCCCC3C3CCCCC3)N2)CC1   
3                     1p1qCC1ONC(O)C1CC([NH3+])C(O)O   
4  3ag9NC(O)C(CCC[NH2+]C([NH3+])[NH3+])NC(O)C(CCC...   

                                         protein_pdb  \
0  ['HEADER    2D3U PROTEIN\n', 'COMPND    2D3U P...   
1  ['HEADER    3CYX PROTEIN\n', 'COMPND    3CYX P...   
2  ['HEADER    3UO4 PROTEIN\n', 'COMPND    3UO4 P...   
3 

In [5]:
def combine_mdtraj(protein, ligand):
  """Append a ligand to a protein trajectory and return the protein object.

  The ligand's atoms are added to ``protein.topology`` as one new residue named
  "LIG" (resSeq 1) on a newly created chain, and the ligand coordinates are
  stacked after the protein's along the second (atom) axis of ``xyz``.

  Note that ``protein`` is modified in place; the returned value is that same
  object, not a copy. ``ligand`` is only read.
  """
  topology = protein.topology
  ligand_chain = topology.add_chain()
  ligand_residue = topology.add_residue("LIG", ligand_chain, resSeq=1)
  # Register the atoms before touching the coordinates, as the original did.
  for ligand_atom in ligand.topology.atoms:
    topology.add_atom(ligand_atom.name, ligand_atom.element, ligand_residue)
  protein.xyz = np.hstack((protein.xyz, ligand.xyz))
  topology.create_standard_bonds()
  return protein

In [6]:
def visualize_complex(complex_mdtraj, cutoff=0.5):
  """Build an NGL widget showing a protein-ligand complex and its binding pocket.

  Args:
    complex_mdtraj: trajectory whose topology contains the ligand as a residue
      whose string form contains "LIG" (as produced by ``combine_mdtraj``).
    cutoff: distance passed to ``md.compute_neighbors`` to decide which atoms
      belong to the binding pocket. Defaults to 0.5, the value that used to be
      hard-coded (mdtraj distances are conventionally in nanometres).

  Returns:
    An ``nglview.NGLWidget`` with a cartoon of the protein, licorice sticks for
    the pocket residues (omitted when no pocket atom is found) and a
    ball+stick rendering of the ligand.
  """
  topology = complex_mdtraj.topology
  ligand_atoms = [a.index for a in topology.atoms if "LIG" in str(a.residue)]

  # Neighbours are taken from the first frame only. Skip the search entirely
  # when the complex holds no ligand atoms to query around.
  if ligand_atoms:
    binding_pocket_atoms = md.compute_neighbors(complex_mdtraj, cutoff, ligand_atoms)[0]
  else:
    binding_pocket_atoms = []

  # Unique residue numbers, sorted so the selection string is reproducible.
  binding_pocket_residues = sorted(
      {topology.atom(a).residue.resSeq for a in binding_pocket_atoms})

  # Wrap the in-memory trajectory for nglview.
  traj = nglview.MDTrajTrajectory(complex_mdtraj)
  ngltraj = nglview.NGLWidget(traj)

  representations = [
      {"type": "cartoon", "params": {"sele": "protein", "color": "residueindex"}},
  ]
  # An empty pocket would yield the malformed selection "(not hydrogen) and ()",
  # so the licorice layer is only added when there is something to select.
  if binding_pocket_residues:
    pocket_selection = " or ".join(str(r) for r in binding_pocket_residues)
    representations.append(
        {"type": "licorice", "params": {
            "sele": "(not hydrogen) and (%s)" % pocket_selection}})
  representations.append({"type": "ball+stick", "params": {"sele": "LIG"}})

  ngltraj.representations = representations
  return ngltraj

In [7]:
def visualize_ligand(ligand_mdtraj):
  """Return an NGL widget that draws the whole ligand trajectory as ball+stick.

  The trajectory is handed to nglview through its MDTraj adaptor; nothing is
  fetched from a remote source here.
  """
  widget = nglview.NGLWidget(nglview.MDTrajTrajectory(ligand_mdtraj))
  # A single representation covering every atom.
  ball_and_stick = {"type": "ball+stick", "params": {"sele": "all"}}
  widget.representations = [ball_and_stick]
  return widget

In [8]:
def convert_lines_to_mdtraj(molecule_lines):
  """Load PDB text, stored as a (stringified) list of lines, with mdtraj.

  Args:
    molecule_lines: either the ``repr`` of a Python list of PDB lines, as the
      dataset's ``*_pdb`` columns store it (e.g. ``"['HEADER ...\\n', ...]"``),
      or an actual sequence of line strings.

  Returns:
    Whatever ``md.load`` returns for the reconstructed ``.pdb`` file.

  The lines are written to a scratch file because ``md.load`` is given a path;
  the scratch directory is removed again before returning.
  """
  import ast
  import shutil

  if isinstance(molecule_lines, str):
    # Parse the list literal properly. The previous strip/replace/split
    # approach deleted apostrophes inside atom names (e.g. O5'), kept the
    # double quotes repr() uses for such lines, and split any line that
    # happened to contain ", ".
    try:
      parsed = ast.literal_eval(molecule_lines)
    except (ValueError, SyntaxError):
      parsed = None
    if isinstance(parsed, (list, tuple)):
      molecule_lines = [str(line) for line in parsed]
    else:
      # Not a well-formed list literal: fall back to the legacy text munging
      # so inputs that used to work keep working.
      molecule_lines = molecule_lines.strip('[').strip(']').replace("'","").replace("\\n", "").split(", ")

  # Each line is written with exactly one newline below, so drop any it carries.
  pdb_lines = [line.rstrip("\r\n") for line in molecule_lines]

  tempdir = tempfile.mkdtemp()
  try:
    molecule_file = os.path.join(tempdir, "molecule.pdb")
    with open(molecule_file, "w") as f:
      f.writelines("%s\n" % line for line in pdb_lines)
    return md.load(molecule_file)
  finally:
    # mkdtemp() never cleans up after itself; avoid leaking one directory per call.
    shutil.rmtree(tempdir, ignore_errors=True)

In [9]:
# Use the first row of the table as the worked example.
first_row = raw_dataset.iloc[0]
first_protein = first_row["protein_pdb"]
first_ligand = first_row["ligand_pdb"]

# Turn each stored structure into an mdtraj object.
protein_mdtraj = convert_lines_to_mdtraj(first_protein)
ligand_mdtraj = convert_lines_to_mdtraj(first_ligand)

# combine_mdtraj extends the protein trajectory it receives and returns that
# same object, so protein_mdtraj also contains the ligand after this line.
complex_mdtraj = combine_mdtraj(protein_mdtraj, ligand_mdtraj)

In [10]:
# Build the NGL widget for the combined protein-ligand trajectory.
ngltraj = visualize_complex(complex_mdtraj)
# A bare expression on the last line of the cell asks Jupyter to display the widget.
# NOTE(review): the output saved with this cell is a PermissionError on a temporary
# .pdb file under a Windows user profile, so the widget did not render when this was
# last run -- confirm the cause before relying on this cell.
ngltraj

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\darne\\AppData\\Local\\Temp\\tmpamxvwmj0.pdb'

Step 2: featurize the dataset

Step 3: train model

Step 4: optimize hyperparameters

Step 5: plot/visualize performance