In [None]:
# IPython tab-completion settings for the interactive session: turn on the
# "greedy" completer option and turn off the jedi-based completer.
# NOTE(review): `IPCompleter.greedy` may be deprecated in newer IPython
# releases - confirm against the installed version.
%config IPCompleter.greedy=True
%config Completer.use_jedi = False

In [1]:
import os
import sys

In [2]:
import mdtraj as md
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

In [3]:
# Locations of the input data, expressed relative to the current user's home
# directory instead of one hard-coded account so the notebook also runs on
# other machines (for the original author this still resolves to the same
# /home/<user> directory).
PATH_USER = os.path.expanduser('~')
# Repository checkout, relative to PATH_USER.
PATH_FOLDER = 'challenge-iclr-2021/submission-somesh'
# CSV table listing the peptides; read into a DataFrame further down.
PATH_PEPTIDE_LIST = os.path.join(PATH_USER, PATH_FOLDER, 'Dataset', 'peptide_list.csv')
# Root folder holding one sub-folder per complex (e.g. 1a61_L), each with
# receptor.pdb and peptide.pdb.
PATH_PDB_FOLDER = os.path.join(PATH_USER, 'pepbdb')

In [4]:
# Read the peptide table, keep rows flagged noncanonical_peptide == 0 whose
# resolution lies in the closed interval [0, 5], renumber the index and keep
# only the first 10 rows.
df_peptide_list = (
    pd.read_csv(PATH_PEPTIDE_LIST)
    .loc[lambda table: table['noncanonical_peptide'].eq(0)
         & table['resolution'].between(0, 5)]
    .reset_index(drop=True)
    .iloc[:10]
)

In [5]:
def receptor_peptide_mdtraj(PATH_PDB_FOLDER, pdb_chain_id):
    """Reads the receptor and peptide PDB files of one complex with MDTraj.

    Args:
        PATH_PDB_FOLDER: str, directory containing one sub-folder per complex
        pdb_chain_id: str, name of the complex sub-folder, i.e. PDB ID
            followed by chain ID, example: 1a61_L

    Returns:
        tuple(receptor_obj, peptide_obj), the results of md.load_pdb on
        'receptor.pdb' and 'peptide.pdb' inside that sub-folder

    """
    complex_dir = os.path.join(PATH_PDB_FOLDER, pdb_chain_id)
    receptor_obj = md.load_pdb(os.path.join(complex_dir, 'receptor.pdb'))
    peptide_obj = md.load_pdb(os.path.join(complex_dir, 'peptide.pdb'))
    return receptor_obj, peptide_obj

In [6]:
def receptor_peptide_mol(PATH_PDB_FOLDER, pdb_chain_id):
    """Reads the receptor and peptide PDB files of one complex with RDKit.

    Args:
        PATH_PDB_FOLDER: str, directory containing one sub-folder per complex
        pdb_chain_id: str, name of the complex sub-folder, i.e. PDB ID
            followed by chain ID, example: 1a61_L

    Returns:
        tuple(receptor_obj, peptide_obj), the results of
        Chem.rdmolfiles.MolFromPDBFile on 'receptor.pdb' and 'peptide.pdb'
        inside that sub-folder

    """
    complex_dir = os.path.join(PATH_PDB_FOLDER, pdb_chain_id)
    read_pdb = Chem.rdmolfiles.MolFromPDBFile
    # Receptor first, peptide second: the order callers unpack.
    return tuple(
        read_pdb(os.path.join(complex_dir, pdb_name))
        for pdb_name in ('receptor.pdb', 'peptide.pdb'))

In [17]:
def coulomb_matrix_from_mol(rdkit_mol):
    """Computes the Coulomb-matrix featurization of an RDKit mol.

    Args:
        rdkit_mol: RDKit mol, receptor/peptide mol object

    Returns:
        np array built from the output of rdMolDescriptors.CalcCoulombMat
        (presumably (n_atoms, n_atoms) - TODO confirm against RDKit docs)

    """
    coulomb_entries = rdMolDescriptors.CalcCoulombMat(rdkit_mol)
    return np.array(coulomb_entries)

In [82]:
def pc_coordinates(mdtraj_obj):
    """Extracts the coordinates stored in the first frame of an MDTraj object.

    Args:
        mdtraj_obj: MDTraj object, receptor/peptide MDTraj object

    Returns:
        np array: first entry of mdtraj_obj.xyz, i.e. (n_atoms, 3)
        Cartesian coordinates when xyz is laid out as (n_frames, n_atoms, 3)

    """
    all_frames = mdtraj_obj.xyz
    first_frame = all_frames[0]
    return first_frame

In [84]:
# Atomic masses keyed by UPPER-CASE element symbol.
# Corrections relative to the earlier table: californium is keyed 'CF'
# (it was mistyped as 'CT'), and the Kr / Rb / Te masses had digit typos.
elements_dict = {
    'H': 1.008, 'HE': 4.003, 'LI': 6.941, 'BE': 9.012,
    'B': 10.811, 'C': 12.011, 'N': 14.007, 'O': 15.999,
    'F': 18.998, 'NE': 20.180, 'NA': 22.990, 'MG': 24.305,
    'AL': 26.982, 'SI': 28.086, 'P': 30.974, 'S': 32.066,
    'CL': 35.453, 'AR': 39.948, 'K': 39.098, 'CA': 40.078,
    'SC': 44.956, 'TI': 47.867, 'V': 50.942, 'CR': 51.996,
    'MN': 54.938, 'FE': 55.845, 'CO': 58.933, 'NI': 58.693,
    'CU': 63.546, 'ZN': 65.38, 'GA': 69.723, 'GE': 72.631,
    'AS': 74.922, 'SE': 78.971, 'BR': 79.904, 'KR': 83.798,
    'RB': 85.468, 'SR': 87.62, 'Y': 88.906, 'ZR': 91.224,
    'NB': 92.906, 'MO': 95.95, 'TC': 98.907, 'RU': 101.07,
    'RH': 102.906, 'PD': 106.42, 'AG': 107.868, 'CD': 112.414,
    'IN': 114.818, 'SN': 118.711, 'SB': 121.760, 'TE': 127.6,
    'I': 126.904, 'XE': 131.294, 'CS': 132.905, 'BA': 137.328,
    'LA': 138.905, 'CE': 140.116, 'PR': 140.908, 'ND': 144.243,
    'PM': 144.913, 'SM': 150.36, 'EU': 151.964, 'GD': 157.25,
    'TB': 158.925, 'DY': 162.500, 'HO': 164.930, 'ER': 167.259,
    'TM': 168.934, 'YB': 173.055, 'LU': 174.967, 'HF': 178.49,
    'TA': 180.948, 'W': 183.84, 'RE': 186.207, 'OS': 190.23,
    'IR': 192.217, 'PT': 195.085, 'AU': 196.967, 'HG': 200.592,
    'TL': 204.383, 'PB': 207.2, 'BI': 208.980, 'PO': 208.982,
    'AT': 209.987, 'RN': 222.081, 'FR': 223.020, 'RA': 226.025,
    'AC': 227.028, 'TH': 232.038, 'PA': 231.036, 'U': 238.029,
    'NP': 237, 'PU': 244, 'AM': 243, 'CM': 247, 'BK': 247,
    'CF': 251, 'ES': 252, 'FM': 257, 'MD': 258, 'NO': 259,
    'LR': 262, 'RF': 261, 'DB': 262, 'SG': 266, 'BH': 264,
    'HS': 269, 'MT': 268, 'DS': 271, 'RG': 272, 'CN': 285,
    'NH': 284, 'FL': 289, 'MC': 288, 'LV': 292, 'TS': 294,
    'OG': 294,
}

def pc_coordinates_mass(mdtraj_obj):
    """Featurizes MDTraj object as array of coordinates and atomic masses.

    Args:
        mdtraj_obj: MDTraj object, receptor/peptide MDTraj object

    Returns:
        (n_atoms, 4): np array, the first-frame coordinates from
        mdtraj_obj.xyz with the atomic mass appended as a fourth column

    Raises:
        KeyError: if an atom's element symbol has no entry in elements_dict

    """
    symbols = mdtraj_obj.topology.to_dataframe()[0]['element']
    masses = []
    for symbol in symbols:
        # The table is keyed by upper-case symbols, while the topology may
        # report two-letter elements in mixed case ('Cl', 'Zn', ...), so
        # normalise before the lookup.
        key = str(symbol).strip().upper()
        if key not in elements_dict:
            raise KeyError(
                'No atomic mass known for element symbol {!r}'.format(symbol))
        masses.append(elements_dict[key])
    mass = np.array(masses, dtype=float)
    return np.hstack([mdtraj_obj.xyz[0], mass.reshape(-1, 1)])

In [8]:
# Walk over the selected peptides and build the '<PDB_ID>_<chain ID>' folder
# name for each row; after the loop pdb_chain_id holds the value for the
# last row.
for idx in range(len(df_peptide_list)):
    pdb_chain_id = '_'.join(
        df_peptide_list.iloc[idx][['PDB_ID', 'peptide_chain_ID']])
    

In [85]:
pdb_chain_id

'1a61_L'

In [None]:
# `receptor_peptide_pdb` is not defined in this notebook; use the MDTraj
# loader `receptor_peptide_mdtraj`, which returns (receptor_obj, peptide_obj).
# NOTE(review): switch to `receptor_peptide_mol` if RDKit mols were intended.
receptor_peptide_mdtraj(PATH_PDB_FOLDER, pdb_chain_id)