In [None]:
import argparse
import Bio.PDB
import numpy as np
import sys
import os
import shutil
from itertools import product
import pandas as pd
import math as m
import glob
import statistics
import collections
from scipy.spatial import distance
from biopandas.pdb import PandasPdb

parser = Bio.PDB.PDBParser(QUIET=True)

####os.chdir('/groups/cherkasvgrp/share/progressive_docking/pd_python_pose/protacs/test_benchmark_protac_models')
# here you input the desired ligase (FIRST) followed by the protein of protac target (SECOND)
# both should be ligand bound 
# this is ran after HDOCK docking job
pdb1_docked= str(sys.argv[1])
pdb2_docked= str(sys.argv[2])
model_0='model_1.pdb'

# parsing ligand files that were dissected from the original ligase and protein of protac target (bash script)
#pdb1_docked_splitted=str(pdb1_docked.split('.')[0])
#pdb2_docked_splitted=str(pdb2_docked.split('.')[0])

#pdb1_docked_splitted_csv=str(pdb1_docked_splitted.split('_')[3])
#pdb2_docked_splitted_csv=str(pdb2_docked_splitted.split('_')[0])
def parse_pdb_structure (pdb):
    return(parser.get_structure(str(pdb.rsplit( ".", 1 )[ 0 ]) , pdb))

#Deteterming the coordinate of ligase's linker attachment point
def get_ligase_ligand_info (e3_binder):
    #ligase_name=e3_binder.split('.')[0].split('_')[3]
    ligase_name=str(e3_binder)
    if ligase_name == 'IAP':
        x,y,z=35.114,-3.532,45.775
    elif ligase_name == 'RNF4':
        x,y,z=8.227,12.055,-7.251 
    elif ligase_name == 'VHL':
        x,y,z=25.631,-25.053,-15.098
    elif ligase_name == 'UBR1':
        x,y,z=18.010,-6.326,22.463
    elif ligase_name == 'MDM2':
        x,y,z=20.192,16.505,32.243
    elif ligase_name == 'CRBN':
        x,y,z=69.876,42.945,51.515
    elif ligase_name == 'RNF144':
        x,y,z=-10.241,-1.942,-8.284
    elif ligase_name == 'RNF114':
        x,y,z=-10.241,-1.942,-8.284
    else:
        print('None of the recognized E3 Binder is in your directory')
    return (np.array([[x,y,z]]))

# 
def combine_dicts(func, *dicts):
    default = collections.defaultdict(set)
    for d in dicts:
        for k, v in d.items():
            default[k].add(v)
    return {k: func(v) for k, v in default.items()}

# here we try to find the HDOCK model with the least distance between those two residues: ligase and POI
def find_best_protac_model (ligand_coor1):
    print('calculating best model which matches the least corresponding distance between those two residues ...')      
    avg_dis_dicts_models=[]
    for filename in glob.iglob('ligand_model_*.pdb'):
        ppdb = PandasPdb().read_pdb(filename)
        ligand_coor2=ppdb.df['HETATM'].iloc[:,11:14].to_numpy()

        close_atoms_ref=ligand_coor1
        close_atoms_comp=ligand_coor2
        
        min_dis_dicts_models=[]

        for i in close_atoms_ref:
            min_dis_dicts_models.append({str(filename):float(min(np.linalg.norm(i - j) for j in close_atoms_comp))})
            #print(f'{i} has minimum distance of {min(np.linalg.norm(i - j) for j in close_atoms_comp)} for model {filename}')
        avg_dis_dicts_models.append(combine_dicts(statistics.mean, *min_dis_dicts_models))
        
    avg_dis_dicts_models_dics={k: v for d in avg_dis_dicts_models for k, v in d.items()}
    best_model_1=min(avg_dis_dicts_models_dics, key=avg_dis_dicts_models_dics.get)
    avg_dis_dicts_models_dics_updated = {key:val for key, val in avg_dis_dicts_models_dics.items() if key != str(best_model_1)}
    best_model_2=min(avg_dis_dicts_models_dics_updated, key=avg_dis_dicts_models_dics_updated.get)
    avg_dis_dicts_models_dics_updated_2 = {key:val for key, val in avg_dis_dicts_models_dics_updated.items() if key != str(best_model_2)}
    best_model_3=min(avg_dis_dicts_models_dics_updated_2, key=avg_dis_dicts_models_dics_updated_2.get)
    avg_dis_dicts_models_dics_updated_3 = {key:val for key, val in avg_dis_dicts_models_dics_updated_2.items() if key != str(best_model_3)}
    best_model_4=min(avg_dis_dicts_models_dics_updated_3, key=avg_dis_dicts_models_dics_updated_3.get)
    avg_dis_dicts_models_dics_updated_4 = {key:val for key, val in avg_dis_dicts_models_dics_updated_3.items() if key != str(best_model_4)}
    best_model_5=min(avg_dis_dicts_models_dics_updated_4, key=avg_dis_dicts_models_dics_updated_4.get)
    avg_dis_dicts_models_dics_updated_5 = {key:val for key, val in avg_dis_dicts_models_dics_updated_4.items() if key != str(best_model_5)}
    best_model_6=min(avg_dis_dicts_models_dics_updated_5, key=avg_dis_dicts_models_dics_updated_5.get)
    avg_dis_dicts_models_dics_updated_6 = {key:val for key, val in avg_dis_dicts_models_dics_updated_5.items() if key != str(best_model_6)}
    best_model_7=min(avg_dis_dicts_models_dics_updated_6, key=avg_dis_dicts_models_dics_updated_6.get)
    avg_dis_dicts_models_dics_updated_7 = {key:val for key, val in avg_dis_dicts_models_dics_updated_6.items() if key != str(best_model_7)}
    best_model_8=min(avg_dis_dicts_models_dics_updated_7, key=avg_dis_dicts_models_dics_updated_7.get)
    avg_dis_dicts_models_dics_updated_8 = {key:val for key, val in avg_dis_dicts_models_dics_updated_7.items() if key != str(best_model_8)}
    best_model_9=min(avg_dis_dicts_models_dics_updated_8, key=avg_dis_dicts_models_dics_updated_8.get)
    avg_dis_dicts_models_dics_updated_9 = {key:val for key, val in avg_dis_dicts_models_dics_updated_8.items() if key != str(best_model_9)}
    best_model_10=min(avg_dis_dicts_models_dics_updated_9, key=avg_dis_dicts_models_dics_updated_9.get)

    
    return(best_model_1, best_model_2, best_model_3,best_model_4, best_model_5, best_model_6,best_model_7, best_model_8, best_model_9, best_model_10)

def get_energy_difference (model):
    
    FILENAME = model
    TARGET = "Score:"
    score=float()

    with open(FILENAME) as f:
        value = None
        start_seen = False
        for line in f:
            if TARGET in line:
                _,value = line.split(':  ')
                break

    if value is not None:
        score=value
        
    return (score)
######################################################################################################
#APPLYING


ligase_attach_coordinates=get_ligase_ligand_info (pdb1_docked)

best_model, best_model_2, best_model_3, best_model_4, best_model_5, best_model_6,best_model_7, best_model_8, best_model_9, best_model_10= find_best_protac_model(ligase_attach_coordinates)



print(f'-> your best model is in {best_model}')
print(f'-> your second best model is in {best_model_2}')
print(f'-> your third best model is in {best_model_3}')

first_model_energy=get_energy_difference(str(model_0))

best_protac_energy=get_energy_difference(str('_'.join(best_model.split('_')[1:])))
second_best_protac_energy=get_energy_difference(str('_'.join(best_model_2.split('_')[1:])))
third_best_protac_energy=get_energy_difference(str('_'.join(best_model_3.split('_')[1:])))
fourth_best_protac_energy=get_energy_difference(str('_'.join(best_model_4.split('_')[1:])))
fifth_best_protac_energy=get_energy_difference(str('_'.join(best_model_5.split('_')[1:])))
sixth_best_protac_energy=get_energy_difference(str('_'.join(best_model_6.split('_')[1:])))                                                             
seventh_best_protac_energy=get_energy_difference(str('_'.join(best_model_7.split('_')[1:])))
eigth_best_protac_energy=get_energy_difference(str('_'.join(best_model_8.split('_')[1:])))
ninth_best_protac_energy=get_energy_difference(str('_'.join(best_model_9.split('_')[1:])))
tenth_best_protac_energy=get_energy_difference(str('_'.join(best_model_10.split('_')[1:])))                                                            
                                                                    
print(f' Hdock predicted a score of {first_model_energy}: for the finest predicted pose')
print(f' Whereas Hdock predicted a score of {best_protac_energy}: for your best predicted protac model')
energy_sacrificed= float(first_model_energy) - float(best_protac_energy)
print(' The difference for energy sacrificed is {:.2f}'.format(energy_sacrificed))

dir_name=f'{pdb1_docked}_{pdb2_docked}_results'

if not os.path.exists(dir_name):
    os.makedirs(dir_name)
else:
    #shutil.rmtree(dir_name)           # Removes all the subdirectories!
    print('path ealready exists')
shutil.copy(best_model, dir_name)
shutil.copy(best_model_2, dir_name)
shutil.copy(best_model_3, dir_name)
shutil.copy(best_model_4, dir_name)
shutil.copy(best_model_5, dir_name)
shutil.copy(best_model_6, dir_name)
shutil.copy(best_model_7, dir_name)
shutil.copy(best_model_8, dir_name)
shutil.copy(best_model_9, dir_name)
shutil.copy(best_model_10, dir_name)
                                                                    
BEs = [float(best_protac_energy), float(second_best_protac_energy), float(third_best_protac_energy), float(fourth_best_protac_energy),float(fifth_best_protac_energy),float(sixth_best_protac_energy), float(seventh_best_protac_energy),float(eigth_best_protac_energy),float(ninth_best_protac_energy),float(tenth_best_protac_energy)]
models=[best_model, best_model_2, best_model_3, best_model_4,best_model_5, best_model_6,best_model_7, best_model_8, best_model_9, best_model_10]
models_modified=[str('_'.join(i.split('_')[1:])) for i in models]

if str(pdb1_docked) == 'UBR1':
    identify='Ubr1'
elif str(pdb1_docked) == 'IAP':
    identify='IAP'
elif str(pdb1_docked) == 'MDM2':
    identify='Mdm2'
elif str(pdb1_docked) == 'CRBN':
    identify='Cereblon'
elif str(pdb1_docked) == 'RNF114':
    identify='RNF144'
elif str(pdb1_docked) == 'RNF144':
    identify='RNF144'
elif str(pdb1_docked) == 'RNF4':
    identify='RNF4'
elif str(pdb1_docked) == 'VHL':
    identify='VHL'
else:
    print('None of the recognized E3 Binder is in your directory')
    
identifier=f'{identify}_{pdb2_docked.upper()}'

for be in range(10):
    data=[[str(pdb1_docked),str(pdb2_docked).upper(),BEs[be],models_modified[be],str(identifier)]]
    df = pd.DataFrame(data, columns = ['Ligase', 'POI', 'be_auto', 'model_name', 'identifier'])
    #df['be_auto']=BEs[be]
    #df['mode_name']=models_modified[be]
    #df['Ligase']=str(pdb1_docked)
    #df['POI']=str(pdb2_docked).upper()
    #df['identifier']=str(identifier)
    df.to_csv(f'output_{be}.csv', index=False, header=False)
    shutil.copy(f'output_{be}.csv', dir_name)

#df.to_csv('output.csv', index=False)
#shutil.copy('output.csv', dir_name)

#for file in glob.glob('model_*pdb'):
#    shutil.copy(file, dir_name)

print(f'Done {pdb1_docked} & {pdb2_docked} ! All data written to output.csv in directory {dir_name}')


In [32]:
import mdtraj as md 
pdbfile='ligand_model_8.pdb'
traj=md.load_pdb(pdbfile)
traj.xyz[-1]


[[2.3125, -3.2063000202178955, -1.38100004196167],
 [2.423799991607666, -3.1819000244140625, -1.4808000326156616],
 [2.494999885559082, -3.2641000747680664, -1.4745999574661255],
 [2.3719000816345215, -3.1768999099731445, -1.625499963760376],
 [2.255000114440918, -3.2023000717163086, -1.6516000032424927],
 [2.4663000106811523, -3.1561999320983887, -1.7273999452590942],
 [2.604099988937378, -3.113300085067749, -1.705399990081787],
 [2.700200080871582, -3.229099988937378, -1.7294000387191772],
 [2.6772000789642334, -3.2976999282836914, -1.8626999855041504],
 [2.717900037765503, -3.3907999992370605, -1.8478000164031982],
 [2.5304999351501465, -3.323899984359741, -1.892699956893921],
 [2.4374001026153564, -3.2042999267578125, -1.8633999824523926],
 [2.335599899291992, -3.244499921798706, -1.8645000457763672],
 [2.4407999515533447, -3.0940001010894775, -1.9703999757766724],
 [2.4133999347686768, -3.147700071334839, -2.1040000915527344],
 [2.390700101852417, -3.1903998851776123, -2.209599971

In [8]:
from biopandas.pdb import PandasPdb
ppdb = PandasPdb().read_pdb('ligand_model_8.pdb')
ppdb.df['HETATM'].iloc[:,11:14].to_numpy()

array([[ 23.125, -32.063, -13.81 ],
       [ 24.238, -31.819, -14.808],
       [ 24.95 , -32.641, -14.746],
       [ 23.719, -31.769, -16.255],
       [ 22.55 , -32.023, -16.516],
       [ 24.663, -31.562, -17.274],
       [ 26.041, -31.133, -17.054],
       [ 27.002, -32.291, -17.294],
       [ 26.772, -32.977, -18.627],
       [ 27.179, -33.908, -18.478],
       [ 25.305, -33.239, -18.927],
       [ 24.374, -32.043, -18.634],
       [ 23.356, -32.445, -18.645],
       [ 24.408, -30.94 , -19.704],
       [ 24.134, -31.477, -21.04 ],
       [ 23.907, -31.904, -22.096],
       [ 27.548, -32.437, -19.741],
       [ 27.922, -33.332, -20.915],
       [ 27.466, -34.756, -21.097],
       [ 28.489, -35.599, -21.858],
       [ 29.051, -34.845, -22.99 ],
       [ 29.65 , -33.54 , -22.677],
       [ 28.588, -32.648, -22.038],
       [ 27.83 , -32.438, -22.787],
       [ 29.14 , -31.352, -21.664],
       [ 28.726, -30.73 , -20.607],
       [ 27.947, -31.215, -19.686],
       [ 29.195, -29.46 , -2

In [10]:
pip install mdtraj


Collecting mdtraj
  Downloading mdtraj-1.9.6-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 5.2 MB/s eta 0:00:01
Collecting astunparse
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Installing collected packages: astunparse, mdtraj
Successfully installed astunparse-1.6.3 mdtraj-1.9.6
You should consider upgrading via the '/groups/cherkasvgrp/share/progressive_docking/pd_python/tensorflow_gpu/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import argparse
import Bio.PDB
import numpy as np
import sys
import os
import shutil
from itertools import product
import pandas as pd
import math as m
import glob
import statistics
import collections
from scipy.spatial import distance

parser = Bio.PDB.PDBParser(QUIET=True)

os.chdir('/groups/cherkasvgrp/share/progressive_docking/pd_python_pose/protacs/test_benchmark_protac_models')
# here you input the desired ligase (FIRST) followed by the protein of protac target (SECOND)
# both should be ligand bound 
# this is ran after HDOCK docking job
pdb1_docked= 'e3_ligase_5t35_VHL.pdb'   
pdb2_docked= '6ut0_1_docked_complex.pdb'    #str(sys.argv[2])     
model_0='model_1.pdb'

# parsing ligand files that were dissected from the original ligase and protein of protac target (bash script)
pdb1_docked_splitted=str(pdb1_docked.split('.')[0])
pdb2_docked_splitted=str(pdb2_docked.split('.')[0])

pdb1_docked_splitted_csv=str(pdb1_docked_splitted.split('_')[3])
pdb2_docked_splitted_csv=str(pdb2_docked_splitted.split('_')[0])
def parse_pdb_structure (pdb):
    return(parser.get_structure(str(pdb.rsplit( ".", 1 )[ 0 ]) , pdb))

#Deteterming the coordinate of ligase's linker attachment point
def get_ligase_ligand_info (e3_binder):
    ligase_name=e3_binder.split('.')[0].split('_')[3]
    coordinates=tuple()
    if ligase_name == 'IAP':
        coordinates=(35.114,-3.532,45.775)
    elif ligase_name == 'RNF4':
        coordinates=(14.308,45.353,-9.169)
    elif ligase_name == 'VHL':
        coordinates=(25.631,-25.053,-15.098)
    elif ligase_name == 'UBR1':
        coordinates=(18.010,-6.326,22.463)
    elif ligase_name == 'MDM2':
        coordinates=(20.192,16.505,32.243)
    elif ligase_name == 'CRBN':
        coordinates=(69.876,42.945,51.515)
    elif ligase_name == 'RNF144':
        coordinates=(-10.241,-1.942,-8.284)
    else:
        print('None of the recognized E3 Binder is in your directory')
    return (np.array((coordinates)))

# 
def combine_dicts(func, *dicts):
    default = collections.defaultdict(set)
    for d in dicts:
        for k, v in d.items():
            default[k].add(v)
    return {k: func(v) for k, v in default.items()}

# here we try to find the HDOCK model with the least distance between those two residues: ligase and POI
def find_best_protac_model (ligand_coor1):
    print('calculating best model which matches the least corresponding distance between those two residues ...')      
    avg_dis_dicts_models=[]
    for filename in glob.iglob('ligand_model_*.pdb'):
        structure_ligand=parser.get_structure(str(filename.rsplit( ".", 1 )[ 0 ]) , filename)
        ligand_coor2=list()
        for model in structure_ligand:
            for chain in model:
                for residue in chain:
                    for atom in residue:
                        x,y,z = atom.get_coord()

                        ligand_coor2.append([x,y,z])
        ligand_coor2=np.array(ligand_coor2)
        close_atoms_ref=ligand_coor1
        close_atoms_comp=ligand_coor2
        
        min_dis_dicts_models=[]
        print(close_atoms_ref)
        print(close_atoms_comp)
        for i in close_atoms_ref:
            min_dis_dicts_models.append({str(filename):float(min(np.linalg.norm(i - j) for j in close_atoms_comp))})
        print(min_dis_dicts_models)
            #print(f'{i} has minimum distance of {min(np.linalg.norm(i - j) for j in close_atoms_comp)} for model {filename}')
        avg_dis_dicts_models.append(combine_dicts(statistics.mean, *min_dis_dicts_models))
        
    avg_dis_dicts_models_dics={k: v for d in avg_dis_dicts_models for k, v in d.items()}
    best_model_1=min(avg_dis_dicts_models_dics, key=avg_dis_dicts_models_dics.get)
    avg_dis_dicts_models_dics_updated = {key:val for key, val in avg_dis_dicts_models_dics.items() if key != str(best_model_1)}
    best_model_2=min(avg_dis_dicts_models_dics_updated, key=avg_dis_dicts_models_dics_updated.get)
    avg_dis_dicts_models_dics_updated_2 = {key:val for key, val in avg_dis_dicts_models_dics_updated.items() if key != str(best_model_2)}
    best_model_3=min(avg_dis_dicts_models_dics_updated_2, key=avg_dis_dicts_models_dics_updated_2.get)
    avg_dis_dicts_models_dics_updated_3 = {key:val for key, val in avg_dis_dicts_models_dics_updated_2.items() if key != str(best_model_3)}
    best_model_4=min(avg_dis_dicts_models_dics_updated_3, key=avg_dis_dicts_models_dics_updated_3.get)
    avg_dis_dicts_models_dics_updated_4 = {key:val for key, val in avg_dis_dicts_models_dics_updated_3.items() if key != str(best_model_4)}
    best_model_5=min(avg_dis_dicts_models_dics_updated_4, key=avg_dis_dicts_models_dics_updated_4.get)
    avg_dis_dicts_models_dics_updated_5 = {key:val for key, val in avg_dis_dicts_models_dics_updated_4.items() if key != str(best_model_5)}
    best_model_6=min(avg_dis_dicts_models_dics_updated_5, key=avg_dis_dicts_models_dics_updated_5.get)
    avg_dis_dicts_models_dics_updated_6 = {key:val for key, val in avg_dis_dicts_models_dics_updated_5.items() if key != str(best_model_6)}
    best_model_7=min(avg_dis_dicts_models_dics_updated_6, key=avg_dis_dicts_models_dics_updated_6.get)
    avg_dis_dicts_models_dics_updated_7 = {key:val for key, val in avg_dis_dicts_models_dics_updated_6.items() if key != str(best_model_7)}
    best_model_8=min(avg_dis_dicts_models_dics_updated_7, key=avg_dis_dicts_models_dics_updated_7.get)
    avg_dis_dicts_models_dics_updated_8 = {key:val for key, val in avg_dis_dicts_models_dics_updated_7.items() if key != str(best_model_8)}
    best_model_9=min(avg_dis_dicts_models_dics_updated_8, key=avg_dis_dicts_models_dics_updated_8.get)
    avg_dis_dicts_models_dics_updated_9 = {key:val for key, val in avg_dis_dicts_models_dics_updated_8.items() if key != str(best_model_9)}
    best_model_10=min(avg_dis_dicts_models_dics_updated_9, key=avg_dis_dicts_models_dics_updated_9.get)

    
    return(best_model_1, best_model_2, best_model_3,best_model_4, best_model_5, best_model_6,best_model_7, best_model_8, best_model_9, best_model_10)

def get_energy_difference (model):
    
    FILENAME = model
    TARGET = "Score:"
    score=float()

    with open(FILENAME) as f:
        value = None
        start_seen = False
        for line in f:
            if TARGET in line:
                _,value = line.split(':  ')
                break

    if value is not None:
        score=value
        
    return (score)
######################################################################################################
#APPLYING


ligase_attach_coordinates=get_ligase_ligand_info (pdb1_docked)

best_model, best_model_2, best_model_3, best_model_4, best_model_5, best_model_6,best_model_7, best_model_8, best_model_9, best_model_10= find_best_protac_model(ligase_attach_coordinates)

print(best_model_9)

print(f'-> your best model is in {best_model}')
print(f'-> your second best model is in {best_model_2}')
print(f'-> your third best model is in {best_model_3}')

first_model_energy=get_energy_difference(str(model_0))

best_protac_energy=get_energy_difference(str('_'.join(best_model.split('_')[1:])))
second_best_protac_energy=get_energy_difference(str('_'.join(best_model_2.split('_')[1:])))
third_best_protac_energy=get_energy_difference(str('_'.join(best_model_3.split('_')[1:])))
fourth_best_protac_energy=get_energy_difference(str('_'.join(best_model_4.split('_')[1:])))
fifth_best_protac_energy=get_energy_difference(str('_'.join(best_model_5.split('_')[1:])))
sixth_best_protac_energy=get_energy_difference(str('_'.join(best_model_6.split('_')[1:])))                                                             
seventh_best_protac_energy=get_energy_difference(str('_'.join(best_model_7.split('_')[1:])))
eigth_best_protac_energy=get_energy_difference(str('_'.join(best_model_8.split('_')[1:])))
ninth_best_protac_energy=get_energy_difference(str('_'.join(best_model_9.split('_')[1:])))
tenth_best_protac_energy=get_energy_difference(str('_'.join(best_model_10.split('_')[1:])))                                                            
                                                                    
print(f' Hdock predicted a score of {first_model_energy}: for the finest predicted pose')
print(f' Whereas Hdock predicted a score of {best_protac_energy}: for your best predicted protac model')
energy_sacrificed= float(first_model_energy) - float(best_protac_energy)
print(' The difference for energy sacrificed is {:.2f}'.format(energy_sacrificed))

dir_name=f'{pdb1_docked_splitted}_{pdb2_docked_splitted}_results'

if not os.path.exists(dir_name):
    os.makedirs(dir_name)
else:
    #shutil.rmtree(dir_name)           # Removes all the subdirectories!
    print('path ealready exists')
shutil.copy(best_model, dir_name)
shutil.copy(best_model_2, dir_name)
shutil.copy(best_model_3, dir_name)
shutil.copy(best_model_4, dir_name)
shutil.copy(best_model_5, dir_name)
shutil.copy(best_model_6, dir_name)
shutil.copy(best_model_7, dir_name)
shutil.copy(best_model_8, dir_name)
shutil.copy(best_model_9, dir_name)
shutil.copy(best_model_10, dir_name)
                                                                    
BEs = [float(best_protac_energy), float(second_best_protac_energy), float(third_best_protac_energy), float(fourth_best_protac_energy),float(fifth_best_protac_energy),float(sixth_best_protac_energy), float(seventh_best_protac_energy),float(eigth_best_protac_energy),float(ninth_best_protac_energy),float(tenth_best_protac_energy)]
models=[best_model, best_model_2, best_model_3, best_model_4,best_model_5, best_model_6,best_model_7, best_model_8, best_model_9, best_model_10]
models_modified=[str('_'.join(i.split('_')[1:])) for i in models]

        
df = pd.DataFrame(columns = ['Ligase', 'POI', 'be_auto', 'model_name'])                                                           

df=df.assign(be_auto=BEs)                                                                    
df=df.assign(model_name=models_modified)                                                                    
df['Ligase']=pdb1_docked_splitted_csv
df['POI']=pdb2_docked_splitted_csv



df.to_csv('output.csv', index=False)
shutil.copy('output.csv', dir_name)

#for file in glob.glob('model_*pdb'):
#    shutil.copy(file, dir_name)

print(f'Done {pdb1_docked} & {pdb2_docked} ! All data written to output.csv in directory {dir_name}')


In [82]:
import pandas as pd
from functools import reduce


df_00=pd.read_csv('output_0.csv', usecols=['score'])
df_00=df_00.rename(columns={'score':'score1'})
df_01=pd.read_csv('output_1.csv', usecols=['score'])
df_01=df_01.rename(columns={'score':'score2'})
df_02=pd.read_csv('output_2.csv', usecols=['score'])
df_02=df_02.rename(columns={'score':'score3'})
df_03=pd.read_csv('output_3.csv', usecols=['score'])
df_03=df_03.rename(columns={'score':'score4'})
df_04=pd.read_csv('output_4.csv', usecols=['score'])
df_04=df_04.rename(columns={'score':'score5'})
df_05=pd.read_csv('output_5.csv', usecols=['score'])
df_05=df_05.rename(columns={'score':'score6'})
df_06=pd.read_csv('output_6.csv', usecols=['score'])
df_06=df_06.rename(columns={'score':'score7'})
df_07=pd.read_csv('output_7.csv', usecols=['score'])
df_07=df_07.rename(columns={'score':'score8'})
df_08=pd.read_csv('output_8.csv', usecols=['score'])
df_08=df_08.rename(columns={'score':'score9'})
df_09=pd.read_csv('output_9.csv', usecols=['score', 'ligase'])
df_09=df_09.rename(columns={'score':'score10', 'ligase':'pdb'})


data_frames = [df_00, df_01, df_02, df_03,df_04,df_05,df_06,df_07,df_08,df_09]
df_concatenated = pd.concat(data_frames, axis =1)
#df_concatenated.drop(df_concatenated.columns[0:2], inplace=True, axis=1)
df_concatenated['name']=df_concatenated.pdb.str.split('_').str[0]
df_concatenated.drop(['pdb'],axis=1,inplace=True)
df_concatenated['name']=df_concatenated['name'].str.upper()
df_concatenated

Unnamed: 0,score1,score2,score3,score4,score5,score6,score7,score8,score9,score10,name
0,-185.92,-170.4,-168.63,-165.06,-158.2,-173.92,-164.87,-150.79,-166.62,-186.04,5T35
1,-184.46,-193.87,-204.52,-205.98,-185.01,-191.93,-184.18,-203.2,-190.57,-182.11,6BN7
2,-183.32,-191.96,-202.35,-191.26,-186.56,-202.54,-225.64,-181.58,-184.38,-184.39,6BOY
3,-157.24,-156.98,-152.27,-162.55,-166.5,-169.62,-150.92,-151.12,-174.99,-174.37,6HAX
4,-187.05,-157.41,-168.67,-165.82,-154.93,-166.69,-165.77,-160.58,-155.04,-171.06,6HAY
5,-422.64,-449.12,-451.5,-422.95,-439.8,-430.98,-420.35,-431.47,-433.92,-474.94,6HR2
6,-161.4,-173.22,-161.94,-166.05,-162.02,-162.92,-152.73,-176.0,-155.32,-161.91,6SIS
7,-156.07,-167.03,-158.49,-162.68,-164.8,-209.91,-158.2,-190.9,-157.87,-189.73,6W7O
8,-162.26,-180.92,-174.72,-194.13,-168.68,-164.68,-162.64,-205.07,-164.89,-187.95,6W8I
9,-169.06,-176.32,-154.82,-156.5,-188.83,-155.61,-176.76,-175.6,-184.31,-159.34,6ZHC


In [46]:
from rdkit import Chem
from rdkit.Chem import Crippen
df_smiles=pd.read_csv('crystals_ligands_and_linkers.csv')
df_smiles['linker_lengths']  = df_smiles['linker'].str.len()
df_smiles['ligand'] = df_smiles['mol'].apply(Chem.MolFromSmiles)
df_smiles['linker'] = df_smiles['linker'].apply(Chem.MolFromSmiles)

df_smiles=df_smiles[['ligand','linker','linker_lengths','name']]
df_smiles['Hbond_acceptors'] = df_smiles['ligand'].apply(Chem.rdMolDescriptors.CalcNumHBA)
df_smiles['Hbond_donors'] = df_smiles['ligand'].apply(Chem.rdMolDescriptors.CalcNumHBD)
df_smiles['TPSA'] = df_smiles['ligand'].apply(Chem.rdMolDescriptors.CalcTPSA)
df_smiles['cLogP'] = df_smiles['ligand'].apply(Crippen.MolLogP)

df_smiles.drop(['ligand'], inplace=True, axis=1)
df_smiles

Unnamed: 0,linker,linker_lengths,name,Hbond_acceptors,Hbond_donors,TPSA,cLogP
0,<rdkit.Chem.rdchem.Mol object at 0x7f2ad1505670>,13,5T35,15,4,211.49,5.59668
1,<rdkit.Chem.rdchem.Mol object at 0x7f2ad15058a0>,16,6BN7,14,3,220.35,4.72104
2,<rdkit.Chem.rdchem.Mol object at 0x7f2accaee300>,15,6BOY,12,3,194.05,5.24286
3,<rdkit.Chem.rdchem.Mol object at 0x7f2ac0893b20>,14,6HAX,12,6,200.57,4.00092
4,<rdkit.Chem.rdchem.Mol object at 0x7f2acc763760>,11,6HAY,14,6,219.03,2.29142
5,<rdkit.Chem.rdchem.Mol object at 0x7f2acf73bda0>,14,6HR2,12,6,200.57,4.00092
6,<rdkit.Chem.rdchem.Mol object at 0x7f2acf73bd50>,17,6W7O,13,7,254.11,5.5895
7,<rdkit.Chem.rdchem.Mol object at 0x7f2acfa75c60>,18,6W8I,15,5,266.97,5.7803
8,<rdkit.Chem.rdchem.Mol object at 0x7f2acf7cb300>,13,7KHH,13,5,228.79,7.88482
9,<rdkit.Chem.rdchem.Mol object at 0x7f2acf7cb210>,22,6ZHC,21,4,274.49,7.02512


In [87]:
merged_df=df_smiles.merge(df_concatenated, how='outer', on='name')
merged_df=merged_df.iloc[:-1,:]
merged_df

Unnamed: 0,linker,linker_lengths,name,Hbond_acceptors,Hbond_donors,TPSA,cLogP,score1,score2,score3,score4,score5,score6,score7,score8,score9,score10
0,<rdkit.Chem.rdchem.Mol object at 0x7f2ad1505670>,13.0,5T35,15.0,4.0,211.49,5.59668,-185.92,-170.4,-168.63,-165.06,-158.2,-173.92,-164.87,-150.79,-166.62,-186.04
1,<rdkit.Chem.rdchem.Mol object at 0x7f2ad15058a0>,16.0,6BN7,14.0,3.0,220.35,4.72104,-184.46,-193.87,-204.52,-205.98,-185.01,-191.93,-184.18,-203.2,-190.57,-182.11
2,<rdkit.Chem.rdchem.Mol object at 0x7f2accaee300>,15.0,6BOY,12.0,3.0,194.05,5.24286,-183.32,-191.96,-202.35,-191.26,-186.56,-202.54,-225.64,-181.58,-184.38,-184.39
3,<rdkit.Chem.rdchem.Mol object at 0x7f2ac0893b20>,14.0,6HAX,12.0,6.0,200.57,4.00092,-157.24,-156.98,-152.27,-162.55,-166.5,-169.62,-150.92,-151.12,-174.99,-174.37
4,<rdkit.Chem.rdchem.Mol object at 0x7f2acc763760>,11.0,6HAY,14.0,6.0,219.03,2.29142,-187.05,-157.41,-168.67,-165.82,-154.93,-166.69,-165.77,-160.58,-155.04,-171.06
5,<rdkit.Chem.rdchem.Mol object at 0x7f2acf73bda0>,14.0,6HR2,12.0,6.0,200.57,4.00092,-422.64,-449.12,-451.5,-422.95,-439.8,-430.98,-420.35,-431.47,-433.92,-474.94
6,<rdkit.Chem.rdchem.Mol object at 0x7f2acf73bd50>,17.0,6W7O,13.0,7.0,254.11,5.5895,-156.07,-167.03,-158.49,-162.68,-164.8,-209.91,-158.2,-190.9,-157.87,-189.73
7,<rdkit.Chem.rdchem.Mol object at 0x7f2acfa75c60>,18.0,6W8I,15.0,5.0,266.97,5.7803,-162.26,-180.92,-174.72,-194.13,-168.68,-164.68,-162.64,-205.07,-164.89,-187.95
8,<rdkit.Chem.rdchem.Mol object at 0x7f2acf7cb300>,13.0,7KHH,13.0,5.0,228.79,7.88482,-171.58,-191.25,-182.63,-193.91,-187.31,-180.33,-203.59,-190.3,-216.32,-167.26
9,<rdkit.Chem.rdchem.Mol object at 0x7f2acf7cb210>,22.0,6ZHC,21.0,4.0,274.49,7.02512,-169.06,-176.32,-154.82,-156.5,-188.83,-155.61,-176.76,-175.6,-184.31,-159.34


In [94]:
#merged_df['mean'] = df.iloc[:, [0,1,2,3,4].mean(axis=1)
merged_df['score'] = merged_df.iloc[:, 7:].mean(axis=1)
merged_df.drop(['score1','score2','score3','score4','score5','score6','score7','score8','score9','score10'],axis=1)
merged_df=merged_df.rename(columns={'linker':'mol'})
merged_df=merged_df[['score','linker_lengths','cLogP','TPSA','mol','Hbond_acceptors','Hbond_donors']]
merged_df

Unnamed: 0,score,linker_lengths,cLogP,TPSA,mol,Hbond_acceptors,Hbond_donors
0,-169.045,13.0,5.59668,211.49,<rdkit.Chem.rdchem.Mol object at 0x7f2ad1505670>,15.0,4.0
1,-192.583,16.0,4.72104,220.35,<rdkit.Chem.rdchem.Mol object at 0x7f2ad15058a0>,14.0,3.0
2,-193.398,15.0,5.24286,194.05,<rdkit.Chem.rdchem.Mol object at 0x7f2accaee300>,12.0,3.0
3,-161.656,14.0,4.00092,200.57,<rdkit.Chem.rdchem.Mol object at 0x7f2ac0893b20>,12.0,6.0
4,-165.302,11.0,2.29142,219.03,<rdkit.Chem.rdchem.Mol object at 0x7f2acc763760>,14.0,6.0
5,-437.767,14.0,4.00092,200.57,<rdkit.Chem.rdchem.Mol object at 0x7f2acf73bda0>,12.0,6.0
6,-171.568,17.0,5.5895,254.11,<rdkit.Chem.rdchem.Mol object at 0x7f2acf73bd50>,13.0,7.0
7,-176.594,18.0,5.7803,266.97,<rdkit.Chem.rdchem.Mol object at 0x7f2acfa75c60>,15.0,5.0
8,-188.448,13.0,7.88482,228.79,<rdkit.Chem.rdchem.Mol object at 0x7f2acf7cb300>,13.0,5.0
9,-169.715,22.0,7.02512,274.49,<rdkit.Chem.rdchem.Mol object at 0x7f2acf7cb210>,21.0,4.0


In [95]:
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.AllChem import  GetMorganFingerprintAsBitVect, GetErGFingerprint

def ExplicitBitVect_to_Array(bitvector):
    bitstring = bitvector.ToBitString()
    intmap = map(int, bitstring)
    return list(intmap)


def fingerprint(mol):
    return rdMolDescriptors.GetMorganFingerprintAsBitVect((mol), radius=2, nBits = 2048, useBondTypes=True, useFeatures= True)


merged_df['fingerprint'] = merged_df['mol'].apply(fingerprint)
merged_df=merged_df.drop(['mol'], axis=1)

merged_df['ready_fingerprints']=merged_df['fingerprint'].apply(ExplicitBitVect_to_Array)
merged_df=merged_df.drop(['fingerprint'], axis=1)

mfp_list=list(merged_df['ready_fingerprints'])
#mfp_list=[l.tolist() for l in mfp_list]

merged_df=merged_df.drop(['ready_fingerprints'], axis=1)

for i in range(len(merged_df)):

    try:
        mfp_list[i].extend((merged_df.loc[i,:]).tolist()[1:])
    except:
        print((merged_df.loc[i,:]).tolist()[1:])


In [98]:
import numpy as np
X = np.array(mfp_list)


array([[  1.  ,   0.  ,   1.  , ..., 211.49,  15.  ,   4.  ],
       [  1.  ,   1.  ,   1.  , ..., 220.35,  14.  ,   3.  ],
       [  1.  ,   1.  ,   1.  , ..., 194.05,  12.  ,   3.  ],
       ...,
       [  1.  ,   0.  ,   1.  , ..., 228.79,  13.  ,   5.  ],
       [  1.  ,   0.  ,   1.  , ..., 274.49,  21.  ,   4.  ],
       [  1.  ,   0.  ,   1.  , ..., 194.59,  12.  ,   7.  ]])

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd



In [26]:
Columns: [score, linker_lengths, cLogP, TPSA, Hbond_acceptors, Hbond_donors, flag]

NameError: name 'score' is not defined

In [39]:
import rdkit
df_smiles['Hbond_acceptors'] = df_smiles['ligand'].apply(Chem.rdMolDescriptors.CalcNumHBA)
df_smiles['Hbond_donors'] = df_smiles['ligand'].apply(Chem.rdMolDescriptors.CalcNumHBD)
df_smiles['TPSA'] = df_smiles['ligand'].apply(Chem.rdMolDescriptors.CalcTPSA)
df_smiles['cLogP'] = df_smiles['ligand'].apply(Crippen.MolLogP)


AttributeError: module 'rdkit.Chem' has no attribute 'rdMolDescriptors'

In [None]:
df_smiles['ligand'] = df_smiles['mol'].apply(Chem.MolFromSmiles)
df_smiles['MW'] = df_smiles['ligand'].apply(Chem.rdMolDescriptors.CalcExactMolWt)
rdkit.Chem.rdMolDescriptors.CalcNumHBA((Mol)mol) 
rdkit.Chem.rdMolDescriptors.CalcNumHBD((Mol)mol) 
rdkit.Chem.rdMolDescriptors.CalcTPSA((Mol)mol[, (bool)force=False[, (bool)includeSandP=False]]) 
rdkit.Chem.rdMolDescriptors.CalcCrippenDescriptors((Mol)mol[, (bool)includeHs=True[, (bool)force=False]])
rdkit.Chem.rdMolDescriptors.CalcExactMolWt((Mol)mol[, (bool)onlyHeavy=False]) 


In [None]:
import argparse
import Bio.PDB
import numpy as np
import sys
import os
import shutil
from itertools import product
import pandas as pd
import math as m
import glob
import statistics
import collections
from scipy.spatial import distance
from biopandas.pdb import PandasPdb

parser = Bio.PDB.PDBParser(QUIET=True)

####os.chdir('/groups/cherkasvgrp/share/progressive_docking/pd_python_pose/protacs/test_benchmark_protac_models')
# here you input the desired ligase (FIRST) followed by the protein of protac target (SECOND)
# both should be ligand bound 
# this is ran after HDOCK docking job
# example of use is: python benchmarking_protacs_2.py 5t35_vhl 5t35_brd4

pdb1_docked= str(sys.argv[1])
pdb2_docked= str(sys.argv[2])
model_0='model_1.pdb'

# parsing ligand files that were dissected from the original ligase and protein of protac target (bash script)
#pdb1_docked_splitted=str(pdb1_docked.split('.')[0])
#pdb2_docked_splitted=str(pdb2_docked.split('.')[0])

#pdb1_docked_splitted_csv=str(pdb1_docked_splitted.split('_')[3])
#pdb2_docked_splitted_csv=str(pdb2_docked_splitted.split('_')[0])
def parse_pdb_structure (pdb):
    return(parser.get_structure(str(pdb.rsplit( ".", 1 )[ 0 ]) , pdb))

#Deteterming the coordinate of ligase's linker attachment point
def get_ligase_ligand_info (e3_binder):
    #ligase_name=e3_binder.split('.')[0].split('_')[3]
    ligase_name=str(e3_binder)
    if ligase_name == 'IAP':
        x,y,z=35.114,-3.532,45.775
    elif ligase_name == 'RNF4':
        x,y,z=8.227,12.055,-7.251 
    elif ligase_name == 'VHL':
        x,y,z=25.631,-25.053,-15.098
    elif ligase_name == 'UBR1':
        x,y,z=18.010,-6.326,22.463
    elif ligase_name == 'MDM2':
        x,y,z=20.192,16.505,32.243
    elif ligase_name == 'CRBN':
        x,y,z=69.876,42.945,51.515
    elif ligase_name == 'RNF144':
        x,y,z=-10.241,-1.942,-8.284
    elif ligase_name == 'RNF114':
        x,y,z=-10.241,-1.942,-8.284
    else:
        print('None of the recognized E3 Binder is in your directory')
    return (np.array([[x,y,z]]))

# 
def combine_dicts(func, *dicts):
    default = collections.defaultdict(set)
    for d in dicts:
        for k, v in d.items():
            default[k].add(v)
    return {k: func(v) for k, v in default.items()}

# here we try to find the HDOCK model with the least distance between those two residues: ligase and POI
def find_best_protac_model (ligand_coor1):
    print('calculating best model which matches the least corresponding distance between those two residues ...')      
    avg_dis_dicts_models=[]
    for filename in glob.iglob('ligand_model_*.pdb'):
        ppdb = PandasPdb().read_pdb(filename)
        ligand_coor2=ppdb.df['HETATM'].iloc[:,11:14].to_numpy()

        close_atoms_ref=ligand_coor1
        close_atoms_comp=ligand_coor2
        
        min_dis_dicts_models=[]

        for i in close_atoms_ref:
            min_dis_dicts_models.append({str(filename):float(min(np.linalg.norm(i - j) for j in close_atoms_comp))})
            #print(f'{i} has minimum distance of {min(np.linalg.norm(i - j) for j in close_atoms_comp)} for model {filename}')
        avg_dis_dicts_models.append(combine_dicts(statistics.mean, *min_dis_dicts_models))
        
    avg_dis_dicts_models_dics={k: v for d in avg_dis_dicts_models for k, v in d.items()}
    best_model_1=min(avg_dis_dicts_models_dics, key=avg_dis_dicts_models_dics.get)
    
    top_models=[]
    avg_dis_dicts_models_dics_updated ={}
    top_models.append(best_model_1)
    for i in range (19):
        avg_dis_dicts_models_dics_updated = {key:val for key, val in avg_dis_dicts_models_dics_updated.items() if key != str(top_models[i])}
        best_model_n=min(avg_dis_dicts_models_dics_updated, key=avg_dis_dicts_models_dics_updated.get)
        top_models.append(best_model_n)
        
    return(top_models)

def get_energy_difference (model):
    
    FILENAME = model
    TARGET = "Score:"
    score=float()

    with open(FILENAME) as f:
        value = None
        start_seen = False
        for line in f:
            if TARGET in line:
                _,value = line.split(':  ')
                break

    if value is not None:
        score=value
        
    return (score)
######################################################################################################
#APPLYING


ligase_attach_coordinates=get_ligase_ligand_info (pdb1_docked)

top_models= find_best_protac_model(ligase_attach_coordinates)



print(f'-> your best model is in {top_models[0]}')
print(f'-> your second best model is in {top_models[1]}')
print(f'-> your third best model is in {top_models[2]}')

first_model_energy=get_energy_difference(str(model_0))
energy_protac_all=[]
for i in range (20):
    protac_energy=get_energy_difference(str('_'.join(top_models[i].split('_')[1:])))
    energy_protac_all.append(protac_energy)
                                                          
                                                                    
print(f' Hdock predicted a score of {first_model_energy}: for the finest predicted pose')
print(f' Whereas Hdock predicted a score of {energy_protac_all[0]}: for your best predicted protac model')
energy_sacrificed= float(first_model_energy) - float(energy_protac_all[0])
print(' The difference for energy sacrificed is {:.2f}'.format(energy_sacrificed))

dir_name=f'{pdb1_docked}_{pdb2_docked}_results'

if not os.path.exists(dir_name):
    os.makedirs(dir_name)
else:
    #shutil.rmtree(dir_name)           # Removes all the subdirectories!
    print('path ealready exists')

for i in range(20):
    shutil.copy(str(top_models[i]), dir_name)

                                                                    
BEs = [float(energy_protac_all[0]), float(energy_protac_all[1]), float(energy_protac_all[2]), float(energy_protac_all[3]),float(energy_protac_all[4]),float(energy_protac_all[5]), float(energy_protac_all[6]),float(energy_protac_all[7]),float(energy_protac_all[8]),float(energy_protac_all[9]),float(energy_protac_all[10]),float(energy_protac_all[11]),float(energy_protac_all[12]),float(energy_protac_all[13]),float(energy_protac_all[14]),float(energy_protac_all[15]),float(energy_protac_all[16]),float(energy_protac_all[17]),float(energy_protac_all[18]),float(energy_protac_all[19])]
#models=[top_models[0], top_models[1], top_models[2], top_models[3],top_models[4], top_models[5],top_models[6], top_models[7], top_models[8], top_models[9],top_models[10],top_models[11],top_models[12],top_models[13],top_models[14],top_models[15],top_models[16],top_models[17],top_models[18],top_models[19]]
models_modified=[str('_'.join(i.split('_')[1:])) for i in top_models]

if str(pdb1_docked) == 'UBR1':
    identify='Ubr1'
elif str(pdb1_docked) == 'IAP':
    identify='IAP'
elif str(pdb1_docked) == 'MDM2':
    identify='Mdm2'
elif str(pdb1_docked) == 'CRBN':
    identify='Cereblon'
elif str(pdb1_docked) == 'RNF114':
    identify='RNF144'
elif str(pdb1_docked) == 'RNF144':
    identify='RNF144'
elif str(pdb1_docked) == 'RNF4':
    identify='RNF4'
elif str(pdb1_docked) == 'VHL':
    identify='VHL'
else:
    print('None of the recognized E3 Binder is in your directory')
    
identifier=f'{identify}_{pdb2_docked.upper()}'

for be in range(10):
    data=[[str(pdb1_docked),str(pdb2_docked).upper(),BEs[be],models_modified[be],str(identifier)]]
    df = pd.DataFrame(data, columns = ['Ligase', 'POI', 'be_auto', 'model_name', 'identifier'])
    #df['be_auto']=BEs[be]
    #df['mode_name']=models_modified[be]
    #df['Ligase']=str(pdb1_docked)
    #df['POI']=str(pdb2_docked).upper()
    #df['identifier']=str(identifier)
    df.to_csv(f'output_{be}.csv', index=False, header=False)
    shutil.copy(f'output_{be}.csv', dir_name)

#df.to_csv('output.csv', index=False)
#shutil.copy('output.csv', dir_name)

#for file in glob.glob('model_*pdb'):
#    shutil.copy(file, dir_name)

print(f'Done {pdb1_docked} & {pdb2_docked} ! All data written to output.csv in directory {dir_name}')

