In [None]:
import argparse
import Bio.PDB
import numpy as np
import sys
import os
import shutil
from itertools import product
import pandas as pd
import math as m
import glob
import statistics
import collections
from scipy.spatial import distance
from biopandas.pdb import PandasPdb

# Shared PDB parser; QUIET=True is passed to the Bio.PDB parser constructor.
parser = Bio.PDB.PDBParser(QUIET=True)

####os.chdir('/groups/cherkasvgrp/share/progressive_docking/pd_python_pose/protacs/test_benchmark_protac_models')
# Command-line inputs: the ligase comes FIRST, followed by the PROTAC target protein (POI) SECOND.
# Both should be ligand-bound.
# This script is run after the HDOCK docking job.
# Example of use: python benchmarking_protacs_2.py 5t35_vhl 5t35_brd4

pdb1_docked= str(sys.argv[1])
pdb2_docked= str(sys.argv[2])
# File whose "Score:" line is later reported as the score of the top HDOCK pose.
model_0='model_1.pdb'

# Parsing of the ligand files that were dissected from the original ligase and POI (bash script).
# The lines below are disabled.
#pdb1_docked_splitted=str(pdb1_docked.split('.')[0])
#pdb2_docked_splitted=str(pdb2_docked.split('.')[0])

#pdb1_docked_splitted_csv=str(pdb1_docked_splitted.split('_')[3])
#pdb2_docked_splitted_csv=str(pdb2_docked_splitted.split('_')[0])
def parse_pdb_structure(pdb):
    """Parse the PDB file ``pdb`` with the module-level ``parser``.

    The structure id handed to ``get_structure`` is ``pdb`` with everything
    after its last "." removed (the whole string when there is no ".").
    """
    stem = pdb.rsplit(".", 1)[0]
    structure_id = str(stem)
    return parser.get_structure(structure_id, pdb)
 
def combine_dicts(func, *dicts):
    """Group the values of several dicts by key and reduce each group with ``func``.

    Args:
        func: callable applied to the list of values collected for one key
            (for example ``statistics.mean``).
        *dicts: any number of mappings.

    Returns:
        dict mapping every key seen in ``dicts`` to ``func(values)``, where
        ``values`` lists that key's values in the order they were encountered.
    """
    # A list (not a set) keeps repeated values, so equal values still count
    # individually when ``func`` is an average or a sum.
    grouped = collections.defaultdict(list)
    for d in dicts:
        for k, v in d.items():
            grouped[k].append(v)
    return {k: func(v) for k, v in grouped.items()}

# here we try to find the HDOCK model with the least distance between those two residues: ligase and POI
def find_best_protac_model (ligand_coor1, n_top=20):
    """Rank the ``ligand_model_*.pdb`` files by closeness to a reference ligand.

    For every reference coordinate the distance to the nearest HETATM
    coordinate of a model is taken; these minima are averaged per model with
    ``combine_dicts(statistics.mean, ...)`` and the models are ordered by that
    average, smallest first.

    Args:
        ligand_coor1: numpy array of reference coordinates. When it is empty,
            the HETATM coordinates of ``ligand_<pdb1_docked>`` are used instead.
        n_top: maximum number of file names to return (default 20).

    Returns:
        list of model file names, best first, at most ``n_top`` long.

    Raises:
        ValueError: if no model could be scored.
    """
    print('calculating best model which matches the least corresponding distance between those two residues ...')

    if ligand_coor1.size == 0:
        # NOTE(review): the original f-string was unterminated; it is closed
        # right after the placeholder - confirm the intended file name.
        ppdb = PandasPdb().read_pdb(f'ligand_{pdb1_docked}')
        # columns 11..13 are presumably x/y/z of the biopandas frame - TODO confirm
        ligand_coor1 = ppdb.df['HETATM'].iloc[:, 11:14].to_numpy()

    avg_distance_by_model = {}
    for filename in glob.iglob('ligand_model_*.pdb'):
        ppdb = PandasPdb().read_pdb(filename)
        ligand_coor2 = ppdb.df['HETATM'].iloc[:, 11:14].to_numpy()

        # One single-key dict per reference atom, so combine_dicts can average them.
        per_atom_minima = [
            {str(filename): float(min(np.linalg.norm(ref_atom - other) for other in ligand_coor2))}
            for ref_atom in ligand_coor1
        ]
        avg_distance_by_model.update(combine_dicts(statistics.mean, *per_atom_minima))

    if not avg_distance_by_model:
        raise ValueError("no 'ligand_model_*.pdb' file could be scored")

    # sorted() is stable, so ties keep discovery order, exactly as repeatedly
    # taking min() and removing the winner did.
    ranked = sorted(avg_distance_by_model, key=avg_distance_by_model.get)
    return ranked[:n_top]

def get_energy_difference (model):
    """Read the score recorded in a model file.

    The file is scanned for the first line containing ``"Score:"``; the text
    after that marker is returned with surrounding whitespace removed.

    Args:
        model: path of the file to read.

    Returns:
        The score text as ``str`` (callers convert it with ``float``), or
        ``0.0`` when no line contains ``"Score:"``.
    """
    TARGET = "Score:"

    with open(model) as f:
        for line in f:
            if TARGET in line:
                # partition() tolerates any spacing after the marker, and
                # strip() drops the trailing newline that used to leak into
                # the printed messages.
                return line.partition(TARGET)[2].strip()

    return float()
######################################################################################################
#APPLYING


# NOTE(review): get_ligase_ligand_info is not defined in the visible part of this
# file - confirm where it comes from before running.
ligase_attach_coordinates=get_ligase_ligand_info (pdb1_docked)

# Ranked list of ligand-model file names, best first.
top_models= find_best_protac_model(ligase_attach_coordinates)



print(f'-> your best model is in {top_models[0]}')
print(f'-> your second best model is in {top_models[1]}')
print(f'-> your third best model is in {top_models[2]}')

# Score of the reference pose (model_0) and of each of the 20 ranked models.
first_model_energy=get_energy_difference(str(model_0))
energy_protac_all=[]
for i in range (20):
    # Drop the first '_'-separated field of the ligand-model file name to get
    # the name of the file the score is read from.
    protac_energy=get_energy_difference(str('_'.join(top_models[i].split('_')[1:])))
    energy_protac_all.append(protac_energy)
                                                          
                                                                    
print(f' Hdock predicted a score of {first_model_energy}: for the finest predicted pose')
print(f' Whereas Hdock predicted a score of {energy_protac_all[0]}: for your best predicted protac model')
# Difference between the reference pose's score and the best-ranked model's score.
energy_sacrificed= float(first_model_energy) - float(energy_protac_all[0])
print(' The difference for energy sacrificed is {:.2f}'.format(energy_sacrificed))

# Results folder named after both inputs; an existing folder is reused as-is.
dir_name = f'{pdb1_docked}_{pdb2_docked}_results'

if os.path.exists(dir_name):
    print('path ealready exists')
else:
    os.makedirs(dir_name)

# Copy the 20 ranked ligand-model files into the results folder.
for rank in range(20):
    shutil.copy(str(top_models[rank]), dir_name)

                                                                    
# Scores of the 20 ranked models as floats, in ranking order.
BEs = [float(energy_protac_all[rank]) for rank in range(20)]
# Ranked file names with their first '_'-separated field removed.
models_modified = ['_'.join(ligand_model.split('_')[1:]) for ligand_model in top_models]

# Label used in the output identifier for each recognised E3 tag. The tag is the
# 4th '_'-separated field of the first input name (extension removed).
E3_BINDER_LABELS = {
    'UBR1': 'Ubr1',
    'IAP': 'IAP',
    'MDM2': 'Mdm2',
    'CRBN': 'Cereblon',
    # NOTE(review): 'RNF114' maps to 'RNF144' exactly as in the original chain;
    # confirm this is intended and not a typo.
    'RNF114': 'RNF144',
    'RNF144': 'RNF144',
    'RNF4': 'RNF4',
    'VHL': 'VHL',
}

name_fields = pdb1_docked.split('.')[0].split('_')
e3_tag = name_fields[3] if len(name_fields) > 3 else None

if e3_tag not in E3_BINDER_LABELS:
    # Previously the message was printed and the script then died with a
    # NameError on `identify` (or an IndexError on short names); stop here
    # with the same message instead.
    sys.exit('None of the recognized E3 Binder is in your directory')

identify = E3_BINDER_LABELS[e3_tag]

identifier = f'{identify}_{pdb2_docked.upper()}'

# Write one single-row CSV per ranked model and copy it into the results folder.
csv_columns = ['Ligase', 'POI', 'be_auto', 'model_name', 'identifier']
for be in range(20):
    csv_name = f'output_{be}.csv'
    data = [[str(pdb1_docked), str(pdb2_docked).upper(), BEs[be], models_modified[be], str(identifier)]]
    df = pd.DataFrame(data, columns=csv_columns)
    # header=False / index=False: only the data row is written to the file.
    df.to_csv(csv_name, index=False, header=False)
    shutil.copy(csv_name, dir_name)

# The message now names the files that are actually written (output_<n>.csv).
print(f'Done {pdb1_docked} & {pdb2_docked} ! All data written to output_0.csv ... output_19.csv in directory {dir_name}')




In [1]:
import argparse
import Bio.PDB
import numpy as np
import sys
import os
import shutil
from itertools import product
import pandas as pd
import math as m
import glob
import statistics
import collections
from scipy.spatial import distance
from biopandas.pdb import PandasPdb

In [2]:
# Shared PDB parser used by parse_pdb_structure.
parser = Bio.PDB.PDBParser(QUIET=True)

# Hard-coded input file for the interactive cells below.
pdb_docked= '5t35_brd4.pdb'   



In [None]:
# Text of the input name before its first ".".
# NOTE(review): this reads `pdb1_docked`, while the preceding cell defines `pdb_docked` - confirm which is intended.
pdb_docked_splitted=str(pdb1_docked.split('.')[0])

In [58]:
# Load the PDB file into biopandas data frames.
ppdb = PandasPdb().read_pdb(pdb_docked)
# Mean x / y / z over all HETATM records, kept as a one-element list of [x, y, z].
lig_x_coord,lig_y_coord,lig_z_coord= statistics.mean(list(ppdb.df['HETATM'].x_coord)), statistics.mean(list(ppdb.df['HETATM'].y_coord)), statistics.mean(list(ppdb.df['HETATM'].z_coord))
ligand_coordinates_avg=[[lig_x_coord,lig_y_coord,lig_z_coord]]

# [x, y, z] of every ATOM record, in data-frame order.
# The lookup by `i` is label-based; it assumes the ATOM frame has a 0..n-1 index - TODO confirm.
protein_coord=[]
for i in range(len(ppdb.df['ATOM'])):
    protein_coord.append([ppdb.df['ATOM'].x_coord[i], ppdb.df['ATOM'].y_coord[i], ppdb.df['ATOM'].z_coord[i]])

def get_minimum_residue_distance (avg_ligand_coor, rec_coors):
    """Return the row index of the closest (ligand point, receptor point) pair.

    Every pair from ``product(avg_ligand_coor, rec_coors)`` becomes one table
    row holding both points and their Euclidean distance. The table is printed
    and the index of the first row with the smallest distance is returned.
    With a single ligand point this index equals the position of the closest
    entry in ``rec_coors``.

    Args:
        avg_ligand_coor: iterable of ligand coordinates.
        rec_coors: iterable of receptor coordinates.

    Returns:
        int row index of the minimum distance.

    Raises:
        IndexError: if there are no pairs.
    """
    # Build all rows first: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every call.
    rows = [
        {'A': x, 'B': y, 'distance': distance.euclidean(x, y)}
        for x, y in product(avg_ligand_coor, rec_coors)
    ]
    dis_df = pd.DataFrame(rows, columns=["A", "B", "distance"])
    print(dis_df)
    the_minimum_index = dis_df.index[dis_df.distance == dis_df.distance.min()]
    return the_minimum_index.tolist()[0]

# Row index of the ATOM coordinate closest to the averaged ligand coordinate.
most_minimum_dist_residue_index=get_minimum_residue_distance (ligand_coordinates_avg, protein_coord)

                                                     A  \
0    [23.097666666666665, -33.67418518518519, -14.6...   
1    [23.097666666666665, -33.67418518518519, -14.6...   
2    [23.097666666666665, -33.67418518518519, -14.6...   
3    [23.097666666666665, -33.67418518518519, -14.6...   
4    [23.097666666666665, -33.67418518518519, -14.6...   
..                                                 ...   
880  [23.097666666666665, -33.67418518518519, -14.6...   
881  [23.097666666666665, -33.67418518518519, -14.6...   
882  [23.097666666666665, -33.67418518518519, -14.6...   
883  [23.097666666666665, -33.67418518518519, -14.6...   
884  [23.097666666666665, -33.67418518518519, -14.6...   

                              B   distance  
0     [27.034, -69.694, -4.273]  37.680150  
1      [27.48, -69.167, -5.601]  36.879843  
2      [26.73, -67.889, -5.968]  35.476005  
3     [27.351, -66.865, -6.258]  34.489010  
4     [27.293, -70.219, -6.699]  37.626085  
..                          ...  

In [89]:
def parse_pdb_structure(pdb):
    """Parse the PDB file ``pdb`` with the module-level ``parser``.

    The structure id handed to ``get_structure`` is ``pdb`` with everything
    after its last "." removed (the whole string when there is no ".").
    """
    stem = pdb.rsplit(".", 1)[0]
    structure_id = str(stem)
    return parser.get_structure(structure_id, pdb)

def what_chain_is_poi (structure):
    """Return the position of the first chain holding at least 30 residues.

    Chains are gathered from every model of ``structure`` in iteration order.
    For each shorter chain met before a match, a diagnostic line is printed.
    When no chain qualifies the result is 0.
    """
    every_chain = [chain for model in structure for chain in model]
    for i, chain in enumerate(every_chain):
        residue_count = len(list(chain))
        if residue_count >= 30:
            return i
        print(f'poi not chain {i+2} ??')
    return 0

# Position of the first chain with at least 30 residues in the parsed input file.
chain_id=what_chain_is_poi(parse_pdb_structure(pdb_docked))




In [76]:
# Residue name and number of the ATOM row selected by the minimum-distance search.
target_residue_name=ppdb.df['ATOM'].iloc[most_minimum_dist_residue_index]['residue_name']
target_residue_number=ppdb.df['ATOM'].iloc[most_minimum_dist_residue_index]['residue_number']
# Concatenated label: residue name followed by residue number.
target_residue=f'{target_residue_name}{target_residue_number}'
target_residue

'VAL439'

In [82]:
# Find the position, within the first collected chain, of the residue whose
# printed form contains the target residue number.
structure=parse_pdb_structure(pdb_docked)
chains=[]
idx_model=int()
for model in structure:
    for chain in model:
        chains.append(chain)

    # NOTE(review): only chains[0] is scanned here, not the chain selected by
    # what_chain_is_poi - confirm that is intended.
    for idx, residue in enumerate(chains[0]):
        # The trailing space keeps e.g. 'resseq=43 ' from matching 'resseq=439 '.
        if f'resseq={target_residue_number} ' in str(residue):
            idx_model=idx
idx_model

90

In [116]:
import argparse
import Bio.PDB
import numpy as np
import sys
import os
import shutil
from itertools import product
import pandas as pd
import math as m
import glob
import statistics
import collections
from scipy.spatial import distance
from biopandas.pdb import PandasPdb

# Shared PDB parser used by parse_pdb_structure.
parser = Bio.PDB.PDBParser(QUIET=True)

# Hard-coded input file for this cell.
pdb_docked= '5t35_brd4.pdb'   

# Load the PDB file into biopandas data frames.
ppdb = PandasPdb().read_pdb(pdb_docked)
# Mean x / y / z over all HETATM records, kept as a one-element list of [x, y, z].
lig_x_coord,lig_y_coord,lig_z_coord= statistics.mean(list(ppdb.df['HETATM'].x_coord)), statistics.mean(list(ppdb.df['HETATM'].y_coord)), statistics.mean(list(ppdb.df['HETATM'].z_coord))
ligand_coordinates_avg=[[lig_x_coord,lig_y_coord,lig_z_coord]]

# [x, y, z] of every ATOM record, in data-frame order.
# The lookup by `i` is label-based; it assumes the ATOM frame has a 0..n-1 index - TODO confirm.
protein_coord=[]
for i in range(len(ppdb.df['ATOM'])):
    protein_coord.append([ppdb.df['ATOM'].x_coord[i], ppdb.df['ATOM'].y_coord[i], ppdb.df['ATOM'].z_coord[i]])

def get_minimum_residue_distance (avg_ligand_coor, rec_coors):
    """Return the row index of the closest (ligand point, receptor point) pair.

    Every pair from ``product(avg_ligand_coor, rec_coors)`` becomes one table
    row holding both points and their Euclidean distance. The table is printed
    and the index of the first row with the smallest distance is returned.
    With a single ligand point this index equals the position of the closest
    entry in ``rec_coors``.

    Args:
        avg_ligand_coor: iterable of ligand coordinates.
        rec_coors: iterable of receptor coordinates.

    Returns:
        int row index of the minimum distance.

    Raises:
        IndexError: if there are no pairs.
    """
    # Build all rows first: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every call.
    rows = [
        {'A': x, 'B': y, 'distance': distance.euclidean(x, y)}
        for x, y in product(avg_ligand_coor, rec_coors)
    ]
    dis_df = pd.DataFrame(rows, columns=["A", "B", "distance"])
    print(dis_df)
    the_minimum_index = dis_df.index[dis_df.distance == dis_df.distance.min()]
    return the_minimum_index.tolist()[0]

# Row index of the ATOM coordinate closest to the averaged ligand coordinate.
most_minimum_dist_residue_index=get_minimum_residue_distance (ligand_coordinates_avg, protein_coord)
def parse_pdb_structure(pdb):
    """Parse the PDB file ``pdb`` with the module-level ``parser``.

    The structure id handed to ``get_structure`` is ``pdb`` with everything
    after its last "." removed (the whole string when there is no ".").
    """
    stem = pdb.rsplit(".", 1)[0]
    structure_id = str(stem)
    return parser.get_structure(structure_id, pdb)

def what_chain_is_poi (structure):
    """Return the position of the first chain holding at least 30 residues.

    Chains are gathered from every model of ``structure`` in iteration order.
    For each shorter chain met before a match, a diagnostic line is printed.
    When no chain qualifies the result is 0.
    """
    every_chain = [chain for model in structure for chain in model]
    for i, chain in enumerate(every_chain):
        residue_count = len(list(chain))
        if residue_count >= 30:
            return i
        print(f'poi not chain {i+2} ??')
    return 0

# Position of the first chain with at least 30 residues in the parsed input file.
# (chain_id is not used again in this span.)
chain_id=what_chain_is_poi(parse_pdb_structure(pdb_docked))

# Residue name and number of the ATOM row selected by the minimum-distance search.
target_residue_name=ppdb.df['ATOM'].iloc[most_minimum_dist_residue_index]['residue_name']
target_residue_number=ppdb.df['ATOM'].iloc[most_minimum_dist_residue_index]['residue_number']
target_residue=f'{target_residue_name}{target_residue_number}'

# Find the position, within the first collected chain, of the residue whose
# printed form contains the target residue number.
structure=parse_pdb_structure(pdb_docked)
chains=[]
idx_model=int()
for model in structure:
    for chain in model:
        chains.append(chain)

    for idx, residue in enumerate(chains[0]):
        # The trailing space keeps e.g. 'resseq=43 ' from matching 'resseq=439 '.
        if f'resseq={target_residue_number} ' in str(residue):
            idx_model=idx
            
# NOTE(review): idx_model was found within chains[0] but is used here to index
# the residues of the whole structure - the two only agree if the target lies
# in the first chain; confirm.
residues_ref = [r for r in structure.get_residues()]
target_atom = residues_ref[idx_model]['CA']
atoms = Bio.PDB.Selection.unfold_entities(structure, 'A')
ns = Bio.PDB.NeighborSearch(atoms)
# Presumably: residues within a radius of 15 around the target CA atom -
# confirm against the Bio.PDB NeighborSearch documentation.
close_residues = ns.search(target_atom.coord, 15 ,'R')
# close_atoms=[coor.coord for coor in close_atoms]
# Second field of the residue id (index 3 of the full id) for each residue found.
res_all=[close_residues[i].get_full_id()[3][1] for i in range(len(close_residues))]
res_all

[400,
 438,
 439,
 380,
 389,
 386,
 376,
 447,
 394,
 390,
 435,
 369,
 399,
 372,
 381,
 301,
 377,
 387,
 449,
 382,
 448,
 428,
 373,
 370,
 424,
 378,
 379,
 374,
 375,
 444,
 440,
 429,
 425,
 371,
 433,
 430,
 436,
 445,
 432,
 441,
 385,
 446,
 426,
 427,
 384,
 393,
 422,
 423,
 364,
 437,
 434,
 398,
 431,
 442,
 443]

In [None]:
def get_rec_coor (structure):
    """Draft helper meant to return receptor atom coordinates.

    ``structure`` is passed straight to ``PandasPdb().read_pdb``.

    NOTE(review): as written, ``atom_rec_coors`` is read inside the dict
    comprehension before it is ever assigned in this function, so calling the
    function raises UnboundLocalError. ``ligand_coor1`` is computed but unused.
    """
    
    ppdb = PandasPdb().read_pdb(structure)
    # Columns 11..13 of the HETATM frame (presumably x/y/z - TODO confirm).
    ligand_coor1=ppdb.df['HETATM'].iloc[:,11:14].to_numpy()
    # Keeps only entries whose value is truthy.
    atom_rec_coors={k:v for k, v in atom_rec_coors.items() if v}
    return (atom_rec_coors)

#function 

# Interactive call on the hard-coded input file.
get_rec_coor(pdb_docked)
                                

In [None]:
import argparse
import Bio.PDB
import numpy as np
import sys
import os
import shutil
from itertools import product
import pandas as pd
import math as m
import glob
import statistics
import collections
from scipy.spatial import distance
from biopandas.pdb import PandasPdb

# Shared PDB parser used by parse_pdb_structure.
parser = Bio.PDB.PDBParser(QUIET=True)

# Command-line inputs: the ligase comes FIRST, followed by the PROTAC target protein (POI) SECOND.
# Both should be ligand-bound.
# This script is run after the HDOCK docking job.
# NOTE(review): only sys.argv[1] is read here, into `pdb_docked`; the code further
# down refers to pdb1_docked / pdb2_docked / model_0, which this cell never defines.
pdb_docked= str(sys.argv[1])     

pdb_ligand=''

# Parsing of the ligand files that were dissected from the original ligase and POI (bash script).
# NOTE(review): reads `pdb1_docked`, not the `pdb_docked` defined just above - confirm.
pdb_docked_splitted=str(pdb1_docked.split('.')[0])

# parsing function for any protein to prep for further manipulation of the PDB
def parse_pdb_structure(pdb):
    """Parse the PDB file ``pdb`` with the module-level ``parser``.

    The structure id handed to ``get_structure`` is ``pdb`` with everything
    after its last "." removed (the whole string when there is no ".").
    """
    stem = pdb.rsplit(".", 1)[0]
    structure_id = str(stem)
    return parser.get_structure(structure_id, pdb)

# here it gets the receptor X Y Z coordinates of its constituting atoms as a dictionary
def get_rec_coor (structure):
    """Collect the coordinates of the receptor atoms of a parsed structure.

    Only atoms whose residue id starts with ' ' are considered (in Bio.PDB
    this is presumably the flag for standard, non-hetero residues - confirm).

    Args:
        structure: a parsed structure exposing ``get_atoms()`` and iteration
            over models, as returned by ``parse_pdb_structure``.

    Returns:
        dict mapping ``'<chain id>-<residue number>-<atom name>'`` to a list
        with one ``[x, y, z]`` entry per model in which the atom was found.
        Atoms for which no coordinate could be read are left out.
    """
    # NOTE(review): the original body only filtered `atom_rec_coors`, which was
    # never built (UnboundLocalError). The construction below follows the
    # detached atom-collection snippet kept elsewhere in this notebook.
    atom_rec_coors = {}
    for atom in structure.get_atoms():
        residue = atom.parent
        if residue.id[0] != ' ':
            continue
        chain_id, res_num, atom_name = residue.parent.id, residue.id[1], atom.name
        key = f'{chain_id}-{res_num}-{atom_name}'
        if key in atom_rec_coors:
            # Same atom met again through another model: already collected.
            continue
        coors = []
        for model in structure:
            try:
                coors.append(model[chain_id][res_num][atom_name].coord.tolist())
            except KeyError:
                print(f'Could not find coordinates for {str(structure)} at Atom {key}')
        atom_rec_coors[key] = coors
    return {k: v for k, v in atom_rec_coors.items() if v}

# here we process the dictionary from the previous function into an iterable list of coordinates
def get_processed_list_rec_coor (atom_rec_coors):
    """Return the first coordinate entry of every atom, in dict order.

    Each value of ``atom_rec_coors`` is indexed with ``[0]``; the results are
    returned as a list.
    """
    return [per_atom_entries[0] for per_atom_entries in atom_rec_coors.values()]

# here we get the ligand's average coordinate - averages all X Y Z of ligand's atoms
def avg_ligand_coor (structure_ligand):
    """Gather all atom coordinates of a ligand structure and their mean.

    Returns:
        ``(ligand_coor, avg_ligand_coors)`` where ``ligand_coor`` is the list
        of ``(x, y, z)`` tuples of every atom and ``avg_ligand_coors`` is a
        one-element list holding the per-axis mean (``[[]]`` when there are
        no atoms).
    """
    every_atom = (
        atom
        for model in structure_ligand
        for chain in model
        for residue in chain
        for atom in residue
    )
    ligand_coor = []
    for atom in every_atom:
        x, y, z = atom.get_coord()
        ligand_coor.append((x, y, z))

    per_axis_mean = [sum(axis) / len(axis) for axis in zip(*ligand_coor)]
    return (ligand_coor, [per_axis_mean])

# here we attempt to calculate the least distance between ligand average coordinate and corresponding protein
def get_minimum_residue_distance (avg_ligand_coor, processed_list_of_atom_rec_coors):
    """Return the row index of the closest (ligand point, receptor point) pair.

    Every pair from ``product(avg_ligand_coor, processed_list_of_atom_rec_coors)``
    becomes one table row holding both points and their distance
    (``math.dist``). The index of the first row with the smallest distance is
    returned. With a single ligand point this index equals the position of the
    closest entry in ``processed_list_of_atom_rec_coors``.

    Args:
        avg_ligand_coor: iterable of ligand coordinates.
        processed_list_of_atom_rec_coors: iterable of receptor coordinates.

    Returns:
        int row index of the minimum distance.

    Raises:
        IndexError: if there are no pairs.
    """
    # Build all rows first: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every call.
    rows = [
        {'A': x, 'B': y, 'distance': m.dist(x, y)}
        for x, y in product(avg_ligand_coor, processed_list_of_atom_rec_coors)
    ]
    dis_df = pd.DataFrame(rows, columns=["A", "B", "distance"])

    the_minimum_index = dis_df.index[dis_df.distance == dis_df.distance.min()]
    return the_minimum_index.tolist()[0]

# in case the POI contains more than one chain we need the chain that has the bound ligand (greater probability)
def what_chain_is_poi (structure):
    """Return the position of the first chain, after the first one, with >= 30 residues.

    Chains are gathered from every model of ``structure`` in iteration order
    and the very first chain is discarded before the search. The returned
    value is the position counted in the full chain list. For each shorter
    chain met before a match, a diagnostic line is printed. When no chain
    qualifies the result is 0.

    Raises:
        IndexError: if ``structure`` contains no chain at all.
    """
    remaining_chains = [chain for model in structure for chain in model]
    remaining_chains.pop(0)
    for i, chain in enumerate(remaining_chains):
        residue_count = len(list(chain))
        if residue_count >= 30:
            return i + 1
        print(f'poi not chain {i+2} ??')
    return 0

# here we get the index of that residue in an HDOCKed model
def get_residue_index_in_hdock_model (atom_rec1_coors, atom_rec2_coors, most_minimum_dist_residue1_index, 
                                               most_minimum_dist_residue2_index, structure, chain_poi_id):
    """Locate two residues, given by residue number, inside a docked model.

    The residue numbers are the second '-'-separated field of the keys found
    at the given positions of ``atom_rec1_coors`` / ``atom_rec2_coors``. The
    first residue is searched in the first collected chain of ``structure``,
    the second in the chain at ``chain_poi_id``; a residue matches when its
    printed form contains ``'resseq=<number> '``.

    Returns:
        ``(idx_1st, idx_2nd)``: position of the last match in each chain, or 0
        where nothing matched.
    """
    first_key = list(atom_rec1_coors.items())[int(most_minimum_dist_residue1_index)][0]
    res1_num = first_key.split('-')[1]
    second_key = list(atom_rec2_coors.items())[int(most_minimum_dist_residue2_index)][0]
    res2_num = second_key.split('-')[1]

    print(f'closest residue to bound ligand in {pdb1_docked} is residue {res1_num}')
    print(f'closest residue to bound ligand in {pdb2_docked} is residue {res2_num}')

    collected_chains = []
    position_1st = 0
    position_2nd = 0
    for model in structure:
        for chain in model:
            collected_chains.append(chain)

        # Both scans run once per model, always against the chains gathered so far.
        for position, residue in enumerate(collected_chains[0]):
            if f'resseq={res1_num} ' in str(residue):
                position_1st = position
        for position, residue in enumerate(collected_chains[chain_poi_id]):
            if f'resseq={res2_num} ' in str(residue):
                position_2nd = position
    return position_1st, position_2nd

# 
def combine_dicts(func, *dicts):
    """Group the values of several dicts by key and reduce each group with ``func``.

    Args:
        func: callable applied to the list of values collected for one key
            (for example ``statistics.mean``).
        *dicts: any number of mappings.

    Returns:
        dict mapping every key seen in ``dicts`` to ``func(values)``, where
        ``values`` lists that key's values in the order they were encountered.
    """
    # A list (not a set) keeps repeated values, so equal values still count
    # individually when ``func`` is an average or a sum.
    grouped = collections.defaultdict(list)
    for d in dicts:
        for k, v in d.items():
            grouped[k].append(v)
    return {k: func(v) for k, v in grouped.items()}

# here we try to find the HDOCK model with the least distance between those two residues: ligase and POI
def find_best_protac_model (residue_a, residue_b, chain_poi_id):
    """Pick the three ``model_*.pdb`` files with the smallest average distance.

    For every file matching ``model_*.pdb`` the atoms found around the CA of
    residue ``residue_a`` (first chain) are compared with the atoms found
    around the CA of residue ``residue_b`` (chain ``chain_poi_id``): for each
    atom of the first set the distance to its nearest atom of the second set
    is taken, and these minima are averaged with ``combine_dicts``.

    Args:
        residue_a: position of the reference residue in the first chain.
        residue_b: position of the comparison residue in chain ``chain_poi_id``.
        chain_poi_id: position of the comparison chain in the collected chains.

    Returns:
        Tuple of the three file names with the smallest averages, best first.

    Raises:
        ValueError: from ``min()`` when fewer than three models were scored.
    """
    print('calculating best model which matches the least corresponding distance between those two residues ...')      
    avg_dis_dicts_models=[]
    for filename in glob.iglob('model_*.pdb'):

        chains=list()
        structure=parse_pdb_structure (str(filename))

        # Flatten the chains of every model into one list.
        for model in structure:
            for chain in model:
                chains.append(chain)

        structure_of_reference = chains[0] # 'structures' may contain several chains
        structure_of_comparison= chains[chain_poi_id]

        residues_ref = [r for r in structure_of_reference.get_residues()]
        residues_comp= [r for r in structure_of_comparison.get_residues()]

        target_atom_a = residues_ref[residue_a]['CA']
        target_atom_b = residues_comp[residue_b]['CA']

        atoms_a  = Bio.PDB.Selection.unfold_entities(structure_of_reference, 'A')
        ns = Bio.PDB.NeighborSearch(atoms_a)

        # Presumably: atoms within a radius of 1 around the reference CA -
        # confirm against the Bio.PDB NeighborSearch documentation.
        close_atoms_ref = ns.search(target_atom_a.coord, 1)

        atoms_b  = Bio.PDB.Selection.unfold_entities(structure_of_comparison, 'A')
        ns = Bio.PDB.NeighborSearch(atoms_b)

        # Presumably: atoms within a radius of 25 around the comparison CA - confirm.
        close_atoms_comp = ns.search(target_atom_b.coord, 25)

        # Keep only the coordinates of the atoms found.
        close_atoms_ref=[coor.coord for coor in close_atoms_ref]
        close_atoms_comp=[coor.coord for coor in close_atoms_comp]
        
        #close_atoms_comp_len = len(close_atoms_comp)
        #middle_index = close_atoms_comp_len//2
        #close_atoms_comp_optimized=close_atoms_comp[middle_index:] + close_atoms_comp[0]
        
        min_dis_dicts_models=[]
        
        # One {filename: nearest-distance} dict per reference atom.
        for i in close_atoms_ref:
            min_dis_dicts_models.append({str(filename):min(np.linalg.norm(i - j) for j in close_atoms_comp)})
            #print(f'{i} has minimum distance of {min(np.linalg.norm(i - j) for j in close_atoms_comp)} for model {filename}')
        avg_dis_dicts_models.append(combine_dicts(statistics.mean, *min_dis_dicts_models))
        
    # Merge the per-file dicts into one {filename: average} mapping.
    avg_dis_dicts_models_dics={k: v for d in avg_dis_dicts_models for k, v in d.items()}
    # Take the minimum three times, removing the previous winner before each new pass.
    best_model_1=min(avg_dis_dicts_models_dics, key=avg_dis_dicts_models_dics.get)
    avg_dis_dicts_models_dics_updated = {key:val for key, val in avg_dis_dicts_models_dics.items() if key != str(best_model_1)}
    best_model_2=min(avg_dis_dicts_models_dics_updated, key=avg_dis_dicts_models_dics_updated.get)
    avg_dis_dicts_models_dics_updated_2 = {key:val for key, val in avg_dis_dicts_models_dics_updated.items() if key != str(best_model_2)}
    best_model_3=min(avg_dis_dicts_models_dics_updated_2, key=avg_dis_dicts_models_dics_updated_2.get)
    
    return(best_model_1, best_model_2, best_model_3)

def get_energy_difference (model):
    """Read the score recorded in a model file.

    The file is scanned for the first line containing ``"Score:"``; the text
    after that marker is returned with surrounding whitespace removed.

    Args:
        model: path of the file to read.

    Returns:
        The score text as ``str`` (callers convert it with ``float``), or
        ``0.0`` when no line contains ``"Score:"``.
    """
    TARGET = "Score:"

    with open(model) as f:
        for line in f:
            if TARGET in line:
                # partition() tolerates any spacing after the marker, and
                # strip() drops the trailing newline that used to leak into
                # the printed messages.
                return line.partition(TARGET)[2].strip()

    return float()
######################################################################################################
#APPLYING

# NOTE(review): pdb1_docked, pdb2_docked and model_0 are not assigned anywhere in
# this cell - they must already exist in the session; confirm.
pdb1=parse_pdb_structure (pdb1_docked)
pdb2=parse_pdb_structure (pdb2_docked)
model=parse_pdb_structure (model_0)

# Receptor atom coordinates, first as dicts and then as plain coordinate lists.
rec_coor_pdb1=get_rec_coor(pdb1)
rec_coor_pdb2=get_rec_coor(pdb2)

processed_rec_coor_pdb1=get_processed_list_rec_coor (rec_coor_pdb1)
processed_rec_coor_pdb2=get_processed_list_rec_coor (rec_coor_pdb2)

# Ligand coordinates come from '<input name without extension>_ligand.pdb'.
ligand1_coors,avg_ligand1_coor=avg_ligand_coor (parse_pdb_structure(f'{str(pdb1_docked.rsplit( ".", 1 )[ 0 ])}_ligand.pdb'))
ligand2_coors,avg_ligand2_coor=avg_ligand_coor (parse_pdb_structure(f'{str(pdb2_docked.rsplit( ".", 1 )[ 0 ])}_ligand.pdb'))

# The averaged ligand coordinate is passed (the names `ligand1_coor` /
# `ligand2_coor` used before were never defined). With a single averaged
# point, the returned row index is the position of the closest receptor atom.
min_distance_index_residue_to_ligand1= get_minimum_residue_distance (avg_ligand1_coor, processed_rec_coor_pdb1)
min_distance_index_residue_to_ligand2= get_minimum_residue_distance (avg_ligand2_coor, processed_rec_coor_pdb2)

chain_poi_id=what_chain_is_poi (model)

residue_a, residue_b= get_residue_index_in_hdock_model (rec_coor_pdb1,rec_coor_pdb2,
                                              min_distance_index_residue_to_ligand1, 
                                              min_distance_index_residue_to_ligand2,
                                              model, chain_poi_id)

best_model, best_model_2, best_model_3 =find_best_protac_model(residue_a, residue_b, chain_poi_id)


# Report the three selected model files.
print(f'-> your best model is in {best_model}')
print(f'-> your second best model is in {best_model_2}')
print(f'-> your third best model is in {best_model_3}')

# Scores read from the "Score:" line of the reference pose and of the three selected models.
first_model_energy=get_energy_difference(str(model_0))
best_protac_energy=get_energy_difference(str(best_model))
second_best_protac_energy=get_energy_difference(str(best_model_2))
third_best_protac_energy=get_energy_difference(str(best_model_3))

print(f' Hdock predicted a score of {first_model_energy}: for the finest predicted pose')
print(f' Whereas Hdock predicted a score of {best_protac_energy}: for your best predicted protac model')
# Difference between the reference pose's score and the best selected model's score.
energy_sacrificed= float(first_model_energy) - float(best_protac_energy)
print(' The difference for energy sacrificed is {:.2f}'.format(energy_sacrificed))

# NOTE(review): pdb1_docked_splitted, pdb2_docked_splitted and the *_csv names used
# below are not assigned anywhere in this cell - confirm where they come from.
dir_name=f'{pdb1_docked_splitted}_{pdb2_docked_splitted}_results'

# Create the results folder if needed; an existing folder is reused untouched.
if not os.path.exists(dir_name):
    os.makedirs(dir_name)
else:
    #shutil.rmtree(dir_name)           # Removes all the subdirectories!
    pass
# The three selected model files are moved (not copied) into the results folder.
shutil.move(best_model, dir_name)
shutil.move(best_model_2, dir_name)
shutil.move(best_model_3, dir_name)

# Single summary row: names, the four scores as floats, and the three model file names.
data = [[pdb1_docked_splitted_csv, pdb2_docked_splitted_csv, float(best_protac_energy), float(second_best_protac_energy), float(third_best_protac_energy), float(first_model_energy), best_model, best_model_2, best_model_3]]

        
df = pd.DataFrame(data, columns = ['Ligase', 'POI', '1st best model', '2nd best model', 
                                   'Third best model', 'MODEL 0', 'Name model 1st', 'Name model 2nd', 
                                   'Name model 3rd'])

df.to_csv('output.csv')
shutil.move('output.csv', dir_name)

# Copy every file still matching 'model_*pdb' in the working directory
# (the three moved above are no longer there).
for file in glob.glob('model_*pdb'):
    shutil.copy(file, dir_name)

print(f'Done {pdb1_docked} & {pdb2_docked} ! All data written to output.csv in directory {dir_name}')


In [None]:
# NOTE(review): detached snippet - everything after the first statement is indented
# as if it were still inside a function, so this cell does not parse as written.
# It builds `atom_rec_coors`, the name that get_rec_coor filters and returns.
# Keys have the form '<chain id>-<residue number>-<atom name>'; only atoms whose
# residue id starts with ' ' are listed.
atoms = [a.parent.parent.id + '-' + str(a.parent.id[1]) + '-' +  a.name for a in structure.get_atoms() 
    if a.parent.id[0] == ' ']

    atom_rec_coors = {}
    for atom in atoms:
        atom_rec_coors[atom] = []
        for model in structure:
            # Split the key back into chain id, residue number and atom name.
            atom_ = atom.split('-')
            try:
                coor = model[atom_[0]][int(atom_[1])][atom_[2]].coord
                atom_rec_coors[atom].append(coor.tolist())
            # NOTE(review): bare except - any error is reported as a missing coordinate.
            except:
                print(f'Could not find coordinates for {str(structure)} at Atom {atom_}')