In [52]:

    
#!/usr/bin/env python
# coding: utf-8


import os
import sys
import re
import torch
from glob import glob
from openbabel import pybel
import numpy as np
import random
import pickle
import warnings
import requests
import os
import glob
import pandas as pd
import openbabel
import numpy as np
from plip.structure.preparation import PDBComplex
from plip.exchange.report import BindingSiteReport
from rdkit import Chem
from rdkit.Chem import AllChem
from biopandas.pdb import PandasPdb
from Bio.PDB.SASA import ShrakeRupley
from Bio.PDB import PDBParser
from sklearn.model_selection import train_test_split
from dask.dataframe import from_pandas
from dask.multiprocessing import get

PYTORCH_ENABLE_MPS_FALLBACK=1
PYTORCH_ENABLE_SparseCPU_FALLBACK=1
warnings.filterwarnings("ignore")

from dask_jobqueue import SLURMCluster

cluster = SLURMCluster(cores=1024,
                       processes=12,
                       memory="250GB",
                      # account="hmslati",
                      # walltime="01:00:00",
                       queue="gpu-bigmem")

# PLEASE READ -> 45次实验分别进行10倍交叉验证，取平均

#Converts the protein-ligand complexes into 4D tensor. 
class Feature_extractor():
    def __init__(self):
        self.atom_codes = {}
        #'others' includs metal atoms and B atom. There are no B atoms on training and test sets. 
        # 55 to 63 will be reserved to PLIF features as follows:
        # 55: hydrophobic
        # 56: hbond
        # 57: waterbridge
        # 58: saltbridge
        # 59: pistacking
        # 60: pication
        # 61: halogen
        # 62: metal
        # 63: Distances 
        
        # others = ([3,4,5,11,12,13]+list(range(19,32))+list(range(37,51))+list(range(55,84)))
        plif_specs=list(range(55,64))
        #C and N atoms can be hybridized in three ways and S atom can be hybridized in two ways here. 
        #Hydrogen atom is also considered for feature extraction. I think phosphor atom has 3 or 5 as hyb states but 
        # in biological system its usually the same recurrent phosphate even in most small molecules so safe to assume one
        # hybridization state for this purpose. 
        atom_types = [1,(6,1),(6,2),(6,3),(7,1),(7,2),(7,3),8,15,(16,2),(16,3),
                      34,9,17,35,53,11,12,13,14,5,19,20,25,29,28,30,33]+plif_specs
      
        for i, j in enumerate(atom_types):
            if type(j) is list:
                for k in j:
                    self.atom_codes[k] = i
                
            else:
                self.atom_codes[j] = i              
        
        self.sum_atom_types = len(atom_types)
        
    #Onehot encoding of each atom. The atoms in protein or ligand are treated separately.
    def encode(self, atomic_num, orig_coords, plifs, molprotein):
        encoding = np.zeros(self.sum_atom_types*2)
        if molprotein == 1:
            encoding[self.atom_codes[atomic_num]] = 1.0
            for coord, plif_feats in plifs.items():
                if [round(item) for item in coord] == [round(item) for item in orig_coords]:
                    encoding[self.atom_codes[55]] = 1.0 if plifs[coord][0] == 'hydrophobic' \
                    else 0.0
                    encoding[self.atom_codes[56]] = 1.0 if plifs[coord][0] == 'hbond' \
                    else 0.0
                    encoding[self.atom_codes[57]] = 1.0 if plifs[coord][0] == 'waterbridge' \
                    else 0.0
                    encoding[self.atom_codes[58]] = 1.0 if plifs[coord][0] == 'saltbridge' \
                    else 0.0
                    encoding[self.atom_codes[59]] = 1.0 if plifs[coord][0] == 'pistacking' \
                    else 0.0
                    encoding[self.atom_codes[60]] = 1.0 if plifs[coord][0] == 'pication' \
                    else 0.0
                    encoding[self.atom_codes[61]] = 1.0 if plifs[coord][0] == 'halogen' \
                    else 0.0
                    encoding[self.atom_codes[62]] = 1.0 if plifs[coord][0] == 'metal' \
                    else 0.0
                    
                    #distance
                    encoding[self.atom_codes[63]] = plifs[coord][1]
                    

                
        else:
            encoding[self.sum_atom_types+self.atom_codes[atomic_num]] = 1.0
            for coord, plif_feats in plifs.items():
                if [round(item) for item in coord] == [round(item) for item in orig_coords]:
                    encoding[self.sum_atom_types+self.atom_codes[55]] = 1.0 if plifs[coord][0] == 'hydrophobic' \
                    else 0.0
                    encoding[self.sum_atom_types+self.atom_codes[56]] = 1.0 if plifs[coord][0] == 'hbond' \
                    else 0.0
                    encoding[self.sum_atom_types+self.atom_codes[57]] = 1.0 if plifs[coord][0] == 'waterbridge' \
                    else 0.0
                    encoding[self.sum_atom_types+self.atom_codes[58]] = 1.0 if plifs[coord][0] == 'saltbridge' \
                    else 0.0
                    encoding[self.sum_atom_types+self.atom_codes[59]] = 1.0 if plifs[coord][0] == 'pistacking' \
                    else 0.0
                    encoding[self.sum_atom_types+self.atom_codes[60]] = 1.0 if plifs[coord][0] == 'pication' \
                    else 0.0
                    encoding[self.sum_atom_types+self.atom_codes[61]] = 1.0 if plifs[coord][0] == 'halogen' \
                    else 0.0
                    encoding[self.sum_atom_types+self.atom_codes[62]] = 1.0 if plifs[coord][0] == 'metal' \
                    else 0.0
                    
                    #distance
                    encoding[self.sum_atom_types+self.atom_codes[63]] = plifs[coord][1]

        
        return encoding
    
    #Get atom coords and atom features from the complexes.   
    def get_features(self, molecule, plifs, molprotein):
        coords = []
        features = []
            
        for atom in molecule:
            coords.append(atom.coords)
            if atom.atomicnum in [6,7,16]:
                atomicnum = (atom.atomicnum,atom.hyb)
                features.append(self.encode(atomicnum,atom.coords,plifs,molprotein))
            else:
                features.append(self.encode(atom.atomicnum,atom.coords,plifs,molprotein))
        
        coords = np.array(coords, dtype=np.float32)
        features = np.array(features, dtype=np.float32)

        return coords, features
     
    #Define the rotation matrixs of 3D stuctures.
    def rotation_matrix(self, t, roller):
        if roller==0:
            return np.array([[1,0,0],[0,np.cos(t),np.sin(t)],[0,-np.sin(t),np.cos(t)]])
        elif roller==1:
            return np.array([[np.cos(t),0,-np.sin(t)],[0,1,0],[np.sin(t),0,np.cos(t)]])
        elif roller==2:
            return np.array([[np.cos(t),np.sin(t),0],[-np.sin(t),np.cos(t),0],[0,0,1]])

    #Generate 3d grid or 4d tensor. Each grid represents a voxel. Each voxel represents the atom in it by onehot encoding of atomic type.
    #Each complex in train set is rotated 9 times for data amplification.
    #The complexes in core set are not rotated. 
    #The default resolution is 20*20*20.
    def grid(self,grid, coords, features, frag_idx, resolution=1.0, max_dist=10.0,  rotation_bool=True, max_frag=10, rotations=9):
        assert coords.shape[1] == 3
        assert coords.shape[0] == features.shape[0]  

        slider=frag_idx*20

        x=y=z=np.array(range(-10,10),dtype=np.float32)+0.5
        u=0
        for i in range(len(coords)):
            coord=coords[i]
            # add/subtract 10 from the center
            tmpx=abs(coord[0]-x)
            tmpy=abs(coord[1]-y)
            tmpz=abs(coord[2]-z)

            if np.max(tmpx)<=19.5 and np.max(tmpy)<=19.5 and np.max(tmpz) <=19.5:
                u+=1
                # get the position of the closest point to coordinate which is found inside the grid
                # append the features unto that slice

                grid[0,slider+np.argmin(tmpx),slider+np.argmin(tmpy),slider+np.argmin(tmpz)] += torch.tensor(features[i])
                
        if rotation_bool:
            for rotation_idx in range(rotations):
                theta = random.uniform(np.pi/18,np.pi/2)
                roller = random.randrange(3)
                coords = np.dot(coords, self.rotation_matrix(theta,roller))
                for i in range(len(coords)):
                    coord=coords[i]
                    tmpx=abs(coord[0]-x)
                    tmpy=abs(coord[1]-y)
                    tmpz=abs(coord[2]-z)
                    if np.max(tmpx)<=19.5 and np.max(tmpy)<=19.5 and np.max(tmpz) <=19.5:
                        grid[rotation_idx+1,slider+np.argmin(tmpx),slider+np.argmin(tmpy),slider+np.argmin(tmpz)] += features[i]

        return grid
    
class PLIF:
    def __init__(self, PDB: str, MOL_SPLIT_START: int = 70, **kwargs):
        kwargs.setdefault('aggr', 'add')
        super(PLIF,self).__init__()
        
        self.MOL_SPLIT_START=MOL_SPLIT_START
        self.pdb=PDB
        self.records=['ATOM']
        self.values=['HOH','CL','MG','ZN','MN','CA']
        self.ions=['CL','MG','ZN','MN','CA']
        self.interaction_slices={"hydrophobic":[0,1,6,7,8,9,10],
            "hbond":[0,1,7,11,13,15,16],
            "waterbridge":[0,1,[6,7],11,13,16,17],
            "saltbridge":[0,1,7,10,3,11,12],
            "pistacking":[0,1,7,11,6,12,13],
            "pication":[0,1,7,11,3,12,13],
            "halogen":[0,1,7,10,12,14,15],
            "metal":[0,1,11,8,6,17,16]} 

        self.column_names = ['RESNR', 'RESTYPE', 'DIST', 'LIG_IDX','PROT_IDX','FRAGMENT_ATOMS_COORDS', 'AA_COORDS']
        self.path = os.getcwd()


    def okToBreak(self, bond):
        """
        Here we apply a bunch of rules to judge if the bond is OK to break.

        Parameters
        ----------
        bond :
            RDkit MOL object

        Returns
        -------
        Boolean :
            OK or not to break.
        """
        # See if the bond is in Ring (don't break that)
        if bond.IsInRing():
            return False
        # We OK only single bonds to break
        if bond.GetBondType() != Chem.rdchem.BondType.SINGLE:
            return False

        # Get the beginning atom of the bond
        begin_atom = bond.GetBeginAtom()
        # Get the ending atom of the bond
        end_atom = bond.GetEndAtom()
        # What kind of neighbors does these end and begenning atoms have? We need a family of no less than 5!
        neighbor_end=list(end_atom.GetNeighbors())
        neighbor_begin=list(begin_atom.GetNeighbors())
        if (len(neighbor_end) + len(neighbor_begin)) <5:
            return False
        #for atm in neighbor_end:
            #print(atm.GetAtomicNum())
        #print(begin_atom.GetAtomicNum(), end_atom.GetAtomicNum(), MOL_SPLIT_START)
        
        # Now check if end or begenning atoms are in ring (we dont wanna bother those)
        if not(begin_atom.IsInRing() or end_atom.IsInRing()):
            return False
        elif begin_atom.GetAtomicNum() >= self.MOL_SPLIT_START or \
                end_atom.GetAtomicNum() >= self.MOL_SPLIT_START:
            return False
        elif end_atom.GetAtomicNum() == 1:
            return False
        else:
            return True

    def undo_id_label (self, frag, split_id):
        # I am trying to restore Hydrogens where the break happened
        for i, atom in enumerate(frag.GetAtoms()):
            if atom.GetAtomicNum() >= split_id:
                atom.SetAtomicNum(1)

        return frag

    # Divide a molecule into fragments
    def split_molecule(self, mol, pdb):

        split_id = self.MOL_SPLIT_START

        res = []
        res_no_id=[]

        to_check = [mol]
        while len(to_check) > 0:
            ms = self.spf(to_check.pop(), split_id)
            if len(ms) == 1:
                res += ms
            else:
                to_check += ms
                split_id += 1
        for frag in res:
            res_no_id.append(self.undo_id_label(frag, self.MOL_SPLIT_START))

        res_pdb_frags=[]

        for idx, frag in enumerate(res_no_id):
            w = Chem.PDBWriter(f"tmp_{pdb}_{self.MOL_SPLIT_START+idx}.pdb")
            w.write(frag)
            w.close()
            
            unwanted_entries= ['CONECT', 'END']            
            with open(f"tmp_{pdb}_{self.MOL_SPLIT_START+idx}.pdb") as oldfile, open(f"{pdb}_{self.MOL_SPLIT_START+idx}.pdb", 'w') as newfile:
                for line in oldfile:
                    if not any(unwanted_entry in line for unwanted_entry in unwanted_entries):
                        newfile.write(line)

                    
            data = data2 = ""

            # Reading data from file1
            with open(f"ATOM_{pdb}.pdb") as fp:
                data = fp.read()

            # Reading data from file2

            with open(f"{pdb}_{self.MOL_SPLIT_START+idx}.pdb") as fp:
                data2 = fp.read()
            
            # Merging 2 files
            # To add the data of file2
            # from next line
            #data += "\n"
            data += data2
            
            with open(f"HOH_{pdb}.pdb") as fp:
                data3 = fp.read()
            data += data3

            with open (f"ATOM_{pdb}_{self.MOL_SPLIT_START+idx}.pdb", 'w') as fp:
                fp.write(data)
            res_pdb_frags.append(f"ATOM_{pdb}_{self.MOL_SPLIT_START+idx}.pdb")
        return res_pdb_frags #create_chain(res)


    # Function for doing all the nitty gritty splitting work.
    # loops over bonds until bonds get exhausted or bonds are ok to break, whichever comes first. If ok to break, then each
    # fragment needs to be checked individually again through the loop
    def spf(self, mol, split_id):

        bonds = mol.GetBonds()
        for i in range(len(bonds)):
            if self.okToBreak(bonds[i]):
                mol = Chem.FragmentOnBonds(mol, [i])
                # Dummy atoms are always added last
                n_at = mol.GetNumAtoms()
                print('Split ID', split_id)
                mol.GetAtomWithIdx(n_at-1).SetAtomicNum(split_id)
                mol.GetAtomWithIdx(n_at-2).SetAtomicNum(split_id)
                return Chem.rdmolops.GetMolFrags(mol, asMols=True)

        # If the molecule could not been split, return original molecule
        return [mol]
    #get_fragments(fragment_mols)

    def retreive_plip_interactions(self, pdb_file):
        """
        Retreives the interactions from PLIP.

        Parameters
        ----------
        pdb_file :
            The PDB file of the complex. 

        Returns
        -------
        dict :
            A dictionary of the binding sites and the interactions.
        """
        protlig = PDBComplex()   #instantiate the loader from PLIP
        protlig.load_pdb(pdb_file)   # load the pdb file
        for ligand in protlig.ligands:
            protlig.characterize_complex(ligand)   # find ligands and analyze interactions
        sites = {}
        # loop over binding sites
        for key, site in sorted(protlig.interaction_sets.items()):
            binding_site = BindingSiteReport(site)   # collect data about interactions
            # tuples of *_features and *_info will be converted to pandas DataFrame
            keys = (
                "hydrophobic",
                "hbond",
                "waterbridge",
                "saltbridge",
                "pistacking",
                "pication",
                "halogen",
                "metal"
            )
        # interactions is a dictionary which contains relevant information for each
        # of the possible interactions: hydrophobic, hbond, etc. in the considered
        # binding site. Each interaction contains a list with 
        # 1. the features of that interaction, e.g. for hydrophobic:
        # ('RES_number', 'RES_type', ..., 'LIG_coord', 'PROT_coord')
        # 2. information for each of these features, e.g. for hydrophobic
        # ('RES_number', 'RES_type', ..., 'LIG_coord', 'PROT_coord')

            interactions = {
                k: [getattr(binding_site, k + "_features")] + getattr(binding_site, k + "_info")
                for k in keys
            }
            sites[key] = interactions
        return sites

    def get_coords_prot(self, RESNR):
        ppdb = PandasPdb()
        ppdb.read_pdb(f"{self.pdb.split('.')[0]}_protein.pdb")
        only_protein=ppdb.df['ATOM']
        resnr_coords=[]
        for i in RESNR:
            resnr_coords.append(list(only_protein[only_protein['atom_number']==int(i)][['x_coord', 'y_coord', 'z_coord']].values[0]))
        return resnr_coords
    
    def interaction_df(self, split):

        all_interactions_df = pd.DataFrame()


        # We create the dictionary for the complex of interest:
        for idx, s in enumerate(split):

            pdb_id=s.split('.')[0]
            raw=pdb_id.split('_')[1]
            idx_frag=int(pdb_id.split('_')[2])
            interactions_by_site = self.retreive_plip_interactions(f"{pdb_id}.pdb")

            # Let’s see how many binding sites are detected:

    #         print(
    #             f"Number of binding sites detected in {pdb_id} : "
    #             f"{len(interactions_by_site)}\n"
    #             f"with {interactions_by_site.keys()}"
    #         )
            # In this case, the first binding site containing ligand 03P will be further investigated.
            index_of_selected_site = 0
            selected_site = list(interactions_by_site.keys())[index_of_selected_site]
            #print(selected_site)


            valid_types = [
                    "hydrophobic",
                    "hbond",
                    "waterbridge",
                    "saltbridge",
                    "pistacking",
                    "pication",
                    "halogen",
                    "metal",
                ]

            for _type in valid_types:
                output_df=self.create_df_from_binding_site(raw, interactions_by_site[selected_site], idx+self.MOL_SPLIT_START, selected_site,
                                                      interactions_by_site,
                                                      interaction_type=_type)
                all_interactions_df=all_interactions_df.append(output_df)
        all_interactions_df = all_interactions_df[all_interactions_df['RESNR'].notna()]
        all_interactions_df.to_csv(f"{self.path}/results_plifs/{raw}_plifs_and_properties.csv", index=False)
        return all_interactions_df


    # We can construct a pandas.DataFrame for a binding site and particular interaction type.

    def create_df_from_binding_site(self, raw, selected_site_interactions, fragment_idx, selected_site, 
                                    interactions_by_site, interaction_type="hbond"):
        """
        Creates a data frame from a binding site and interaction type.

        Parameters
        ----------
        selected_site_interactions : dict
            Precalculated interactions from PLIP for the selected site
        interaction_type : str
            The interaction type of interest (default set to hydrogen bonding).

        Returns
        -------
        pd.DataFrame :
            DataFrame with information retreived from PLIP.
        """
        # check if interaction type is valid:
        valid_types = [
            "hydrophobic",
            "hbond",
            "waterbridge",
            "saltbridge",
            "pistacking",
            "pication",
            "halogen",
            "metal",
        ]


        if interaction_type not in valid_types:
            print("!!! Wrong interaction type specified. Hbond is chosen by default !!! \n")
            interaction_type = "hbond"

        def interaction_values(n):
            try:
                interactions=interactions_by_site[selected_site][interaction_type]
                if type(n) is list:
                    return [interactions[1:][x][i] for x in 
                        range(len(interactions[1:])) for i in n]
                else:
                    return [interactions[1:][x][n] for x in 
                        range(len(interactions[1:]))]
            except Exception:
                return None
            
        if interactions_by_site[selected_site][interaction_type][1:]:
            #print(list(map(interaction_values, self.interaction_slices[interaction_type])), self.column_names)
            selected_feats=list(map(interaction_values, self.interaction_slices[interaction_type]))
            #print(selected_feats)
            try: 
                if int(selected_feats[4])>int(selected_feats[3]):
                    selected_feats[3], selected_feats[4] = selected_feats[4], selected_feats[3]  
            except: 
                if int(any(selected_feats[4]))>int(any(selected_feats[3])):
                    selected_feats[3], selected_feats[4] = selected_feats[4], selected_feats[3] 
            df = pd.DataFrame(
                # data is stored AFTER the columns names
                [selected_feats],
                # column names are always the first element - we skipped that in the above - we are gonna use that for naming the df
                columns = self.column_names
            )

            df["INTERACTION_TYPE"]=interaction_type
            
            try:
                checked_coords=self.get_coords_prot(selected_feats[4][0].split(',') if ',' in selected_feats[4][0] \
                                                                   else selected_feats[4])
            except:
                checked_coords=selected_feats[6]
                
            df["AA_COORDS"]=[checked_coords]
                #[self.get_coords_prot(selected_feats[4].split(','))]
            df["FRAGMENT_ATOMS_COORDS"]=[selected_feats[5]]
                            #[self.get_coords_lig(selected_feats[3].split(','))]    
            df['FRAGMENT_ID']=fragment_idx

            # ideally we would like to exclude waters from further processing. Threrfore let us reduce any waterbridge 
            # interaction to the eucladean distance in order to omit water
            
            if interaction_type == "waterbridge":
                df['DIST']=[[np.linalg.norm(x) for x in df['DIST'].to_numpy()]]
                
            # also deal with one distance value and two coords, this is common in saltbridge interactions:
            if len(checked_coords) == len(selected_feats[2])*2:
                df['DIST']=[selected_feats[2] + selected_feats[2]]
                
        else:

            df= pd.DataFrame({'RESNR':[None], 'RESTYPE':[None], 'DIST':[None], 'LIG_IDX':[None],'PROT_IDX':[None],
                        'INTERACTION_TYPE':[interaction_type], "AA_COORDS": [None], "FRAGMENT_ATOMS_COORDS":[None],
                              'FRAGMENT_ID':[str(fragment_idx)]})



        return df
    
    def pdb_2_sdf(self, pdb):
        obConversion = openbabel.OBConversion()
        obConversion.SetInAndOutFormats("pdb", "sdf")
        mol = openbabel.OBMol()
        obConversion.ReadFile(mol, pdb)   # Open Babel will uncompress automatically

        mol.AddHydrogens()


        obConversion.WriteFile(mol, f"{pdb.split('.')[0]}.sdf")
        return f"{pdb.split('.')[0]}.sdf"
    
    def sdf_2_pdb(self, sdf):
        obConversion = openbabel.OBConversion()
        obConversion.SetInAndOutFormats("sdf", "pdb")
        mol = openbabel.OBMol()
        obConversion.ReadFile(mol, sdf)   # Open Babel will uncompress automatically

        mol.AddHydrogens()
        obConversion.WriteFile(mol, f"{sdf.split('.')[0]}.pdb")
        return f"HETATM_{sdf.split('.')[0]}.pdb"

    def save_bpdb(self, pdb,ppdb, record):  
        ppdb.to_pdb(path=f"{record}_{pdb.split('.')[0].split('_')[0]}.pdb",
                    records=[record],
                    gz=False, 
                    append_newline=True)

    def get_HOH_pdb(self, pdb):
        ppdb = PandasPdb() 
        ppdb.read_pdb(pdb) 
        ppdb.df['HETATM']=ppdb.df['HETATM'].loc[ppdb.df['HETATM']['residue_name'].isin(self.values)]
        ppdb.to_pdb(path=f"HOH_{pdb.split('.')[0].split('_')[0]}.pdb",
                records=['HETATM'],
                gz=False, 
                append_newline=True)

    def keep_relevant_hetatm(self, pdb):
        raw=str(self.pdb).split('.')[0]
        with open(pdb) as f1, open(f"ATOM_{pdb.split('.')[0].split('_')[0]}.pdb", 'w') as f2:
            for line in f1:
                if 'ATOM' in line:
                    f2.write(line)
        with open(f'{raw}_ligand.pdb') as f1, open(f"HETATM_{pdb.split('.')[0].split('_')[0]}.pdb", 'w') as f2:
            for line in f1:
                if ('HETATM' in line) and not any(ion in line for ion in self.ions):
                    f2.write(line)
        try: 
            self.get_HOH_pdb(pdb)
        except:
            with open(pdb) as f1, open(f"HOH_{pdb.split('.')[0].split('_')[0]}.pdb", 'w') as f2:
                for line in f1:
                    if ('HETATM' in line) and any(ion in line for ion in self.ions):
                        f2.write(line)
        return
    
    
    def fragment_and_plif(self):
        path = os.getcwd()
        if not os.path.exists('results_plifs'):
            os.mkdir(f'{path}/results_plifs')

        raw=str(self.pdb).split('.')[0]
        self.sdf_2_pdb(f'{raw}_ligand.sdf')
        self.keep_relevant_hetatm(f'{raw}_protein.pdb')
        fragment_mols = Chem.SDMolSupplier(str(f'{raw}_ligand.sdf'), removeHs=True, sanitize=False)
        fragment_mols_alt = Chem.MolFromMol2File(f'{raw}_ligand.mol2', sanitize=True, removeHs=True)
        content = open(f'{raw}_ligand.pdb').read()
        hets=re.findall("^HETATM (.*)", content, re.M)
        if len(hets)<5:
            # Read in the file
            with open(f'{raw}_ligand.pdb', 'r') as file :
                filedata = file.read()

            # Replace the target string
            filedata = filedata.replace('ATOM  ', 'HETATM')

            # Write the file out again
            with open(f'{raw}_ligand.pdb', 'w') as file:
                file.write(filedata)
        
        try: 
            fragment_mols = Chem.RemoveHs(fragment_mols[0])
            output_df = self.interaction_df(self.split_molecule(fragment_mols,raw))
            
        except:  
            try: 
                output_df = self.interaction_df(self.split_molecule( Chem.MolFromMol2File(fragment_mols_alt)))
            except:
                try: 
                    fragment_mols = AllChem.MolFromPDBFile(f'{raw}_ligand.pdb')
                    output_df = self.interaction_df(self.split_molecule(fragment_mols,raw))
                except:
                    try:
                        fragment_mols = AllChem.MolFromPDBFile(f'HETATM_{raw}.pdb')
                        output_df = self.interaction_df(self.split_molecule(fragment_mols,raw))
                    except:
                        raise Exception(f"Sorry, the pdb id: {raw} needs chekcing")
#                         output_df= pd.DataFrame({'FRAGMENT_ID':[None], 'AA_COORDS':[None], 
#                                                  'FRAGMENT_ATOMS_COORDS':[None], 
#                                      'INTERACTION_TYPE':[None],'DIST':[None]})
        os.chdir(f'{path}')

        return output_df.groupby('FRAGMENT_ID')['AA_COORDS', 'FRAGMENT_ATOMS_COORDS','INTERACTION_TYPE','DIST'].agg(list)


def kd_equalizer (value):

    if 'mM' in value.split('=')[1]:
        return float(value.split('m')[0].split('=')[1]) / 1000
    elif 'uM' in value.split('=')[1]:
        return float(value.split('u')[0].split('=')[1]) / 1000000
    elif 'nM' in value.split('=')[1]:
        return float(value.split('n')[0].split('=')[1]) / 1000000000
    elif 'pM' in value.split('=')[1]:
        return float(value.split('p')[0].split('=')[1]) / 1000000000000
    elif 'fM' in value.split('=')[1]:
        return float(value.split('f')[0].split('=')[1]) / 1000000000000000


if __name__ == "__main__":
    
    os.chdir(f'/groups/cherkasvgrp/share/progressive_docking/hmslati/plif_cnn/')
             
    train_grids=None
    test_grids=None
    rotations=9
    full_batch=10
    features_shape=74
    
    Feature = Feature_extractor()
    
    p_directory = os.getcwd()
    
    general=pd.read_csv('INDEX_general_PL_data.2020', sep=',')
    refined=pd.read_csv('INDEX_refined_data.2020', sep=',')
    
    general=general[general["Kd/Ki"].str.contains('IC|EC|>|<')==False]
    refined=refined[refined["Kd/Ki"].str.contains('IC|EC|>|<')==False]

    general["Kd/Ki"] = general["Kd/Ki"].str.replace('~','=')
    refined["Kd/Ki"] = refined["Kd/Ki"].str.replace('~','=')


    general['Kd/Ki']=general['Kd/Ki'].apply(lambda x: kd_equalizer(x))
    refined['Kd/Ki']=refined['Kd/Ki'].apply(lambda x: kd_equalizer(x))
    
    merged_PDBBind=general.append(refined) \
                                .sample(frac=1) \
                                .sample(frac=1) \
                                .reset_index(drop=True) \
                                .drop_duplicates(subset='PDB_code', keep="first") 

    merged_PDBBind[merged_PDBBind['PDB_code'].str.contains("3bho")==False]
    
    train_df, test_df = train_test_split(merged_PDBBind, test_size=0.1)
    
    merged_PDBBind.to_csv('merged_PDBBind.csv', index=False)
    train_df.to_csv('train_df.csv', index=False)
    test_df.to_csv('test_df.csv', index=False)
    
    # First training grids: 
    train_label=[]
    
    for _ , row in train_df.iterrows():
        
        pdb_id = row['PDB_code']
        print('pdb_id', pdb_id)
        
        os.chdir(f'general_refined_set/{pdb_id}')
        
        raw=pdb_id
        path = os.getcwd()
        fileList = []
        fileList.extend(glob.glob(f'{path}/{raw}_7*pdb'))
        fileList.extend(glob.glob(f'{path}/{raw}_8*pdb'))
        fileList.extend(glob.glob(f'{path}/{raw}_9*pdb'))
        fileList.extend(glob.glob(f'{path}/*_{raw}*pdb'))
        fileList.extend(glob.glob(f'{path}/*_{raw}*sdf'))
        for filePath in fileList:
            try:
                os.remove(filePath)
            except:
                print("Error while deleting file : ", filePath)
                
        df_plifSpecs = PLIF(PDB = f'{pdb_id}.pdb').fragment_and_plif()
        
        train_label.extend([row['Kd/Ki']]*10)
        
        single_pdb_frags = []
        for idx, row in df_plifSpecs.iterrows():
    
            temp_plifs_prot={}
            temp_plifs_frag={}
            ## do something with fragment_idx . i.e. open the pdb and do your shit with encoding\
            for aa_atm_coord_list, frag_lig_atm_coord_list, interaction, dist_list in zip (row['AA_COORDS'], 
                                                                                               row['FRAGMENT_ATOMS_COORDS'],
                                                                                               row['INTERACTION_TYPE'],
                                                                                               row['DIST']):
                # because sometimes salt bridges makes two concurrent connections so it is possible that we have one distance
                # for two amino acids or ligand atoms! Encoding by atom is crazy fun
                for dist, aa_atm_coord, frag_lig_atm_coord in zip (dist_list, aa_atm_coord_list, 
                                                                                 frag_lig_atm_coord_list):
                    temp_plifs_prot[tuple(aa_atm_coord)]=[interaction,dist]
                    temp_plifs_frag[tuple(frag_lig_atm_coord)]=[interaction,dist]


            pdb = next(pybel.readfile('pdb',os.path.join(path,'ATOM_' + pdb_id + '.pdb')))
            ligand = next(pybel.readfile('pdb',os.path.join(path, pdb_id + f'_{str(idx)}'+'.pdb')))
            single_pdb_frags.append((pdb,ligand,temp_plifs_prot,temp_plifs_frag))  
            
        grid=torch.zeros((full_batch,400,400,400,features_shape))
        
        for idx, mols in enumerate(single_pdb_frags):
            print(len(single_pdb_frags), idx)
#             threads = multiprocessing.Pool(len(single_pdb_frags))
#             threads.map(func, arg_list)
            
            coords1, features1 = Feature.get_features(mols[0],mols[2],1)
            coords2, features2 = Feature.get_features(mols[1],mols[3],0)

            # get the center point of protein
            center=(np.max(coords2,axis=0)+np.min(coords2,axis=0))/2
            coords=np.concatenate([coords1,coords2],axis = 0)
            features=np.concatenate([features1,features2],axis = 0)
            assert len(coords) == len(features)
            # zero the coordinates 
            coords = coords-center
            grid=Feature.grid(grid,coords,features,idx, rotation_bool=True)
        
        if train_grids is None:
            train_grids = grid.to_sparse()
        else:
            train_grids = torch.cat((train_grids,grid.to_sparse()), 0)
        print(train_grids.shape)
        
        raw=pdb_id
        path = os.getcwd()
        fileList = []
        fileList.extend(glob.glob(f'{path}/{raw}_7*pdb'))
        fileList.extend(glob.glob(f'{path}/{raw}_8*pdb'))
        fileList.extend(glob.glob(f'{path}/{raw}_9*pdb'))
        fileList.extend(glob.glob(f'{path}/*_{raw}*pdb'))
        fileList.extend(glob.glob(f'{path}/*_{raw}*sdf'))
        for filePath in fileList:
            try:
                os.remove(filePath)
            except:
                print("Error while deleting file : ", filePath)
                
        os.chdir(p_directory)
        
        print("Memory utilised (bytes): ", sys.getsizeof(train_grids))
    with open('train_grids.pkl','wb') as f:
        pickle.dump(train_grids, f)

    # Second testing grids: 
    test_label=[]
    
    for _ , row in test_df.iterrows():
        
        pdb_id = row['PDB_code']
        print('pdb_id', pdb_id)
        
        os.chdir(f'general_refined_set/{pdb_id}')
        
        raw=pdb_id
        path = os.getcwd()
        fileList = []
        fileList.extend(glob.glob(f'{path}/{raw}_7*pdb'))
        fileList.extend(glob.glob(f'{path}/{raw}_8*pdb'))
        fileList.extend(glob.glob(f'{path}/{raw}_9*pdb'))
        fileList.extend(glob.glob(f'{path}/*_{raw}*pdb'))
        fileList.extend(glob.glob(f'{path}/*_{raw}*sdf'))
        for filePath in fileList:
            try:
                os.remove(filePath)
            except:
                print("Error while deleting file : ", filePath)
                
        df_plifSpecs = PLIF(PDB = f'{pdb_id}.pdb').fragment_and_plif()
        
        test_label.extend([row['Kd/Ki']])
        
        single_pdb_frags = []
        print(len(single_pdb_frags))
        for idx, row in df_plifSpecs.iterrows():
    
            temp_plifs_prot={}
            temp_plifs_frag={}
            ## do something with fragment_idx . i.e. open the pdb and do your shit with encoding\
            for aa_atm_coord_list, frag_lig_atm_coord_list, interaction, dist_list in zip (row['AA_COORDS'], 
                                                                                               row['FRAGMENT_ATOMS_COORDS'],
                                                                                               row['INTERACTION_TYPE'],
                                                                                               row['DIST']):
                # because sometimes salt bridges makes two concurrent connections so it is possible that we have one distance
                # for two amino acids or ligand atoms! Encoding by atom is crazy fun
                for dist, aa_atm_coord, frag_lig_atm_coord in zip (dist_list, aa_atm_coord_list, 
                                                                                 frag_lig_atm_coord_list):
                    temp_plifs_prot[tuple(aa_atm_coord)]=[interaction,dist]
                    temp_plifs_frag[tuple(frag_lig_atm_coord)]=[interaction,dist]


            pdb = next(pybel.readfile('pdb',os.path.join(path,'ATOM_' + pdb_id + '.pdb')))
            ligand = next(pybel.readfile('pdb',os.path.join(path, pdb_id + f'_{str(idx)}'+'.pdb')))
            single_pdb_frags.append((pdb,ligand,temp_plifs_prot,temp_plifs_frag))  
            
        grid=torch.zeros((1,400,400,400,features_shape))

        for idx, mols in enumerate(single_pdb_frags):
            coords1, features1 = Feature.get_features(mols[0],mols[2],1)
            coords2, features2 = Feature.get_features(mols[1],mols[3],0)

            # get the center point of protein
            center=(np.max(coords2,axis=0)+np.min(coords2,axis=0))/2
            coords=np.concatenate([coords1,coords2],axis = 0)
            features=np.concatenate([features1,features2],axis = 0)
            assert len(coords) == len(features)
            # zero the coordinates 
            coords = coords-center
            grid=Feature.grid(grid,coords,features,idx, rotation_bool=False)
        
        if test_grids is None:
            test_grids = grid.to_sparse()
        else:
            test_grids = torch.cat((test_grids,grid.to_sparse()), 0)
        print(test_grids.shape)
        
        raw=pdb_id
        path = os.getcwd()
        fileList = []
        fileList.extend(glob.glob(f'{path}/{raw}_7*pdb'))
        fileList.extend(glob.glob(f'{path}/{raw}_8*pdb'))
        fileList.extend(glob.glob(f'{path}/{raw}_9*pdb'))
        fileList.extend(glob.glob(f'{path}/*_{raw}*pdb'))
        fileList.extend(glob.glob(f'{path}/*_{raw}*sdf'))
        for filePath in fileList:
            try:
                os.remove(filePath)
            except:
                print("Error while deleting file : ", filePath)
                
        os.chdir(p_directory)
        
    
    with open('test_grids.pkl','wb') as f:
        pickle.dump(test_grids, f)
        
    


pdb_id 1df8
Split ID 70
2 0
2 1


KeyboardInterrupt: 

In [31]:

    
#!/usr/bin/env python
# coding: utf-8


import os
import sys
import re
import torch
from glob import glob
from openbabel import pybel
import numpy as np
import random
import pickle
import warnings
import requests
import os
import glob
import pandas as pd
import openbabel
import numpy as np
from plip.structure.preparation import PDBComplex
from plip.exchange.report import BindingSiteReport
from rdkit import Chem
from rdkit.Chem import AllChem
from biopandas.pdb import PandasPdb
from Bio.PDB.SASA import ShrakeRupley
from Bio.PDB import PDBParser
from sklearn.model_selection import train_test_split
from dask.dataframe import from_pandas
from dask.multiprocessing import get
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=False)


PYTORCH_ENABLE_MPS_FALLBACK=1
PYTORCH_ENABLE_SparseCPU_FALLBACK=1
warnings.filterwarnings("ignore")

from dask_jobqueue import SLURMCluster

train_grids=None
test_grids=None
rotations=9
full_batch=10
features_shape=74
# First training grids: 
train_label=[]
# Second testing grids: 
test_label=[]
    
# cluster = SLURMCluster(cores=1024,
#                        processes=12,
#                        memory="250GB",
#                       # account="hmslati",
#                       # walltime="01:00:00",
#                        queue="gpu-bigmem")

# PLEASE READ -> 45次实验分别进行10倍交叉验证，取平均

#Converts the protein-ligand complexes into 4D tensor. 
class Feature_extractor():
    def __init__(self):
        self.atom_codes = {}
        #'others' includs metal atoms and B atom. There are no B atoms on training and test sets. 
        # 55 to 63 will be reserved to PLIF features as follows:
        # 55: hydrophobic
        # 56: hbond
        # 57: waterbridge
        # 58: saltbridge
        # 59: pistacking
        # 60: pication
        # 61: halogen
        # 62: metal
        # 63: Distances 
        
        # others = ([3,4,5,11,12,13]+list(range(19,32))+list(range(37,51))+list(range(55,84)))
        plif_specs=list(range(55,64))
        #C and N atoms can be hybridized in three ways and S atom can be hybridized in two ways here. 
        #Hydrogen atom is also considered for feature extraction. I think phosphor atom has 3 or 5 as hyb states but 
        # in biological system its usually the same recurrent phosphate even in most small molecules so safe to assume one
        # hybridization state for this purpose. 
        atom_types = [1,(6,1),(6,2),(6,3),(7,1),(7,2),(7,3),8,15,(16,2),(16,3),
                      34,9,17,35,53,11,12,13,14,5,19,20,25,29,28,30,33]+plif_specs
      
        for i, j in enumerate(atom_types):
            if type(j) is list:
                for k in j:
                    self.atom_codes[k] = i
                
            else:
                self.atom_codes[j] = i              
        
        self.sum_atom_types = len(atom_types)
        
    #Onehot encoding of each atom. The atoms in protein or ligand are treated separately.
    def encode(self, atomic_num, orig_coords, plifs, molprotein):
        encoding = np.zeros(self.sum_atom_types*2)
        if molprotein == 1:
            encoding[self.atom_codes[atomic_num]] = 1.0
            for coord, plif_feats in plifs.items():
                if [round(item) for item in coord] == [round(item) for item in orig_coords]:
                    encoding[self.atom_codes[55]] = 1.0 if plifs[coord][0] == 'hydrophobic' \
                    else 0.0
                    encoding[self.atom_codes[56]] = 1.0 if plifs[coord][0] == 'hbond' \
                    else 0.0
                    encoding[self.atom_codes[57]] = 1.0 if plifs[coord][0] == 'waterbridge' \
                    else 0.0
                    encoding[self.atom_codes[58]] = 1.0 if plifs[coord][0] == 'saltbridge' \
                    else 0.0
                    encoding[self.atom_codes[59]] = 1.0 if plifs[coord][0] == 'pistacking' \
                    else 0.0
                    encoding[self.atom_codes[60]] = 1.0 if plifs[coord][0] == 'pication' \
                    else 0.0
                    encoding[self.atom_codes[61]] = 1.0 if plifs[coord][0] == 'halogen' \
                    else 0.0
                    encoding[self.atom_codes[62]] = 1.0 if plifs[coord][0] == 'metal' \
                    else 0.0
                    
                    #distance
                    encoding[self.atom_codes[63]] = plifs[coord][1]
                    

                
        else:
            encoding[self.sum_atom_types+self.atom_codes[atomic_num]] = 1.0
            for coord, plif_feats in plifs.items():
                if [round(item) for item in coord] == [round(item) for item in orig_coords]:
                    encoding[self.sum_atom_types+self.atom_codes[55]] = 1.0 if plifs[coord][0] == 'hydrophobic' \
                    else 0.0
                    encoding[self.sum_atom_types+self.atom_codes[56]] = 1.0 if plifs[coord][0] == 'hbond' \
                    else 0.0
                    encoding[self.sum_atom_types+self.atom_codes[57]] = 1.0 if plifs[coord][0] == 'waterbridge' \
                    else 0.0
                    encoding[self.sum_atom_types+self.atom_codes[58]] = 1.0 if plifs[coord][0] == 'saltbridge' \
                    else 0.0
                    encoding[self.sum_atom_types+self.atom_codes[59]] = 1.0 if plifs[coord][0] == 'pistacking' \
                    else 0.0
                    encoding[self.sum_atom_types+self.atom_codes[60]] = 1.0 if plifs[coord][0] == 'pication' \
                    else 0.0
                    encoding[self.sum_atom_types+self.atom_codes[61]] = 1.0 if plifs[coord][0] == 'halogen' \
                    else 0.0
                    encoding[self.sum_atom_types+self.atom_codes[62]] = 1.0 if plifs[coord][0] == 'metal' \
                    else 0.0
                    
                    #distance
                    encoding[self.sum_atom_types+self.atom_codes[63]] = plifs[coord][1]

        
        return encoding
    
    #Get atom coords and atom features from the complexes.   
    def get_features(self, molecule, plifs, molprotein):
        coords = []
        features = []
            
        for atom in molecule:
            coords.append(atom.coords)
            if atom.atomicnum in [6,7,16]:
                atomicnum = (atom.atomicnum,atom.hyb)
                features.append(self.encode(atomicnum,atom.coords,plifs,molprotein))
            else:
                features.append(self.encode(atom.atomicnum,atom.coords,plifs,molprotein))
        
        coords = np.array(coords, dtype=np.float32)
        features = np.array(features, dtype=np.float32)

        return coords, features
     
    #Define the rotation matrixs of 3D stuctures.
    def rotation_matrix(self, t, roller):
        if roller==0:
            return np.array([[1,0,0],[0,np.cos(t),np.sin(t)],[0,-np.sin(t),np.cos(t)]])
        elif roller==1:
            return np.array([[np.cos(t),0,-np.sin(t)],[0,1,0],[np.sin(t),0,np.cos(t)]])
        elif roller==2:
            return np.array([[np.cos(t),np.sin(t),0],[-np.sin(t),np.cos(t),0],[0,0,1]])

    #Generate 3d grid or 4d tensor. Each grid represents a voxel. Each voxel represents the atom in it by onehot encoding of atomic type.
    #Each complex in train set is rotated 9 times for data amplification.
    #The complexes in core set are not rotated. 
    #The default resolution is 20*20*20.
    def grid(self,grid, coords, features, frag_idx, resolution=1.0, max_dist=10.0,  rotation_bool=True, max_frag=10, rotations=9):
        assert coords.shape[1] == 3
        assert coords.shape[0] == features.shape[0]  

        slider=frag_idx*20

        x=y=z=np.array(range(-10,10),dtype=np.float32)+0.5
        u=0
        for i in range(len(coords)):
            coord=coords[i]
            # add/subtract 10 from the center
            tmpx=abs(coord[0]-x)
            tmpy=abs(coord[1]-y)
            tmpz=abs(coord[2]-z)

            if np.max(tmpx)<=19.5 and np.max(tmpy)<=19.5 and np.max(tmpz) <=19.5:
                u+=1
                # get the position of the closest point to coordinate which is found inside the grid
                # append the features unto that slice

                grid[0,slider+np.argmin(tmpx),slider+np.argmin(tmpy),slider+np.argmin(tmpz)] += torch.tensor(features[i])
                
        if rotation_bool:
            for rotation_idx in range(rotations):
                theta = random.uniform(np.pi/18,np.pi/2)
                roller = random.randrange(3)
                coords = np.dot(coords, self.rotation_matrix(theta,roller))
                for i in range(len(coords)):
                    coord=coords[i]
                    tmpx=abs(coord[0]-x)
                    tmpy=abs(coord[1]-y)
                    tmpz=abs(coord[2]-z)
                    if np.max(tmpx)<=19.5 and np.max(tmpy)<=19.5 and np.max(tmpz) <=19.5:
                        grid[rotation_idx+1,slider+np.argmin(tmpx),slider+np.argmin(tmpy),slider+np.argmin(tmpz)] += features[i]

        return grid
    
class PLIF:
    def __init__(self, PDB: str, MOL_SPLIT_START: int = 70, **kwargs):
        kwargs.setdefault('aggr', 'add')
        super(PLIF,self).__init__()
        
        self.MOL_SPLIT_START=MOL_SPLIT_START
        self.pdb=PDB
        self.records=['ATOM']
        self.values=['HOH','CL','MG','ZN','MN','CA']
        self.ions=['CL','MG','ZN','MN','CA']
        self.interaction_slices={"hydrophobic":[0,1,6,7,8,9,10],
            "hbond":[0,1,7,11,13,15,16],
            "waterbridge":[0,1,[6,7],11,13,16,17],
            "saltbridge":[0,1,7,10,3,11,12],
            "pistacking":[0,1,7,11,6,12,13],
            "pication":[0,1,7,11,3,12,13],
            "halogen":[0,1,7,10,12,14,15],
            "metal":[0,1,11,8,6,17,16]} 

        self.column_names = ['RESNR', 'RESTYPE', 'DIST', 'LIG_IDX','PROT_IDX','FRAGMENT_ATOMS_COORDS', 'AA_COORDS']
        self.path = os.getcwd()


    def okToBreak(self, bond):
        """
        Here we apply a bunch of rules to judge if the bond is OK to break.

        Parameters
        ----------
        bond :
            RDkit MOL object

        Returns
        -------
        Boolean :
            OK or not to break.
        """
        # See if the bond is in Ring (don't break that)
        if bond.IsInRing():
            return False
        # We OK only single bonds to break
        if bond.GetBondType() != Chem.rdchem.BondType.SINGLE:
            return False

        # Get the beginning atom of the bond
        begin_atom = bond.GetBeginAtom()
        # Get the ending atom of the bond
        end_atom = bond.GetEndAtom()
        # What kind of neighbors does these end and begenning atoms have? We need a family of no less than 5!
        neighbor_end=list(end_atom.GetNeighbors())
        neighbor_begin=list(begin_atom.GetNeighbors())
        if (len(neighbor_end) + len(neighbor_begin)) <5:
            return False
        #for atm in neighbor_end:
            #print(atm.GetAtomicNum())
        #print(begin_atom.GetAtomicNum(), end_atom.GetAtomicNum(), MOL_SPLIT_START)
        
        # Now check if end or begenning atoms are in ring (we dont wanna bother those)
        if not(begin_atom.IsInRing() or end_atom.IsInRing()):
            return False
        elif begin_atom.GetAtomicNum() >= self.MOL_SPLIT_START or \
                end_atom.GetAtomicNum() >= self.MOL_SPLIT_START:
            return False
        elif end_atom.GetAtomicNum() == 1:
            return False
        else:
            return True

    def undo_id_label (self, frag, split_id):
        # I am trying to restore Hydrogens where the break happened
        for i, atom in enumerate(frag.GetAtoms()):
            if atom.GetAtomicNum() >= split_id:
                atom.SetAtomicNum(1)

        return frag

    # Divide a molecule into fragments
    def split_molecule(self, mol, pdb):

        split_id = self.MOL_SPLIT_START

        res = []
        res_no_id=[]

        to_check = [mol]
        while len(to_check) > 0:
            ms = self.spf(to_check.pop(), split_id)
            if len(ms) == 1:
                res += ms
            else:
                to_check += ms
                split_id += 1
        for frag in res:
            res_no_id.append(self.undo_id_label(frag, self.MOL_SPLIT_START))

        res_pdb_frags=[]

        for idx, frag in enumerate(res_no_id):
            w = Chem.PDBWriter(f"tmp_{pdb}_{self.MOL_SPLIT_START+idx}.pdb")
            w.write(frag)
            w.close()
            
            unwanted_entries= ['CONECT', 'END']            
            with open(f"tmp_{pdb}_{self.MOL_SPLIT_START+idx}.pdb") as oldfile, open(f"{pdb}_{self.MOL_SPLIT_START+idx}.pdb", 'w') as newfile:
                for line in oldfile:
                    if not any(unwanted_entry in line for unwanted_entry in unwanted_entries):
                        newfile.write(line)

                    
            data = data2 = ""

            # Reading data from file1
            with open(f"ATOM_{pdb}.pdb") as fp:
                data = fp.read()

            # Reading data from file2

            with open(f"{pdb}_{self.MOL_SPLIT_START+idx}.pdb") as fp:
                data2 = fp.read()
            
            # Merging 2 files
            # To add the data of file2
            # from next line
            #data += "\n"
            data += data2
            
            with open(f"HOH_{pdb}.pdb") as fp:
                data3 = fp.read()
            data += data3

            with open (f"ATOM_{pdb}_{self.MOL_SPLIT_START+idx}.pdb", 'w') as fp:
                fp.write(data)
            res_pdb_frags.append(f"ATOM_{pdb}_{self.MOL_SPLIT_START+idx}.pdb")
        return res_pdb_frags #create_chain(res)


    # Function for doing all the nitty gritty splitting work.
    # loops over bonds until bonds get exhausted or bonds are ok to break, whichever comes first. If ok to break, then each
    # fragment needs to be checked individually again through the loop
    def spf(self, mol, split_id):

        bonds = mol.GetBonds()
        for i in range(len(bonds)):
            if self.okToBreak(bonds[i]):
                mol = Chem.FragmentOnBonds(mol, [i])
                # Dummy atoms are always added last
                n_at = mol.GetNumAtoms()
                print('Split ID', split_id)
                mol.GetAtomWithIdx(n_at-1).SetAtomicNum(split_id)
                mol.GetAtomWithIdx(n_at-2).SetAtomicNum(split_id)
                return Chem.rdmolops.GetMolFrags(mol, asMols=True)

        # If the molecule could not been split, return original molecule
        return [mol]
    #get_fragments(fragment_mols)

    def retreive_plip_interactions(self, pdb_file):
        """
        Retreives the interactions from PLIP.

        Parameters
        ----------
        pdb_file :
            The PDB file of the complex. 

        Returns
        -------
        dict :
            A dictionary of the binding sites and the interactions.
        """
        protlig = PDBComplex()   #instantiate the loader from PLIP
        protlig.load_pdb(pdb_file)   # load the pdb file
        for ligand in protlig.ligands:
            protlig.characterize_complex(ligand)   # find ligands and analyze interactions
        sites = {}
        # loop over binding sites
        for key, site in sorted(protlig.interaction_sets.items()):
            binding_site = BindingSiteReport(site)   # collect data about interactions
            # tuples of *_features and *_info will be converted to pandas DataFrame
            keys = (
                "hydrophobic",
                "hbond",
                "waterbridge",
                "saltbridge",
                "pistacking",
                "pication",
                "halogen",
                "metal"
            )
        # interactions is a dictionary which contains relevant information for each
        # of the possible interactions: hydrophobic, hbond, etc. in the considered
        # binding site. Each interaction contains a list with 
        # 1. the features of that interaction, e.g. for hydrophobic:
        # ('RES_number', 'RES_type', ..., 'LIG_coord', 'PROT_coord')
        # 2. information for each of these features, e.g. for hydrophobic
        # ('RES_number', 'RES_type', ..., 'LIG_coord', 'PROT_coord')

            interactions = {
                k: [getattr(binding_site, k + "_features")] + getattr(binding_site, k + "_info")
                for k in keys
            }
            sites[key] = interactions
        return sites

    def get_coords_prot(self, RESNR):
        ppdb = PandasPdb()
        ppdb.read_pdb(f"{self.pdb.split('.')[0]}_protein.pdb")
        only_protein=ppdb.df['ATOM']
        resnr_coords=[]
        for i in RESNR:
            resnr_coords.append(list(only_protein[only_protein['atom_number']==int(i)][['x_coord', 'y_coord', 'z_coord']].values[0]))
        return resnr_coords
    
    def interaction_df(self, split):

        all_interactions_df = pd.DataFrame()


        # We create the dictionary for the complex of interest:
        for idx, s in enumerate(split):

            pdb_id=s.split('.')[0]
            raw=pdb_id.split('_')[1]
            idx_frag=int(pdb_id.split('_')[2])
            interactions_by_site = self.retreive_plip_interactions(f"{pdb_id}.pdb")

            # Let’s see how many binding sites are detected:

    #         print(
    #             f"Number of binding sites detected in {pdb_id} : "
    #             f"{len(interactions_by_site)}\n"
    #             f"with {interactions_by_site.keys()}"
    #         )
            # In this case, the first binding site containing ligand 03P will be further investigated.
            index_of_selected_site = 0
            selected_site = list(interactions_by_site.keys())[index_of_selected_site]
            #print(selected_site)


            valid_types = [
                    "hydrophobic",
                    "hbond",
                    "waterbridge",
                    "saltbridge",
                    "pistacking",
                    "pication",
                    "halogen",
                    "metal",
                ]

            for _type in valid_types:
                output_df=self.create_df_from_binding_site(raw, interactions_by_site[selected_site], idx+self.MOL_SPLIT_START, selected_site,
                                                      interactions_by_site,
                                                      interaction_type=_type)
                all_interactions_df=all_interactions_df.append(output_df)
        all_interactions_df = all_interactions_df[all_interactions_df['RESNR'].notna()]
        all_interactions_df.to_csv(f"{self.path}/results_plifs/{raw}_plifs_and_properties.csv", index=False)
        return all_interactions_df


    # We can construct a pandas.DataFrame for a binding site and particular interaction type.

    def create_df_from_binding_site(self, raw, selected_site_interactions, fragment_idx, selected_site, 
                                    interactions_by_site, interaction_type="hbond"):
        """
        Creates a data frame from a binding site and interaction type.

        Parameters
        ----------
        selected_site_interactions : dict
            Precalculated interactions from PLIP for the selected site
        interaction_type : str
            The interaction type of interest (default set to hydrogen bonding).

        Returns
        -------
        pd.DataFrame :
            DataFrame with information retreived from PLIP.
        """
        # check if interaction type is valid:
        valid_types = [
            "hydrophobic",
            "hbond",
            "waterbridge",
            "saltbridge",
            "pistacking",
            "pication",
            "halogen",
            "metal",
        ]


        if interaction_type not in valid_types:
            print("!!! Wrong interaction type specified. Hbond is chosen by default !!! \n")
            interaction_type = "hbond"

        def interaction_values(n):
            try:
                interactions=interactions_by_site[selected_site][interaction_type]
                if type(n) is list:
                    return [interactions[1:][x][i] for x in 
                        range(len(interactions[1:])) for i in n]
                else:
                    return [interactions[1:][x][n] for x in 
                        range(len(interactions[1:]))]
            except Exception:
                return None
            
        if interactions_by_site[selected_site][interaction_type][1:]:
            #print(list(map(interaction_values, self.interaction_slices[interaction_type])), self.column_names)
            selected_feats=list(map(interaction_values, self.interaction_slices[interaction_type]))
            #print(selected_feats)
            try: 
                if int(selected_feats[4])>int(selected_feats[3]):
                    selected_feats[3], selected_feats[4] = selected_feats[4], selected_feats[3]  
            except: 
                if int(any(selected_feats[4]))>int(any(selected_feats[3])):
                    selected_feats[3], selected_feats[4] = selected_feats[4], selected_feats[3] 
            df = pd.DataFrame(
                # data is stored AFTER the columns names
                [selected_feats],
                # column names are always the first element - we skipped that in the above - we are gonna use that for naming the df
                columns = self.column_names
            )

            df["INTERACTION_TYPE"]=interaction_type
            
            try:
                checked_coords=self.get_coords_prot(selected_feats[4][0].split(',') if ',' in selected_feats[4][0] \
                                                                   else selected_feats[4])
            except:
                checked_coords=selected_feats[6]
                
            df["AA_COORDS"]=[checked_coords]
                #[self.get_coords_prot(selected_feats[4].split(','))]
            df["FRAGMENT_ATOMS_COORDS"]=[selected_feats[5]]
                            #[self.get_coords_lig(selected_feats[3].split(','))]    
            df['FRAGMENT_ID']=fragment_idx

            # ideally we would like to exclude waters from further processing. Threrfore let us reduce any waterbridge 
            # interaction to the eucladean distance in order to omit water
            
            if interaction_type == "waterbridge":
                df['DIST']=[[np.linalg.norm(x) for x in df['DIST'].to_numpy()]]
                
            # also deal with one distance value and two coords, this is common in saltbridge interactions:
            if len(checked_coords) == len(selected_feats[2])*2:
                df['DIST']=[selected_feats[2] + selected_feats[2]]
                
        else:

            df= pd.DataFrame({'RESNR':[None], 'RESTYPE':[None], 'DIST':[None], 'LIG_IDX':[None],'PROT_IDX':[None],
                        'INTERACTION_TYPE':[interaction_type], "AA_COORDS": [None], "FRAGMENT_ATOMS_COORDS":[None],
                              'FRAGMENT_ID':[str(fragment_idx)]})



        return df
    
    def pdb_2_sdf(self, pdb):
        obConversion = openbabel.OBConversion()
        obConversion.SetInAndOutFormats("pdb", "sdf")
        mol = openbabel.OBMol()
        obConversion.ReadFile(mol, pdb)   # Open Babel will uncompress automatically

        mol.AddHydrogens()


        obConversion.WriteFile(mol, f"{pdb.split('.')[0]}.sdf")
        return f"{pdb.split('.')[0]}.sdf"
    
    def sdf_2_pdb(self, sdf):
        obConversion = openbabel.OBConversion()
        obConversion.SetInAndOutFormats("sdf", "pdb")
        mol = openbabel.OBMol()
        obConversion.ReadFile(mol, sdf)   # Open Babel will uncompress automatically

        mol.AddHydrogens()
        obConversion.WriteFile(mol, f"{sdf.split('.')[0]}.pdb")
        return f"HETATM_{sdf.split('.')[0]}.pdb"

    def save_bpdb(self, pdb,ppdb, record):  
        ppdb.to_pdb(path=f"{record}_{pdb.split('.')[0].split('_')[0]}.pdb",
                    records=[record],
                    gz=False, 
                    append_newline=True)

    def get_HOH_pdb(self, pdb):
        ppdb = PandasPdb() 
        ppdb.read_pdb(pdb) 
        ppdb.df['HETATM']=ppdb.df['HETATM'].loc[ppdb.df['HETATM']['residue_name'].isin(self.values)]
        ppdb.to_pdb(path=f"HOH_{pdb.split('.')[0].split('_')[0]}.pdb",
                records=['HETATM'],
                gz=False, 
                append_newline=True)

    def keep_relevant_hetatm(self, pdb):
        raw=str(self.pdb).split('.')[0]
        with open(pdb) as f1, open(f"ATOM_{pdb.split('.')[0].split('_')[0]}.pdb", 'w') as f2:
            for line in f1:
                if 'ATOM' in line:
                    f2.write(line)
        with open(f'{raw}_ligand.pdb') as f1, open(f"HETATM_{pdb.split('.')[0].split('_')[0]}.pdb", 'w') as f2:
            for line in f1:
                if ('HETATM' in line) and not any(ion in line for ion in self.ions):
                    f2.write(line)
        try: 
            self.get_HOH_pdb(pdb)
        except:
            with open(pdb) as f1, open(f"HOH_{pdb.split('.')[0].split('_')[0]}.pdb", 'w') as f2:
                for line in f1:
                    if ('HETATM' in line) and any(ion in line for ion in self.ions):
                        f2.write(line)
        return
    
    
    def fragment_and_plif(self):
        path = os.getcwd()
        if not os.path.exists('results_plifs'):
            os.mkdir(f'{path}/results_plifs')

        raw=str(self.pdb).split('.')[0]
        self.sdf_2_pdb(f'{raw}_ligand.sdf')
        self.keep_relevant_hetatm(f'{raw}_protein.pdb')
        fragment_mols = Chem.SDMolSupplier(str(f'{raw}_ligand.sdf'), removeHs=True, sanitize=False)
        fragment_mols_alt = Chem.MolFromMol2File(f'{raw}_ligand.mol2', sanitize=True, removeHs=True)
        content = open(f'{raw}_ligand.pdb').read()
        hets=re.findall("^HETATM (.*)", content, re.M)
        if len(hets)<5:
            # Read in the file
            with open(f'{raw}_ligand.pdb', 'r') as file :
                filedata = file.read()

            # Replace the target string
            filedata = filedata.replace('ATOM  ', 'HETATM')

            # Write the file out again
            with open(f'{raw}_ligand.pdb', 'w') as file:
                file.write(filedata)
        
        try: 
            fragment_mols = Chem.RemoveHs(fragment_mols[0])
            output_df = self.interaction_df(self.split_molecule(fragment_mols,raw))
            
        except:  
            try:
                fragment_mols = Chem.SDMolSupplier(str(f'{raw}_ligand.sdf'), removeHs=True, sanitize=False)
                output_df = self.interaction_df(self.split_molecule(fragment_mols,raw))
            except:
                try: 
                    output_df = self.interaction_df(self.split_molecule( Chem.MolFromMol2File(fragment_mols_alt)))
                except:
                    try: 
                        fragment_mols = AllChem.MolFromPDBFile(f'{raw}_ligand.pdb')
                        output_df = self.interaction_df(self.split_molecule(fragment_mols,raw))
                    except:
                        try:
                            fragment_mols = AllChem.MolFromPDBFile(f'HETATM_{raw}.pdb')
                            output_df = self.interaction_df(self.split_molecule(fragment_mols,raw))
                        except:
    #                         raise Exception(f"Sorry, the pdb id: {raw} needs chekcing")
                            output_df= pd.DataFrame({'FRAGMENT_ID':[None], 'AA_COORDS':[None], 
                                                     'FRAGMENT_ATOMS_COORDS':[None], 
                                         'INTERACTION_TYPE':[None],'DIST':[None]})
                            print(f"Sorry, the pdb id: {raw} needs chekcing")
        os.chdir(f'{path}')

        return output_df.groupby('FRAGMENT_ID')['AA_COORDS', 'FRAGMENT_ATOMS_COORDS','INTERACTION_TYPE','DIST'].agg(list)


def kd_equalizer (value):

    if 'mM' in value.split('=')[1]:
        return float(value.split('m')[0].split('=')[1]) / 1000
    elif 'uM' in value.split('=')[1]:
        return float(value.split('u')[0].split('=')[1]) / 1000000
    elif 'nM' in value.split('=')[1]:
        return float(value.split('n')[0].split('=')[1]) / 1000000000
    elif 'pM' in value.split('=')[1]:
        return float(value.split('p')[0].split('=')[1]) / 1000000000000
    elif 'fM' in value.split('=')[1]:
        return float(value.split('f')[0].split('=')[1]) / 1000000000000000

def dask_plif_cnn_train(row_pdb, row_kd):
    train_grids=None
    train_label=[]
    pdb_id = row_pdb
    print('pdb_id', pdb_id)

    os.chdir(f'/groups/cherkasvgrp/share/progressive_docking/hmslati/plif_cnn/general_refined_set/{pdb_id}')

    raw=pdb_id
    path = os.getcwd()
    fileList = []
    fileList.extend(glob.glob(f'{path}/{raw}_7*pdb'))
    fileList.extend(glob.glob(f'{path}/{raw}_8*pdb'))
    fileList.extend(glob.glob(f'{path}/{raw}_9*pdb'))
    fileList.extend(glob.glob(f'{path}/*_{raw}*pdb'))
    fileList.extend(glob.glob(f'{path}/*_{raw}*sdf'))
    for filePath in fileList:
        try:
            os.remove(filePath)
        except:
            print("Error while deleting file : ", filePath)

    df_plifSpecs = PLIF(PDB = f'{pdb_id}.pdb').fragment_and_plif()
    
    if not len(df_plifSpecs):
        return

    train_label.extend([row_kd]*10)

    single_pdb_frags = []
    for idx, row in df_plifSpecs.iterrows():

        temp_plifs_prot={}
        temp_plifs_frag={}
        ## do something with fragment_idx . i.e. open the pdb and do your shit with encoding\
        for aa_atm_coord_list, frag_lig_atm_coord_list, interaction, dist_list in zip (row['AA_COORDS'], 
                                                                                           row['FRAGMENT_ATOMS_COORDS'],
                                                                                           row['INTERACTION_TYPE'],
                                                                                           row['DIST']):
            # because sometimes salt bridges makes two concurrent connections so it is possible that we have one distance
            # for two amino acids or ligand atoms! Encoding by atom is crazy fun
            for dist, aa_atm_coord, frag_lig_atm_coord in zip (dist_list, aa_atm_coord_list, 
                                                                             frag_lig_atm_coord_list):
                temp_plifs_prot[tuple(aa_atm_coord)]=[interaction,dist]
                temp_plifs_frag[tuple(frag_lig_atm_coord)]=[interaction,dist]


        pdb = next(pybel.readfile('pdb',os.path.join(path,'ATOM_' + pdb_id + '.pdb')))
        ligand = next(pybel.readfile('pdb',os.path.join(path, pdb_id + f'_{str(idx)}'+'.pdb')))
        single_pdb_frags.append((pdb,ligand,temp_plifs_prot,temp_plifs_frag))  

    grid=torch.zeros((full_batch,400,400,400,features_shape))

    for idx, mols in enumerate(single_pdb_frags):
        print(len(single_pdb_frags), idx)
#             threads = multiprocessing.Pool(len(single_pdb_frags))
#             threads.map(func, arg_list)

        coords1, features1 = Feature.get_features(mols[0],mols[2],1)
        coords2, features2 = Feature.get_features(mols[1],mols[3],0)

        # get the center point of protein
        center=(np.max(coords2,axis=0)+np.min(coords2,axis=0))/2
        coords=np.concatenate([coords1,coords2],axis = 0)
        features=np.concatenate([features1,features2],axis = 0)
        assert len(coords) == len(features)
        # zero the coordinates 
        coords = coords-center
        grid=Feature.grid(grid,coords,features,idx, rotation_bool=True)

    if train_grids is None:
        train_grids = grid.to_sparse()
    else:
        train_grids = torch.cat((train_grids,grid.to_sparse()), 0)
    print(train_grids.shape)

    raw=pdb_id
    path = os.getcwd()
    fileList = []
    fileList.extend(glob.glob(f'{path}/{raw}_7*pdb'))
    fileList.extend(glob.glob(f'{path}/{raw}_8*pdb'))
    fileList.extend(glob.glob(f'{path}/{raw}_9*pdb'))
    fileList.extend(glob.glob(f'{path}/*_{raw}*pdb'))
    fileList.extend(glob.glob(f'{path}/*_{raw}*sdf'))
    for filePath in fileList:
        try:
            os.remove(filePath)
        except:
            print("Error while deleting file : ", filePath)

    

    print("Memory utilised (bytes): ", sys.getsizeof(train_grids))
    with open('train_grids.pkl','wb') as f:
        pickle.dump(train_grids, f)
        
    #Save the label data of training and test set.
    with open('train_label.pkl','wb') as f:
        pickle.dump(train_label,f)    
        
    os.chdir(p_directory)
    
    return pdb_id


def dask_plif_cnn_test(row_pdb, row_kd):
    
    test_grids=None
    test_label=[]
    pdb_id = row_pdb
    print('pdb_id', pdb_id)

    os.chdir(f'/groups/cherkasvgrp/share/progressive_docking/hmslati/plif_cnn/general_refined_set/{pdb_id}')

    raw=pdb_id
    path = os.getcwd()
    fileList = []
    fileList.extend(glob.glob(f'{path}/{raw}_7*pdb'))
    fileList.extend(glob.glob(f'{path}/{raw}_8*pdb'))
    fileList.extend(glob.glob(f'{path}/{raw}_9*pdb'))
    fileList.extend(glob.glob(f'{path}/*_{raw}*pdb'))
    fileList.extend(glob.glob(f'{path}/*_{raw}*sdf'))
    for filePath in fileList:
        try:
            os.remove(filePath)
        except:
            print("Error while deleting file : ", filePath)

    df_plifSpecs = PLIF(PDB = f'{pdb_id}.pdb').fragment_and_plif()
    
    if not len(df_plifSpecs):
        return
    
    test_label.extend([row_kd])

    single_pdb_frags = []
    print(len(single_pdb_frags))
    for idx, row in df_plifSpecs.iterrows():

        temp_plifs_prot={}
        temp_plifs_frag={}
        ## do something with fragment_idx . i.e. open the pdb and do your shit with encoding\
        for aa_atm_coord_list, frag_lig_atm_coord_list, interaction, dist_list in zip (row['AA_COORDS'], 
                                                                                           row['FRAGMENT_ATOMS_COORDS'],
                                                                                           row['INTERACTION_TYPE'],
                                                                                           row['DIST']):
            # because sometimes salt bridges makes two concurrent connections so it is possible that we have one distance
            # for two amino acids or ligand atoms! Encoding by atom is crazy fun
            for dist, aa_atm_coord, frag_lig_atm_coord in zip (dist_list, aa_atm_coord_list, 
                                                                             frag_lig_atm_coord_list):
                temp_plifs_prot[tuple(aa_atm_coord)]=[interaction,dist]
                temp_plifs_frag[tuple(frag_lig_atm_coord)]=[interaction,dist]


        pdb = next(pybel.readfile('pdb',os.path.join(path,'ATOM_' + pdb_id + '.pdb')))
        ligand = next(pybel.readfile('pdb',os.path.join(path, pdb_id + f'_{str(idx)}'+'.pdb')))
        single_pdb_frags.append((pdb,ligand,temp_plifs_prot,temp_plifs_frag))  

    grid=torch.zeros((1,400,400,400,features_shape))

    for idx, mols in enumerate(single_pdb_frags):
        coords1, features1 = Feature.get_features(mols[0],mols[2],1)
        coords2, features2 = Feature.get_features(mols[1],mols[3],0)

        # get the center point of protein
        center=(np.max(coords2,axis=0)+np.min(coords2,axis=0))/2
        coords=np.concatenate([coords1,coords2],axis = 0)
        features=np.concatenate([features1,features2],axis = 0)
        assert len(coords) == len(features)
        # zero the coordinates 
        coords = coords-center
        grid=Feature.grid(grid,coords,features,idx, rotation_bool=False)

    if test_grids is None:
        test_grids = grid.to_sparse()
    else:
        test_grids = torch.cat((test_grids,grid.to_sparse()), 0)
    print(test_grids.shape)

    raw=pdb_id
    path = os.getcwd()
    fileList = []
    fileList.extend(glob.glob(f'{path}/{raw}_7*pdb'))
    fileList.extend(glob.glob(f'{path}/{raw}_8*pdb'))
    fileList.extend(glob.glob(f'{path}/{raw}_9*pdb'))
    fileList.extend(glob.glob(f'{path}/*_{raw}*pdb'))
    fileList.extend(glob.glob(f'{path}/*_{raw}*sdf'))
    for filePath in fileList:
        try:
            os.remove(filePath)
        except:
            print("Error while deleting file : ", filePath)

    


    with open('test_grids.pkl','wb') as f:
        pickle.dump(test_grids, f)
        
    #Save the label data of training and test set.
    with open('test_label.pkl','wb') as f:
        pickle.dump(test_label,f)    
        
    os.chdir(p_directory)
    
    return pdb_id

if __name__ == "__main__":
    
    os.chdir(f'/groups/cherkasvgrp/share/progressive_docking/hmslati/plif_cnn/')

    Feature = Feature_extractor()

    p_directory = os.getcwd()

#     general=pd.read_csv('INDEX_general_PL_data.2020', sep=',')
#     refined=pd.read_csv('INDEX_refined_data.2020', sep=',')

#     general=general[general["Kd/Ki"].str.contains('IC|EC|>|<')==False]
#     refined=refined[refined["Kd/Ki"].str.contains('IC|EC|>|<')==False]

#     general["Kd/Ki"] = general["Kd/Ki"].str.replace('~','=')
#     refined["Kd/Ki"] = refined["Kd/Ki"].str.replace('~','=')


#     general['Kd/Ki']=general['Kd/Ki'].apply(lambda x: kd_equalizer(x))
#     refined['Kd/Ki']=refined['Kd/Ki'].apply(lambda x: kd_equalizer(x))

#     merged_PDBBind=general.append(refined) \
#                                 .sample(frac=1) \
#                                 .sample(frac=1) \
#                                 .reset_index(drop=True) \
#                                 .drop_duplicates(subset='PDB_code', keep="first") 

    merged_PDBBind=pd.read_csv('merged_PDBBind.csv')
#     merged_PDBBind.rename(columns={'Kd/Ki': 'Kd_Ki'}, inplace=True)
#     merged_PDBBind[merged_PDBBind['PDB_code'].str.contains("3bho")==False]

    train_df, test_df = train_test_split(merged_PDBBind, test_size=0.1)

#     merged_PDBBind.to_csv('merged_PDBBind.csv', index=False)
    train_df.to_csv('train_df.csv', index=False)
    test_df.to_csv('test_df.csv', index=False)


    train_df.parallel_apply(lambda x: dask_plif_cnn_train(x.PDB_code, x.Kd_Ki), axis=1)
    test_df.parallel_apply(lambda x: dask_plif_cnn_test(x.PDB_code, x.Kd_Ki), axis=1)

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
pdb_idpdb_idpdb_idpdb_id pdb_id 5nea 
pdb_id pdb_id 3b28pdb_idpdb_idpdb_id1txr1a4r 
 6sge

 
6qfu 
3lpgpdb_id
 2n0w pdb_idpdb_idpdb_idpdb_id5orvpdb_idpdb_idpdb_id

pdb_id16pk1azm   pdb_id    
pdb_id pdb_id
5q0h1uh1 4aze4j092h4k 

3o7u1g6s2ay3

5tco
 pdb_id

2xaf

 6b5a

5m4qpdb_id
 1a7t


  Failed to kekulize aromatic bonds in MOL file (title is 6sge_ligand)

  Failed to kekulize aromatic bonds in MOL file (title is 6qfu_ligand)

  Failed to kekulize aromatic bonds in MOL file (title is 5q0h_ligand)



Split IDSplit ID 70




Split ID

[20:58:20] Can't kekulize mol.  Unkekulized atoms: 1 3 4 18 20


 71
Split IDSplit ID

[20:58:20] Explicit valence for atom # 7 C, 5, is greater than permitted
[20:58:20] Can't kekulize mol.  Unkekulized atoms: 22 23 24 27 31


 70Split ID  70 Split IDSplit ID

Split ID  70Split ID70


[20:58:20] Can't kekulize mol.  Unkekulized atoms: 0 1 8 13 15 16 17 18 19 20 21


Split IDSplit ID  71
Split ID
70
 70Split ID Split IDSplit ID
71 7070Split ID71



 
Split ID Split ID
  Split ID70Split ID 70
 71
72
71Split IDSplit IDSplit ID
  71
Split ID
Split ID 72
 
 71 
7170
Split ID
 Split ID



Split ID 71 Split ID72




Split ID72
Split ID72
 
70

 70 Split ID Split ID72
Split IDSplit ID70Split ID
Split ID Split ID71  72
 Split ID   717070
71
 
70
Split ID 

71
73Split ID

Split ID73Split ID Split IDSplit ID73
   
7071
Split ID 7274 
Split ID
71
Split ID 74
Split IDSplit ID
 70 
 Split ID73
Split ID 75
Split ID74
  7275Split IDSplit ID
Split ID 
Split ID  75
76
 73Split ID 7677


Split ID 74
Split ID 75




Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 70
Split ID 70
Split ID 71
Split ID 70
Split ID 71
Split ID 72
Split IDSplit IDSplit IDSplit IDSorry, the pdb id: 6qfu needs chekcingSplit IDSplit IDSplit ID Split ID
       Split ID7070707070


707070

Process ForkPoolWorker-79:


 



pdb_idSplit IDSplit ID70

Split IDSplit ID

Traceback (most recent call last):


Split ID  Split ID 

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()


71  1zgi71  71

Split ID

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)


71Split ID

71
71

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))


Split ID Split ID

Process ForkPoolWorker-67:




  Split ID 

Traceback (most recent call last):
  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))


70Split ID72


  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()


 

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(


7070 


  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandarallel/data_types/dataframe.py", line 32, in work
    return data.apply(
  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)


Split ID
7272
Split ID
 

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))


Split IDSplit ID 

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/frame.py", line 8848, in apply
    return op.apply().__finalize__(self, method="apply")





  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))


71Split ID  


  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/apply.py", line 733, in apply
    return self.apply_standard()
  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(


73 717173

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/apply.py", line 857, in apply_standard
    results, res_index = self.apply_series_generator()





  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandarallel/data_types/dataframe.py", line 32, in work
    return data.apply(


Split ID

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/apply.py", line 873, in apply_series_generator
    results[i] = self.f(v)





 

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/frame.py", line 8848, in apply
    return op.apply().__finalize__(self, method="apply")


Split ID

  File "/tmp/ipykernel_202927/2211550766.py", line 944, in <lambda>
    train_df.parallel_apply(lambda x: dask_plif_cnn_train(x.PDB_code, x.Kd_Ki), axis=1)


Split IDSplit ID72

  File "/tmp/ipykernel_202927/2211550766.py", line 721, in dask_plif_cnn_train
    df_plifSpecs = PLIF(PDB = f'{pdb_id}.pdb').fragment_and_plif()


 

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/apply.py", line 733, in apply
    return self.apply_standard()


 
74 72

  File "/tmp/ipykernel_202927/2211550766.py", line 683, in fragment_and_plif
    return output_df.groupby('FRAGMENT_ID')['AA_COORDS', 'FRAGMENT_ATOMS_COORDS','INTERACTION_TYPE','DIST'].agg(list)
  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/apply.py", line 857, in apply_standard
    results, res_index = self.apply_series_generator()


72



  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/groupby/generic.py", line 900, in aggregate
    result = gba.agg()





  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/apply.py", line 873, in apply_series_generator
    results[i] = self.f(v)


Split IDSplit IDSplit ID

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/apply.py", line 171, in agg
    return self.agg_list_like()
  File "/tmp/ipykernel_202927/2211550766.py", line 944, in <lambda>
    train_df.parallel_apply(lambda x: dask_plif_cnn_train(x.PDB_code, x.Kd_Ki), axis=1)


 

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/apply.py", line 378, in agg_list_like
    new_res = colg.aggregate(arg)


  73

  File "/tmp/ipykernel_202927/2211550766.py", line 746, in dask_plif_cnn_train
    pdb = next(pybel.readfile('pdb',os.path.join(path,'ATOM_' + pdb_id + '.pdb')))
  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/groupby/generic.py", line 271, in aggregate
    ret = self._aggregate_multiple_funcs(func)


7375


  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/groupby/generic.py", line 337, in _aggregate_multiple_funcs
    output = self.obj._constructor_expanddim(indexed_output, index=None)





  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/openbabel/pybel.py", line 165, in filereader
    notatend = obconversion.ReadFile(obmol, filename)


Split ID


  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/frame.py", line 636, in __init__
    mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)


Split ID

KeyboardInterrupt


 

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/internals/construction.py", line 502, in dict_to_mgr
    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)


74 

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/internals/construction.py", line 120, in arrays_to_mgr
    index = _extract_index(arrays)



74

  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/internals/construction.py", line 640, in _extract_index
    if len(data) == 0:



Split ID 

KeyboardInterrupt


Split ID75 
70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 70
Split ID 71
Split ID 70


KeyboardInterrupt: 

pdb_id 6pvs
Split ID 70
Split ID 71
Split ID 72
Split ID 73
2 0
1 0
4 0
4 1
2 1
4 2


Process ForkPoolWorker-78:
Traceback (most recent call last):
  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandarallel/data_types/dataframe.py", line 32, in work
    return data.apply(
  File "/home/hmslati/.conda/envs/plifs/lib/python3.8/site-packages/pandas/core/fr

4 3
2 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
2 1
pdb_id 4abf
Split ID 70
pdb_id 3sax
Split ID 70
Split ID 71
Split ID 72
Split ID 73
pdb_id 1flr
Split ID 70
Split ID 71
2 0
2 1
5 0
5 1
3 0
5 2
5 3
3 1
5 4
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
3 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 3lc5
Split ID 70
Split ID 71
Split ID 72
Split ID 73
pdb_idSplit ID  744ehz



[21:01:17] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 16 17 18


Split ID 70
6 0
2 0
6 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
2 1
6 2
pdb_id 4rwk


[21:01:43] Can't kekulize mol.  Unkekulized atoms: 10 12 21 22 23


Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
6 3
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 6f92
Split ID 70
6 4
6 5
pdb_id 4jz1
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
Split ID 77
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
2 0
pdb_id 5qa5
Split ID 70
Split ID 71
2 1
6 0
6 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
6 2
pdb_id 5czm




Split ID 70
Split ID 71
Split ID 72
Split ID 73
6 33 
0
3 1
6 4
3 2
6 5
5 0
5 1
5 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
5 3
pdb_id 6mt5
5 4
Split ID 70
Split ID 71
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 6dz2


[21:03:00] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9 10 12 13 14


Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
pdb_id 1w1g




Split ID 70
Split ID 71
Split ID 72
Split ID 73
3 0
3 1
3 2
4 0
4 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
4 2
4 3
5 0
pdb_id 5j9f


[21:03:25] Can't kekulize mol.  Unkekulized atoms: 0 1 9 10 12 15


Split ID 70
Split ID 71
Split ID 72
5 1
5 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 3at1




5 3
3 0
5 34 
1
3 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1d2e




Split ID 70
Split ID 71
1 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 6hgg
pdb_id 5fck
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
3 0
6 0
6 1
5 0
6 2
56  13

3 1
6 54 
2
6 5
5 3
5 4
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 4puk


[21:04:38] Can't kekulize mol.  Unkekulized atoms: 0 1 2 4 5 6 8 9 10 14


Split ID 70
3 2
2 0
2 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 6qmk
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 6bnl
pdb_id 3ayd
Split ID 70
Split IDSplit ID  7071

Split IDSplit ID  7172

Split IDSplit ID  7372
Split ID
 74
Split ID 75
Split ID 76
Split ID 77
5 0
5 1
5 2
5 3
5 4
6 0
6 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
6 2
3 0
6 3
pdb_id 1mui
Split ID 70
Split ID6  71
4
Split ID 72
3Split ID  173

6 5
3 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 4n7j
Split ID 70
5 0
5 1
5 2
5 3
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
2 0
5 4
pdb_id 1fe3
torch.Size([10, 400, 400, 400, 74])
2Memory utilised (bytes):   172

1 0
pdb_id 4zyw


[21:06:32] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 8


Split ID 70
Split ID 71
Split ID 72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
2 0
pdb_id 4llp
2 1
pdb_id 1mai




Split ID 70
Split ID 71
Split ID 72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1fkb
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
1 0
4 0
4 1
4 2
4 3
3 0
3 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
3 2
pdb_id 3srg
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 2xb7
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
Split ID 77
1 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 5fsl
Split ID 70
6 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
6 1
2 0
6 2
2 1
6 3
pdb_id 4r92
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
6 4
6 5
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 2dw7
5 0
5 1
5 2
5 3
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
5 4
1 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 3ryj
Split ID pdb_id7



Split ID 70
pdb_id 4b33
13 3
4 0
13 4
4 1
4 2
13 5
4 3
13 6
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1nw4
1 0
13 7


[21:09:47] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 18


Split ID 70
Split ID 71
13 8
13 9
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
13 10
pdb_id 6j8q


[21:10:01] Explicit valence for atom # 0 C, 5, is greater than permitted


13 11
3 0
13 12
1 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
3 1
3 2
pdb_id 6pg7


[21:10:33] Can't kekulize mol.  Unkekulized atoms: 0 9 10 11 14


Split ID 70
Split ID 71
Split ID 72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 5f41
Split ID 70
Split ID 71
Split ID 72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 6f1n
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
4 0
4 1
4 2
4 3
4 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1gah
4 1
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
Split ID 77
Split ID 78
5 0
4 2
5 1
5 2
4 3
5 3
5 4
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 3v5t
Split ID 70
Split ID 71
Split ID 72
7 0
7 1
7 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
7 3
4 0
pdb_id
 3d04Split ID 70
Split ID 71
7 4
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
4 1
7 5
4 2
7 6
pdb_id 4kv9


[21:11:46] Can't kekulize mol.  Unkekulized atoms: 18 19 20 23 27


Split ID 70
Split ID 71
4 3
2 0
2 1
3 0
3 1
3 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1vyj


[21:12:13] Can't kekulize mol.  Unkekulized atoms: 104 105 106 107 108


Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1cbx
Split ID 70
5 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 6e91torch.Size([10, 400, 400, 400, 74])

Memory utilised (bytes):  72
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
2 0
5 1
2 1
5 2
pdb_id 6aom
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
5 3
5 0
5 1
5 2
5 4
5 3
5 4
4 0
4 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
4 2
pdb_id 5vcw
4 3
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 3jrs
pdb_id 3nuo
Split ID 70
Split ID 71
Split ID 72Split ID
 Split ID70 
73Split ID
 71
Split ID 72
4 0
4 1
4 2
4 3
4 0
4 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
4 1
4 1
4 2
4 2
4 3
4 3
p



Split ID 70
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1ax2
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
2 0
2 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes): torch.Size([10, 400, 400, 400, 74]) 
72Memory utilised (bytes): 
 72
pdb_id pdb_id3g2t 1me7

6 0




Split ID 70
Split ID 71




Split ID 70
Split ID 71
Split ID 72
6 1
6 2
6 3
6 4
6 5
3 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
3 1
3 2
pdb_id 4gs6
Split ID 70
4 0
1 0
4 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1w4o




Split ID 70
Split ID 71
Split ID 72
4 2
2 0
pdb_id 3lox
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
2 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
4 3
pdb_id 4mnw


[21:15:33] Can't kekulize mol.  Unkekulized atoms: 20 21 22 23 24 25 26 27 28


Split ID 70
6 0
6 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
6 2
pdb_id 6n0k
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
2Split ID  75
0Split ID
 76
6 3
2 1
6 4
6 5
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1pme
6 0


[21:16:08] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4


Split ID 70
Split ID 71
Split ID 72
Split ID 73
6 1
6 2
6 3
6 4
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
6 5
3 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 5kw2
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
3 1
pdb_id 5u7o


[21:16:32] Can't kekulize mol.  Unkekulized atoms: 3 6 14 15 16 18 20 26 27


Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
3 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1t69
6 0
Split ID 70
4 0
6 1
4 1
6 2
6 3
4 2
6 4
4 3
6 5
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
2 0
2 1
pdb_id 6eaa
Split ID 70
Split ID 71
Split ID 72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1n5z


[21:17:28] Can't kekulize mol.  Unkekulized atoms: 49 50 51 52 53


Split ID 70
Split ID pdb_id71 
Split ID4afg
 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76pdb_id 
3mweSplit ID
 77
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
1 0
6 0
1 0
6 1
6 2
6 3
6 4
pdb_id 5hz5


[21:17:49] Can't kekulize mol.  Unkekulized atoms: 5 24 25 26 27


Split ID 70
Split ID 71
Split ID 72
6 5
4 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
4 1
pdb_id 3u8jtorch.Size([10, 400, 400, 400, 74])

Memory utilised (bytes):  72
Split ID 70
pdb_id 5wqc
4 2
Split ID 70
Split ID 71
Split ID torch.Size([10, 400, 400, 400, 74])72
Split ID 73

Memory utilised (bytes):  72
4 3
pdb_id 6fi5




Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
Split ID 77
2 0
4 0
4 1
4 2
4 3
2 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
9 0
9 1
9 2
9 3
pdb_id 1dub
9 4




Split ID 70
Split ID 71
Split ID 72
9 5
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1qwf
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
Split ID 77
9 6
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
9 7
pdb_id 3fvg
Split ID 70
Split ID 71
9 8
7 0
7 1
7 2
7 3
3 0
7 4
7 5
7 6
3 1
3 2
4 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
4 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1sln
pdb_id 3rf5
Split ID 70
Split ID Split ID71 
70
Split ID 71
Split ID 72
4 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 3su4
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
Split ID 77
3 0
33  10

4 3
3 2
3 1
3 2
8 0
8 1
8 2
8 3
8 4
8 5
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id8  1sme
6
8 7
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74



Split ID 70
Split ID 71
Split ID 72
Split ID 73
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1obx
Split ID 70
10 0
10 1
3 0
10 2
3 1
10 3
3 2
2 0
5 0
10 4
2 1
5 1
5 2
10 5
5 3
5 4
10 6
10 7
10 8
10 9
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 4hxw


[21:22:01] Can't kekulize mol.  Unkekulized atoms: 1 2 5 6 7 14 15 16 17


Split ID 70
Split ID 71
Split ID 72
Split ID 73
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
4 0
4 1
pdb_id 3grj


[21:22:20] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 7


Split ID 70
Split ID 71
4 2
pdb_id 4g8o
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
Split ID 70
Split ID 71
Split ID 72
Split ID 73
pdb_id 1f3j
4 3


[21:22:21] Can't kekulize mol.  Unkekulized atoms: 38 39 40 41 42


Split ID 70
Split ID 71
Split ID 72
pdb_id 1wht
Split ID 70
pdb_id 1f40
Split ID 70
Split ID 71
Split ID 72
3 0
3 1
3 2
2 0
2 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 3q1x
3 0
3 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 3vx3




Split ID 70
Split ID 71
1 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
3 2
pdb_id 3hiv


[21:23:17] Can't kekulize mol.  Unkekulized atoms: 8 43 54 66 69


Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
Split ID 77
Split ID 78
Split ID 79
Split ID 80
Split ID 81
3 0
3 1
3 2
7 0
7 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
7 2
7 3
7 4
pdb_id 1gj8


[21:23:49] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 9 10 11


Split ID 70
Split ID 71
Split ID 72
7 5
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
7 6
4 0
pdb_id 5op4
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
4 1
pdb_id 3c8e
4 2
4 3
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
5 0
5 1
1 0
pdb_id 3qps
Split ID 70
Split ID5  71
2
Split ID 72
5 3
5 4
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
3 0
pdb_id 6ej2
Split ID 70
Split ID 71
3 1
3 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
2 0
2 1
pdb_id 5tg5
Split ID 70
Split ID 71
Split ID 72
pdb_id 3w07




Split ID 70
Split ID 71
2 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
2 1
pdb_id 4jv6
Split ID 70
Split ID 71
Split ID 72
3 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
3 1
pdb_id 2v88
4 0
3 2
4 1
4 2
4 3
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 5f25
1 0
Split ID 70
Split ID 71
2 0
2 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 6ajz
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 5uwj2 
0
2 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 5ekm
Split ID 70
Split ID 71
1 0
pdb_id 5iu8
Split ID 70
Split ID 71
Split ID 72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 4lrh
3 0


[21:26:57] Can't kekulize mol.  Unkekulized atoms: 3 6 7 8 9 10 11


Split ID 70
Split ID 71
Split ID 72
3 1
4 0
3 2
4 1
4 2
4 3
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 35y0x 
0
Split ID 70
Split ID 71
Split ID 72
Split ID 73
3 1
3 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
4 0
4 1
pdb_id 1xug


[21:27:36] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 9 10 11


Split ID 70
Split ID 71
Split ID 72
Split ID 73
4 2
4 3
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 5lb7
Split ID 70
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
5 0
5 1
2 0
5 2
2 1
5 3
5 4
pdb_id 5fiv
Split ID 70
Split ID 71
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 2wtx




Split ID 70
Split ID 71
Split ID 72
Split ID 73
2 0
2 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 2y59
pdb_id 5jg1
Split ID 70
Split ID 71Split ID
 70
Split ID 71
Split ID 72
Split ID 73
4 0
4 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
5 0
pdb_id 1ag9
3 0




Split ID 70
5 1
5 2
4 2
3 5 13

5 4
3 2
4 3
2 0
2 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 5u4c
Split ID 70
Split ID 71
Split ID 72
Split ID 73
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 5qa7pdb_id
 4gy5
Split ID 70
Split ID 71
pdb_id 1j07


[21:29:45] Can't kekulize mol.  Unkekulized atoms: 14 15 16 17 18 19 20 22 23 24 25 26 27


Split ID 70
Split ID 71
5 0
5 1
1 0
3 0
5 2
3 1
5 3
3 2
5 4
3 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  372 
1
pdb_id 3u5l
Split ID 70
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 3d0e
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
3 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 2l8j
Split ID 70
Split ID 71
Split ID 2 72
0
2 1
2 0
2 1
5 0
5 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
5 2pdb_id
 6ql1
Split ID 70
Split ID 71
Split ID 72
5 3
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
5 4
pdb_id 4iic
3 0
Split ID 70
3 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
3 2
2 0
pdb_id 5dhs
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1sbr
pdb_id 1qb9
Split IDSplit ID  7070

Split IDSplit ID  71

[21:32:26] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 5 6 7 8 9 10 17 18 19 20 21


Split ID 70
Split IDpdb_id 71 3eou

Split ID 72


[21:32:26] Can't kekulize mol.  Unkekulized atoms: 0 1 4 5 6 7 9 13 16 17


Split ID 70
Split ID 71
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
3 0
pdb_id 4hvb
3 0
Split ID 70
Split ID 71
Split ID 72
3 1
3 1
3 2
3 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
4 0
4 1
pdb_id 3lxo




Split ID 70
Split ID 71
Split ID 72
4 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 6oe1
Split ID 70
Split IDtorch.Size([10, 400, 400, 400, 74]) 71

Memory utilised (bytes): Split ID  7272

pdb_id 4ht2
Split ID 70
Split ID 71
Split ID 72
2 0
4 3
2 1
4 0
4 1
4 2
4 3
3 0
3 1
3 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 6p87


[21:33:54] Can't kekulize mol.  Unkekulized atoms: 0 2 10 11 15 16 19 20 24


Split ID 70
Split ID 71
Split ID 72
Split ID 73
pdb_id 4lw1
Split ID 70
Split ID 71
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1sbg


[21:34:06] Can't kekulize mol.  Unkekulized atoms: 34 35 36 37 38


Split ID 70
Split ID 71
Split ID 72
pdb_id 1yon




Split ID 70
3Split ID  710
Split ID
 72
Split ID 73
3 1
3 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
4 0
5 0
4 1
5 1
4 2
pdb_id 5ey0
5 2
4 3


[21:34:33] Can't kekulize mol.  Unkekulized atoms: 22 23 24 27 31


Split ID 70
Split ID 71
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 6i75
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
5 3
5 4
3 0
3 1
3 2
3 0
3 1
3 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 2or9
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 2xpc
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
1 0
pdb_id 1n2v


[21:35:21] Can't kekulize mol.  Unkekulized atoms: 0 1 6 7 13


Split ID 70
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
2 0
6 0
pdb_id 6 1xh5
1
2 1
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
Split ID 77
Split ID 78
6 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1f9e
6Split ID  3
70
6 4
6 5
10 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1fkw
210 0 
1
Split ID 70
Split ID 71
2 1
10 2
10 3
10 4
10 5
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
2 0
pdb_id 6d1k




10 Split ID6 
70
2 1
10 7
10 8
10 9
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
2 0
2 1
pdb_id 4f3h


[21:36:48] Can't kekulize mol.  Unkekulized atoms: 13 14 15 18 22


Split ID 70
Split ID 71
Split ID 72
Split ID 73
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 3sug
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
5 0
5 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 3nkk
Split ID 70
pdb_id 1m83




Split ID 70
Split ID 71
5 2
6 0
pdb_id 1xh4
6 1
Split ID 70
Split ID 715
 Split ID3
 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
6 2
5 4
6 3
2 0
6 4
2 1
6 5
7 0
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
7 1
pdb_id 4ibe
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
7 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id pdb_id 1r1j3jqb

7 3


[21:37:52] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 13 16
[21:37:52] Explicit valence for atom # 14 C, 5, is greater than permitted


Split ID 70
Split IDSplit ID  7071

Split ID 71
7 4
5 0
5 1
7 5
5 2
5 3
7 6
5 4
2 0
3 0
2 1
3 1
3 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 5ix0
Split ID 70
Split ID 71
Split ID 72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 1bmm
Split ID 70
Split ID 71
Split ID 72
4 0
4 1
4 torch.Size([10, 400, 400, 400, 74])2

Memory utilised (bytes):  72
pdb_id 6qqv


[21:38:54] Can't kekulize mol.  Unkekulized atoms: 3 4 20 21 22


Split ID 70
Split ID 71
Split ID 72
Split ID 73
4 0
4 3
4 1
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
4 2
4 3
4 0
4 1
4 2
pdb_id 6hm1
Split ID 70
Split ID 71
Split ID 72
torch.Size([10, 400, 400, 400, 74])Split ID 
Memory utilised (bytes): 73
 72Split ID
 74
Split ID 75
Split ID 76
Split ID 77
Split ID 78
Split ID 79
pdb_id 2flb
4 3


[21:39:15] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 9 10 11


Split ID 70
Split ID 71
Split ID 72
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 5wuk
Split ID 70
Split ID 71
Split ID 723
Split ID  073

5 0
3 1
5 1
5 2
3 2
torch.Size([10, 400, 400, 400, 74])
Memory utilised (bytes):  72
pdb_id 6a05


[21:39:40] Can't kekulize mol.  Unkekulized atoms: 2 6 7 8 9


Split ID 70
Split ID 71
Split ID 72
Split ID5  733

5 4
3 0
3 1
3 2


In [32]:
from openbabel import pybel

In [29]:
output_df= pd.DataFrame({'FRAGMENT_ID':[None], 'AA_COORDS':[None], 
                         'FRAGMENT_ATOMS_COORDS':[None], 
             'INTERACTION_TYPE':[None],'DIST':[None]})

output_df=output_df.groupby('FRAGMENT_ID')['AA_COORDS', 'FRAGMENT_ATOMS_COORDS','INTERACTION_TYPE','DIST'].agg(list)

In [30]:
len(output_df)

0

In [2]:
os.chdir(f'/groups/cherkasvgrp/share/progressive_docking/hmslati/plif_cnn/')

In [3]:
merged_PDBBind.to_csv('merged_PDBBind.csv', index=False)

In [12]:
ddf1.iloc[:, 1:4]

Unnamed: 0_level_0,resolution,release_year,-logKd/Ki
npartitions=1000,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,object,int64,float64
12,...,...,...
...,...,...,...
17357,...,...,...
17380,...,...,...


In [16]:
def myadd(row):
    return print('hi',row)

ddf1.apply(lambda x: myadd(x.PDB_ID), axis=1)

Unnamed: 0_level_0,PDB_code,resolution,release_year,-logKd/Ki,Kd/Ki,reference,ligand_name
npartitions=1000,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,object,object,int64,float64,float64,object,object
13,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
17360,...,...,...,...,...,...,...
17379,...,...,...,...,...,...,...


INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [27]:
ddf1 = train_df
ddf2 = test_df

In [29]:
ddf1.divisions

(0,
 12,
 26,
 39,
 52,
 64,
 76,
 89,
 103,
 114,
 126,
 137,
 149,
 162,
 173,
 185,
 196,
 207,
 219,
 230,
 242,
 255,
 267,
 279,
 292,
 303,
 314,
 326,
 338,
 350,
 361,
 374,
 385,
 397,
 409,
 420,
 434,
 447,
 461,
 474,
 486,
 498,
 512,
 525,
 537,
 549,
 561,
 572,
 584,
 597,
 609,
 621,
 632,
 643,
 655,
 667,
 681,
 694,
 705,
 719,
 732,
 743,
 754,
 768,
 781,
 797,
 811,
 823,
 838,
 850,
 861,
 872,
 884,
 896,
 908,
 920,
 933,
 946,
 957,
 968,
 982,
 995,
 1006,
 1018,
 1032,
 1043,
 1057,
 1071,
 1083,
 1095,
 1108,
 1121,
 1132,
 1144,
 1156,
 1169,
 1184,
 1197,
 1210,
 1227,
 1240,
 1254,
 1268,
 1280,
 1293,
 1308,
 1321,
 1332,
 1345,
 1359,
 1371,
 1388,
 1401,
 1415,
 1428,
 1440,
 1454,
 1469,
 1480,
 1491,
 1505,
 1522,
 1533,
 1546,
 1558,
 1571,
 1584,
 1595,
 1609,
 1624,
 1639,
 1653,
 1669,
 1681,
 1694,
 1707,
 1720,
 1732,
 1748,
 1762,
 1773,
 1786,
 1797,
 1808,
 1822,
 1836,
 1851,
 1863,
 1876,
 1888,
 1902,
 1916,
 1929,
 1943,
 1956,
 1967,

In [3]:
train_df

Unnamed: 0,PDB_code,resolution,release_year,-logKd/Ki,Kd/Ki,reference,ligand_name
17181,4ccd,1.97,2013,4.49,3.200000e-05,4ccc.pdf,(2DG)
11372,5gv2,2.06,2017,5.65,2.230000e-06,5gv2.pdf,(ARG)
10961,5vtb,2.40,2017,6.96,1.100000e-07,5vtb.pdf,(15-mer)
2912,3ui7,2.28,2011,9.00,1.000000e-09,3ui7.pdf,(C1L)
12060,5kbf,2.00,2016,8.92,1.200000e-09,5k8s.pdf,(CMP)
...,...,...,...,...,...,...,...
12038,1t3t,1.90,2004,4.96,1.100000e-05,1t3t.pdf,(ADP)
5256,3u7m,2.15,2012,7.45,3.510000e-08,3u7m.pdf,(FHF)
1158,3h91,1.50,2009,3.73,1.850000e-04,2l11.pdf,(15-mer)
13065,2i3v,2.40,2006,6.55,2.820000e-07,2i3v.pdf,(GLU)


In [25]:
from dask.dataframe import from_pandas
ddf = from_pandas(test_df, npartitions=5, sort=True)  # Dask DataFrame has 5 partitions

In [26]:
ddf

Unnamed: 0_level_0,PDB_code,resolution,release_year,-logKd/Ki,Kd/Ki,reference,ligand_name
npartitions=5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,object,object,int64,float64,float64,object,object
2507,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
12427,...,...,...,...,...,...,...
17382,...,...,...,...,...,...,...


In [1]:
import numpy as np
import mpi4py
import time

In [None]:
from mpi4py import MPI

In [6]:
from os import environ 
from dask_mpi import initialize
from distributed import Client
from datetime import datetime
from time import sleep
from random import random

# Work out from the environment how many threads to allocate
num_threads = int(environ.get(
    'SLURM_CPUS_PER_TASK',
    environ.get('OMP_NUM_THREADS', 1)
))

# Create the Dask workers
# initialize(interface='cn30', nthreads=num_threads)

# Create the Dask object that will manage the communications
client = Client()

start = datetime.now()
# run_test(client=client)
end = datetime.now()

print(f"Time taken: {end - start}")

Time taken: 0:00:00.000060


In [7]:
print(client)

<Client: 'tcp://127.0.0.1:40583' processes=4 threads=16, memory=32.00 GiB>


In [1]:
from dask_mpi import initialize

In [3]:
pip install dask_mpi

Collecting dask_mpi
  Downloading dask_mpi-2022.4.0-py3-none-any.whl (11 kB)
Installing collected packages: dask_mpi
Successfully installed dask_mpi-2022.4.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
import multiprocessing



In [None]:
train_grids[train_grids is 'nan'] = 0

In [7]:
train_grids.to_dense()

RuntimeError: [enforce fail at alloc_cpu.cpp:73] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 1290240000000 bytes. Error code 12 (Cannot allocate memory)

In [5]:
print("Memory utilised (bytes): ", sys.getsizeof(train_grids))

Memory utilised (bytes):  72


In [3]:
import torch
grid=torch.zeros((10,400,400,400,4))

In [4]:
grid2=torch.zeros((10,400,400,400,4))

In [5]:
grid2.shape[1:]

torch.Size([400, 400, 400, 4])

In [15]:
dim=grid.shape[1:]
print(dim)
grid3= torch.zeros((grid.shape[0] + grid2.shape[0], (400, 400, 400, 4)))

torch.Size([400, 400, 400, 4])


TypeError: zeros(): argument 'size' must be tuple of ints, but found element of type tuple at pos 2

In [6]:
grid3=torch.cat((grid.to_sparse(), grid2.to_sparse()), 0)

In [7]:
grid3.shape

torch.Size([20, 400, 400, 400, 4])

In [8]:
grid3

tensor(indices=tensor([[ 2,  2, 12, 12],
                       [ 0,  0,  0,  0],
                       [16, 16, 16, 16],
                       [ 3,  3,  3,  3],
                       [ 0,  2,  0,  2]]),
       values=tensor([1., 1., 1., 1.]),
       size=(20, 400, 400, 400, 4), nnz=4, layout=torch.sparse_coo)

In [16]:
import sys
print("Memory utilised (bytes): ", sys.getsizeof(grid3))

Memory utilised (bytes):  72


In [5]:
grid[2,0,16,3] += torch.tensor([1.,0.,1,0.])
grid2[2,0,16,3] += torch.tensor([1.,0.,1,0.])

In [None]:
grid[2,0,16,3]

In [None]:
grid.shape

In [None]:
train_grids.shape

In [None]:
import numpy as np
from scipy import sparse

In [None]:
grid=np.zeros((100000,400,400,400,72),dtype=np.float16)

In [None]:
b=sparse.csr_matrix(train_grids.ravel())

In [None]:
print("Memory utilised (bytes): ", sys.getsizeof(b))

In [None]:
b_dense = b.todense()
b_dense.shape

In [None]:
import torch
b_d_tensor = torch.Tensor(b_dense)

In [None]:
b_d_tensor.shape

In [None]:
bd_right_shape_tensor = b_d_tensor.reshape((10, 400, 400, 400, 72))

In [None]:
print("Memory utilised (bytes): ", sys.getsizeof(bd_right_shape_tensor))

In [None]:
bd_right_shape_tensor.shape

In [None]:
train_grids.shape

In [15]:
import warnings
import requests
import os
import re
import glob
import pandas as pd
import openbabel
import numpy as np
from plip.structure.preparation import PDBComplex
from plip.exchange.report import BindingSiteReport
from rdkit import Chem
from rdkit.Chem import AllChem
from biopandas.pdb import PandasPdb
from Bio.PDB.SASA import ShrakeRupley
from Bio.PDB import PDBParser
import oddt
from oddt import fingerprints
warnings.filterwarnings("ignore")

class PLIF:
    def __init__(self, PDB: str, MOL_SPLIT_START: int = 70, **kwargs):
        kwargs.setdefault('aggr', 'add')
        super(PLIF,self).__init__()
        
        self.MOL_SPLIT_START=MOL_SPLIT_START
        self.pdb=PDB
        self.records=['ATOM']
        self.values=['HOH','CL','MG','ZN','MN','CA']
        self.ions=['CL','MG','ZN','MN','CA']
        self.interaction_slices={"hydrophobic":[0,1,6,7,8,9,10],
            "hbond":[0,1,7,11,13,15,16],
            "waterbridge":[0,1,[6,7],11,13,16,17],
            "saltbridge":[0,1,7,10,3,11,12],
            "pistacking":[0,1,7,11,6,12,13],
            "pication":[0,1,7,11,3,12,13],
            "halogen":[0,1,7,10,12,14,15],
            "metal":[0,1,11,8,6,17,16]} 

        self.column_names = ['RESNR', 'RESTYPE', 'DIST', 'LIG_IDX','PROT_IDX','FRAGMENT_ATOMS_COORDS', 'AA_COORDS']
        self.path = os.getcwd()


    def okToBreak(self, bond):
        """
        Here we apply a bunch of rules to judge if the bond is OK to break.

        Parameters
        ----------
        bond :
            RDkit MOL object

        Returns
        -------
        Boolean :
            OK or not to break.
        """
        # See if the bond is in Ring (don't break that)
        if bond.IsInRing():
            return False
        # We OK only single bonds to break
        if bond.GetBondType() != Chem.rdchem.BondType.SINGLE:
            return False

        # Get the beginning atom of the bond
        begin_atom = bond.GetBeginAtom()
        # Get the ending atom of the bond
        end_atom = bond.GetEndAtom()
        # What kind of neighbors does these end and begenning atoms have? We need a family of no less than 5!
        neighbor_end=list(end_atom.GetNeighbors())
        neighbor_begin=list(begin_atom.GetNeighbors())
        if (len(neighbor_end) + len(neighbor_begin)) <5:
            return False
        #for atm in neighbor_end:
            #print(atm.GetAtomicNum())
        #print(begin_atom.GetAtomicNum(), end_atom.GetAtomicNum(), MOL_SPLIT_START)
        
        # Now check if end or begenning atoms are in ring (we dont wanna bother those)
        if not(begin_atom.IsInRing() or end_atom.IsInRing()):
            return False
        elif begin_atom.GetAtomicNum() >= self.MOL_SPLIT_START or \
                end_atom.GetAtomicNum() >= self.MOL_SPLIT_START:
            return False
        elif end_atom.GetAtomicNum() == 1:
            return False
        else:
            return True

    def undo_id_label (self, frag, split_id):
        # I am trying to restore Hydrogens where the break happened
        for i, atom in enumerate(frag.GetAtoms()):
            if atom.GetAtomicNum() >= split_id:
                atom.SetAtomicNum(1)

        return frag

    # Divide a molecule into fragments
    def split_molecule(self, mol, pdb):

        split_id = self.MOL_SPLIT_START

        res = []
        res_no_id=[]

        to_check = [mol]
        while len(to_check) > 0:
            ms = self.spf(to_check.pop(), split_id)
            if len(ms) == 1:
                res += ms
            else:
                to_check += ms
                split_id += 1
        for frag in res:
            res_no_id.append(self.undo_id_label(frag, self.MOL_SPLIT_START))

        res_pdb_frags=[]

        for idx, frag in enumerate(res_no_id):
            w = Chem.PDBWriter(f"tmp_{pdb}_{self.MOL_SPLIT_START+idx}.pdb")
            w.write(frag)
            w.close()
            
            unwanted_entries= ['CONECT', 'END']            
            with open(f"tmp_{pdb}_{self.MOL_SPLIT_START+idx}.pdb") as oldfile, open(f"{pdb}_{self.MOL_SPLIT_START+idx}.pdb", 'w') as newfile:
                for line in oldfile:
                    if not any(unwanted_entry in line for unwanted_entry in unwanted_entries):
                        newfile.write(line)

                    
            data = data2 = ""

            # Reading data from file1
            with open(f"ATOM_{pdb}.pdb") as fp:
                data = fp.read()

            # Reading data from file2

            with open(f"{pdb}_{self.MOL_SPLIT_START+idx}.pdb") as fp:
                data2 = fp.read()
            
            # Merging 2 files
            # To add the data of file2
            # from next line
            #data += "\n"
            data += data2
            
            with open(f"HOH_{pdb}.pdb") as fp:
                data3 = fp.read()
            data += data3

            with open (f"ATOM_{pdb}_{self.MOL_SPLIT_START+idx}.pdb", 'w') as fp:
                fp.write(data)
            res_pdb_frags.append(f"ATOM_{pdb}_{self.MOL_SPLIT_START+idx}.pdb")
        return res_pdb_frags #create_chain(res)


    # Function for doing all the nitty gritty splitting work.
    # loops over bonds until bonds get exhausted or bonds are ok to break, whichever comes first. If ok to break, then each
    # fragment needs to be checked individually again through the loop
    def spf(self, mol, split_id):

        bonds = mol.GetBonds()
        for i in range(len(bonds)):
            if self.okToBreak(bonds[i]):
                mol = Chem.FragmentOnBonds(mol, [i])
                # Dummy atoms are always added last
                n_at = mol.GetNumAtoms()
                print('Split ID', split_id)
                mol.GetAtomWithIdx(n_at-1).SetAtomicNum(split_id)
                mol.GetAtomWithIdx(n_at-2).SetAtomicNum(split_id)
                return Chem.rdmolops.GetMolFrags(mol, asMols=True)

        # If the molecule could not been split, return original molecule
        return [mol]
    #get_fragments(fragment_mols)

    def retreive_plip_interactions(self, pdb_file):
        """
        Retreives the interactions from PLIP.

        Parameters
        ----------
        pdb_file :
            The PDB file of the complex. 

        Returns
        -------
        dict :
            A dictionary of the binding sites and the interactions.
        """
        protlig = PDBComplex()   #instantiate the loader from PLIP
        protlig.load_pdb(pdb_file)   # load the pdb file
        for ligand in protlig.ligands:
            protlig.characterize_complex(ligand)   # find ligands and analyze interactions
        sites = {}
        # loop over binding sites
        for key, site in sorted(protlig.interaction_sets.items()):
            binding_site = BindingSiteReport(site)   # collect data about interactions
            # tuples of *_features and *_info will be converted to pandas DataFrame
            keys = (
                "hydrophobic",
                "hbond",
                "waterbridge",
                "saltbridge",
                "pistacking",
                "pication",
                "halogen",
                "metal"
            )
        # interactions is a dictionary which contains relevant information for each
        # of the possible interactions: hydrophobic, hbond, etc. in the considered
        # binding site. Each interaction contains a list with 
        # 1. the features of that interaction, e.g. for hydrophobic:
        # ('RES_number', 'RES_type', ..., 'LIG_coord', 'PROT_coord')
        # 2. information for each of these features, e.g. for hydrophobic
        # ('RES_number', 'RES_type', ..., 'LIG_coord', 'PROT_coord')

            interactions = {
                k: [getattr(binding_site, k + "_features")] + getattr(binding_site, k + "_info")
                for k in keys
            }
            sites[key] = interactions
        return sites

    def get_coords_prot(self, RESNR):
        ppdb = PandasPdb()
        ppdb.read_pdb(f"{self.pdb.split('.')[0]}_protein.pdb")
        only_protein=ppdb.df['ATOM']
        resnr_coords=[]
        for i in RESNR:
            resnr_coords.append(list(only_protein[only_protein['atom_number']==int(i)][['x_coord', 'y_coord', 'z_coord']].values[0]))
        return resnr_coords
    
    def interaction_df(self, split):

        all_interactions_df = pd.DataFrame()


        # We create the dictionary for the complex of interest:
        for idx, s in enumerate(split):

            pdb_id=s.split('.')[0]
            raw=pdb_id.split('_')[1]
            idx_frag=int(pdb_id.split('_')[2])
            interactions_by_site = self.retreive_plip_interactions(f"{pdb_id}.pdb")

            # Let’s see how many binding sites are detected:

    #         print(
    #             f"Number of binding sites detected in {pdb_id} : "
    #             f"{len(interactions_by_site)}\n"
    #             f"with {interactions_by_site.keys()}"
    #         )
            # In this case, the first binding site containing ligand 03P will be further investigated.
            index_of_selected_site = 0
            selected_site = list(interactions_by_site.keys())[index_of_selected_site]
            #print(selected_site)


            valid_types = [
                    "hydrophobic",
                    "hbond",
                    "waterbridge",
                    "saltbridge",
                    "pistacking",
                    "pication",
                    "halogen",
                    "metal",
                ]

            for _type in valid_types:
                output_df=self.create_df_from_binding_site(raw, interactions_by_site[selected_site], idx+self.MOL_SPLIT_START, selected_site,
                                                      interactions_by_site,
                                                      interaction_type=_type)
                all_interactions_df=all_interactions_df.append(output_df)
        all_interactions_df = all_interactions_df[all_interactions_df['RESNR'].notna()]
        #all_interactions_df.to_csv(f"{self.path}/results_plifs/{raw}_plifs_and_properties.csv", index=False)
        return all_interactions_df


    # We can construct a pandas.DataFrame for a binding site and particular interaction type.

    def create_df_from_binding_site(self, raw, selected_site_interactions, fragment_idx, selected_site, 
                                    interactions_by_site, interaction_type="hbond"):
        """
        Creates a data frame from a binding site and interaction type.

        Parameters
        ----------
        selected_site_interactions : dict
            Precalculated interactions from PLIP for the selected site
        interaction_type : str
            The interaction type of interest (default set to hydrogen bonding).

        Returns
        -------
        pd.DataFrame :
            DataFrame with information retreived from PLIP.
        """
        # check if interaction type is valid:
        valid_types = [
            "hydrophobic",
            "hbond",
            "waterbridge",
            "saltbridge",
            "pistacking",
            "pication",
            "halogen",
            "metal",
        ]


        if interaction_type not in valid_types:
            print("!!! Wrong interaction type specified. Hbond is chosen by default !!! \n")
            interaction_type = "hbond"

        def interaction_values(n):
            try:
                interactions=interactions_by_site[selected_site][interaction_type]
                if type(n) is list:
                    return [interactions[1:][x][i] for x in 
                        range(len(interactions[1:])) for i in n]
                else:
                    return [interactions[1:][x][n] for x in 
                        range(len(interactions[1:]))]
            except Exception:
                return None
            
        if interactions_by_site[selected_site][interaction_type][1:]:
            #print(list(map(interaction_values, self.interaction_slices[interaction_type])), self.column_names)
            selected_feats=list(map(interaction_values, self.interaction_slices[interaction_type]))
            #print(selected_feats)
            try: 
                if int(selected_feats[4])>int(selected_feats[3]):
                    selected_feats[3], selected_feats[4] = selected_feats[4], selected_feats[3]  
            except: 
                if int(any(selected_feats[4]))>int(any(selected_feats[3])):
                    selected_feats[3], selected_feats[4] = selected_feats[4], selected_feats[3] 
            df = pd.DataFrame(
                # data is stored AFTER the columns names
                [selected_feats],
                # column names are always the first element - we skipped that in the above - we are gonna use that for naming the df
                columns = self.column_names
            )

            df["INTERACTION_TYPE"]=interaction_type
            
            try:
                checked_coords=self.get_coords_prot(selected_feats[4][0].split(',') if ',' in selected_feats[4][0] \
                                                                   else selected_feats[4])
            except:
                checked_coords=selected_feats[6]
                
            df["AA_COORDS"]=[checked_coords]
                #[self.get_coords_prot(selected_feats[4].split(','))]
            df["FRAGMENT_ATOMS_COORDS"]=[selected_feats[5]]
                            #[self.get_coords_lig(selected_feats[3].split(','))]    
            df['FRAGMENT_ID']=fragment_idx

            # ideally we would like to exclude waters from further processing. Threrfore let us reduce any waterbridge 
            # interaction to the eucladean distance in order to omit water
            
            if interaction_type == "waterbridge":
                df['DIST']=[[np.linalg.norm(x) for x in df['DIST'].to_numpy()]]
                
            # also deal with one distance value and two coords, this is common in saltbridge interactions:
            if len(checked_coords) == len(selected_feats[2])*2:
                df['DIST']=[selected_feats[2] + selected_feats[2]]
                
        else:

            df= pd.DataFrame({'RESNR':[None], 'RESTYPE':[None], 'DIST':[None], 'LIG_IDX':[None],'PROT_IDX':[None],
                        'INTERACTION_TYPE':[interaction_type], "AA_COORDS": [None], "FRAGMENT_ATOMS_COORDS":[None],
                              'FRAGMENT_ID':[str(fragment_idx)]})



        return df
    
    def pdb_2_sdf(self, pdb):
        obConversion = openbabel.OBConversion()
        obConversion.SetInAndOutFormats("pdb", "sdf")
        mol = openbabel.OBMol()
        obConversion.ReadFile(mol, pdb)   # Open Babel will uncompress automatically

        mol.AddHydrogens()


        obConversion.WriteFile(mol, f"{pdb.split('.')[0]}.sdf")
        return f"{pdb.split('.')[0]}.sdf"
    
    def sdf_2_pdb(self, sdf):
        obConversion = openbabel.OBConversion()
        obConversion.SetInAndOutFormats("sdf", "pdb")
        mol = openbabel.OBMol()
        obConversion.ReadFile(mol, sdf)   # Open Babel will uncompress automatically

        mol.AddHydrogens()
        obConversion.WriteFile(mol, f"{sdf.split('.')[0]}.pdb")
        return f"HETATM_{sdf.split('.')[0]}.pdb"

    def save_bpdb(self, pdb,ppdb, record):  
        ppdb.to_pdb(path=f"{record}_{pdb.split('.')[0].split('_')[0]}.pdb",
                    records=[record],
                    gz=False, 
                    append_newline=True)

    def get_HOH_pdb(self, pdb):
        ppdb = PandasPdb() 
        ppdb.read_pdb(pdb) 
        ppdb.df['HETATM']=ppdb.df['HETATM'].loc[ppdb.df['HETATM']['residue_name'].isin(self.values)]
        ppdb.to_pdb(path=f"HOH_{pdb.split('.')[0].split('_')[0]}.pdb",
                records=['HETATM'],
                gz=False, 
                append_newline=True)

    def keep_relevant_hetatm(self, pdb):
        raw=str(self.pdb).split('.')[0]
        with open(pdb) as f1, open(f"ATOM_{pdb.split('.')[0].split('_')[0]}.pdb", 'w') as f2:
            for line in f1:
                if 'ATOM' in line:
                    f2.write(line)
        with open(f'{raw}_ligand.pdb') as f1, open(f"HETATM_{pdb.split('.')[0].split('_')[0]}.pdb", 'w') as f2:
            for line in f1:
                if ('HETATM' in line) and not any(ion in line for ion in self.ions):
                    f2.write(line)
        try: 
            self.get_HOH_pdb(pdb)
        except:
            with open(pdb) as f1, open(f"HOH_{pdb.split('.')[0].split('_')[0]}.pdb", 'w') as f2:
                for line in f1:
                    if ('HETATM' in line) and any(ion in line for ion in self.ions):
                        f2.write(line)
        return
    
    
    def fragment_and_plif(self):
        path = os.getcwd()
        if not os.path.exists('results_plifs'):
            os.mkdir(f'{path}/results_plifs')

        raw=str(self.pdb).split('.')[0]
        self.sdf_2_pdb(f'{raw}_ligand.sdf')
        self.keep_relevant_hetatm(f'{raw}_protein.pdb')
        fragment_mols = Chem.SDMolSupplier(str(f'{raw}_ligand.sdf'), removeHs=True, sanitize=False)
        fragment_mols_alt = Chem.MolFromMol2File(f'{raw}_ligand.mol2', sanitize=True, removeHs=True)
        content = open(f'{raw}_ligand.pdb').read()
        hets=re.findall("^HETATM (.*)", content, re.M)
        if len(hets)<5:
            # Read in the file
            with open(f'{raw}_ligand.pdb', 'r') as file :
                filedata = file.read()

            # Replace the target string
            filedata = filedata.replace('ATOM  ', 'HETATM')

            # Write the file out again
            with open(f'{raw}_ligand.pdb', 'w') as file:
                file.write(filedata)
        
        try: 
            fragment_mols = Chem.RemoveHs(fragment_mols[0])
            output_df = self.interaction_df(self.split_molecule(fragment_mols,raw))
            
        except:  
            try: 
                output_df = self.interaction_df(self.split_molecule(fragment_mols_alt))
                
            except:
                try: 
                    fragment_mols = Chem.SDMolSupplier(str(f'{raw}_ligand.sdf'), removeHs=True, sanitize=False)
                    output_df = self.interaction_df(self.split_molecule(fragment_mols[0],raw))
                except:
                    try:
                        fragment_mols = AllChem.MolFromPDBFile(f'{raw}_ligand.pdb')
                        output_df = self.interaction_df(self.split_molecule(fragment_mols,raw))
                    except:
                        try:
                            fragment_mols = AllChem.MolFromPDBFile(f'HETATM_{raw}.pdb')
                            output_df = self.interaction_df(self.split_molecule(fragment_mols,raw))
                        except:
#                             raise Exception(f"Sorry, the pdb id: {raw} needs chekcing")
                            output_df= pd.DataFrame({'FRAGMENT_ID':[None], 'AA_COORDS':[None], 
                                                     'FRAGMENT_ATOMS_COORDS':[None], 
                                         'INTERACTION_TYPE':[None],'DIST':[None]})
        os.chdir(f'{path}')

        return output_df.groupby('FRAGMENT_ID')['AA_COORDS', 'FRAGMENT_ATOMS_COORDS','INTERACTION_TYPE','DIST'].agg(list)

if __name__ == "__main__":
    
    p_directory = os.getcwd()
    directory='/groups/cherkasvgrp/share/progressive_docking/hmslati/plif_cnn/general_refined_set/*'
#     file_list=[pdb_id.split('/')[-1] for pdb_id in glob.glob(directory)]
#     for pdb_id in file_list:
    pdb_id = '5va9'
    os.chdir(f'/groups/cherkasvgrp/share/progressive_docking/hmslati/plif_cnn/general_refined_set/{pdb_id}')
#     os.chdir(f'/PDBbind_v2020_refined/refined_set/{pdb_id}')

    df_plifSpecs = PLIF(PDB = f'{pdb_id}.pdb').fragment_and_plif()
    
    if not len(df_plifSpecs) or df_plifSpecs['INTERACTION_TYPE'].apply(', '.join).unique()[0]=='metal':
        print(f"Sorry, the pdb id: {pdb_id} needs chekcing")

[21:44:10] Can't kekulize mol.  Unkekulized atoms: 24 25 26 27 28
[21:44:10] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20
[21:44:10] Explicit valence for atom # 101 Rn greater than permitted


Split ID 70
Split ID 70
Split ID 71
Split ID 72
Split ID 73
Split ID 74
Split ID 75
Split ID 76
Split ID 77
Split ID 78
Split ID 79
Split ID 80
Split ID 81
Split ID 82
Split ID 83
Split ID 84
Split ID 85
Split ID 86
Split ID 70
Sorry, the pdb id: 5va9 needs chekcing


In [12]:
df_plifSpecs['INTERACTION_TYPE'].apply(', '.join).unique()[0]

'metal'

In [6]:
df_plifSpecs

Unnamed: 0_level_0,AA_COORDS,FRAGMENT_ATOMS_COORDS,INTERACTION_TYPE,DIST
FRAGMENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
70,"[[(61.653, 29.816, 11.629), (61.653, 29.816, 1...","[[(61.774, 30.976, 13.683), (61.881, 27.528, 1...",[metal],"[[2.36, 2.73, 2.33, 2.67, 2.28]]"
71,"[[(61.653, 29.816, 11.629), (61.653, 29.816, 1...","[[(61.774, 30.976, 13.683), (61.881, 27.528, 1...",[metal],"[[2.36, 2.73, 2.33, 2.67, 2.28]]"


In [2]:
import glob
import warnings
import requests
import os
import re
import glob
import pandas as pd
import openbabel
import numpy as np
from plip.structure.preparation import PDBComplex
from plip.exchange.report import BindingSiteReport
from rdkit import Chem
from rdkit.Chem import AllChem
from biopandas.pdb import PandasPdb
from Bio.PDB.SASA import ShrakeRupley
from Bio.PDB import PDBParser
import oddt
from oddt import fingerprints
warnings.filterwarnings("ignore")

records=['ATOM']
values=['HOH','CL','MG','ZN','MN','CA']
ions=['CL','MG','ZN','MN','CA']

def pdb_2_sdf(pdb):
    obConversion = openbabel.OBConversion()
    obConversion.SetInAndOutFormats("pdb", "sdf")
    mol = openbabel.OBMol()
    obConversion.ReadFile(mol, pdb)   # Open Babel will uncompress automatically

    mol.AddHydrogens()


    obConversion.WriteFile(mol, f"{pdb.split('.')[0]}.sdf")
    return f"{pdb.split('.')[0]}.sdf"

def sdf_2_pdb(sdf):
    obConversion = openbabel.OBConversion()
    obConversion.SetInAndOutFormats("sdf", "pdb")
    mol = openbabel.OBMol()
    obConversion.ReadFile(mol, sdf)   # Open Babel will uncompress automatically

    mol.AddHydrogens()
    obConversion.WriteFile(mol, f"{sdf.split('.')[0]}.pdb")
    return f"HETATM_{sdf.split('.')[0]}.pdb"

def save_bpdb(pdb,ppdb, record):  
    ppdb.to_pdb(path=f"{record}_{pdb.split('.')[0].split('_')[0]}.pdb",
                records=[record],
                gz=False, 
                append_newline=True)

def get_HOH_pdb(pdb):
    ppdb = PandasPdb() 
    ppdb.read_pdb(pdb) 
    ppdb.df['HETATM']=ppdb.df['HETATM'].loc[ppdb.df['HETATM']['residue_name'].isin(values)]
    ppdb.to_pdb(path=f"HOH_{pdb.split('.')[0].split('_')[0]}.pdb",
            records=['HETATM'],
            gz=False, 
            append_newline=True)

def keep_relevant_hetatm(raw, pdb):
    with open(pdb) as f1, open(f"ATOM_{pdb.split('.')[0].split('_')[0]}.pdb", 'w') as f2:
        for line in f1:
            if 'ATOM' in line:
                f2.write(line)
    with open(f'{raw}_ligand.pdb') as f1, open(f"HETATM_{pdb.split('.')[0].split('_')[0]}.pdb", 'w') as f2:
        for line in f1:
            if ('HETATM' in line) and not any(ion in line for ion in ions):
                f2.write(line)
    try: 
        get_HOH_pdb(pdb)
    except:
        with open(pdb) as f1, open(f"HOH_{pdb.split('.')[0].split('_')[0]}.pdb", 'w') as f2:
            for line in f1:
                if ('HETATM' in line) and any(ion in line for ion in ions):
                    f2.write(line)
    return
os.chdir(f'/groups/cherkasvgrp/share/progressive_docking/hmslati/plif_cnn/general_refined_set_cp/')
p_directory = os.getcwd()
counter=0
directory='/groups/cherkasvgrp/share/progressive_docking/hmslati/plif_cnn/general_refined_set_cp/*'
file_list=[pdb_id.split('/')[-1] for pdb_id in glob.glob(directory)]
for pdb_id in file_list:
    print(pdb_id)
    os.chdir(f'/groups/cherkasvgrp/share/progressive_docking/hmslati/plif_cnn/general_refined_set_cp/{pdb_id}')

    raw=pdb_id
    sdf_2_pdb(f'{raw}_ligand.sdf')
    keep_relevant_hetatm(raw, f'{pdb_id}_protein.pdb')
    fragment_mols = Chem.SDMolSupplier(str(f'{raw}_ligand.sdf'), removeHs=True, sanitize=False)
    fragment_mols_alt = Chem.MolFromMol2File(f'{raw}_ligand.mol2', sanitize=True, removeHs=True)
    content = open(f'{raw}_ligand.pdb').read()
    hets=re.findall("^HETATM (.*)", content, re.M)
    if len(hets)<5:
        # Read in the file
        with open(f'{raw}_ligand.pdb', 'r') as file :
            filedata = file.read()

        # Replace the target string
        filedata = filedata.replace('ATOM  ', 'HETATM')

        # Write the file out again
        with open(f'{raw}_ligand.pdb', 'w') as file:
            file.write(filedata)
#     os.chdir(f'/PDBbind_v2020_refined/refined_set/{pdb_id}')
    try: 
        assert Chem.RemoveHs(fragment_mols[0]) is not None
        print(fragment_mols.GetNumAtoms())
    except:  
        try: 
            assert fragment_mols_alt is not None
            fragment_mols_alt.GetBonds()
            print(fragment_mols_alt.GetNumAtoms())
        except:
            try: 
                assert fragment_mols[0] is not None
                fragment_mols[0].GetBonds()
                print(fragment_mols[0].GetNumAtoms())
            except:
                try:
                    fragment_mols = AllChem.MolFromPDBFile(f'{raw}_ligand.pdb')
                    assert fragment_mols is not None
                    fragment_mols.GetBonds()
                    print(fragment_mols.GetNumAtoms())
                except:
                    try:
                        fragment_mols = AllChem.MolFromPDBFile(f'HETATM_{raw}.pdb')
                        assert fragment_mols is not None
                        fragment_mols.GetBonds()
                        print(fragment_mols.GetNumAtoms())
                    except:
                         print(f"Sorry, the pdb id: {raw} needs chekcing")
    counter+=1
    #print(counter)

    os.chdir(f'{p_directory}')
    

2lbv
22
1lt6
21
4eoh
13
3zc5


[15:20:37] Can't kekulize mol.  Unkekulized atoms: 6 7 10 11 12


27
6hqy
22
1utl
10
4lwi
30
3t4p


[15:20:38] Can't kekulize mol.  Unkekulized atoms: 13 14 15 16 17 18 19 20 21


37
6oyz
40
5y13
22
4i11
24
5i2e
38
6n0q
31
2vvo


[15:20:38] Can't kekulize mol.  Unkekulized atoms: 29 30 31 32 33 34 35 36 37


29
4tk1
88
1p1n
15
5l2w
29
2yay




46
2y7i
12
1i7g
28
1nj5
30
5wei
21
3l7d
26
4v04
39
4qmz
29
6f3e


[15:20:40] Can't kekulize mol.  Unkekulized atoms: 4 12 14 15 16
[15:20:40] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8


29
3vby
16
4b7q
23
6qme
23
1ft7
25
2xln




18
2oz2
41
5ih9
32
5u0w
12
2ga2


[15:20:41] Can't kekulize mol.  Unkekulized atoms: 2 3 4 7 11


21
6eox
27
5ypp
8
5z95
31
4a4l
34
4efk
11
4us3


[15:20:42] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9 10 11 12 13


15
4alv
30
5he4
30
5jrq


[15:20:42] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 22 23 24 25 26


34
3k97
23
3cvk
34
3qck
19
1uyi
27
3oy0
21
2w67
14
5ivz
97
4c1y
12
3rr4


[15:20:43] Can't kekulize mol.  Unkekulized atoms: 0 1 2 4 5 6 8 9 10


18
1mnc
25
4gqr
23
5ur1
39
5qc4
45
1apv
36
6cwh
21
6qyp
39
1o3f
25
1szm


[15:20:44] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 9 10 11
[15:20:44] Can't kekulize mol.  Unkekulized atoms: 24 25 26 27 28 29 30 31 32


33
2xxy
15
1hxw
50
4dv8
29
5kly
57
3hxb


[15:20:45] Can't kekulize mol.  Unkekulized atoms: 6 26 27 28 29 30 31 32 33


46
4mvn
39
3u3u
23
4pax




13
5j8x
25
1c5y
12
5a14
36
5cas
34
5c7c
22
4a9i
18
5ukj


[15:20:46] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 12 13 14 29 30


31
5ab0


[15:20:46] Can't kekulize mol.  Unkekulized atoms: 35 36 37 38 39


187
4zuq
10
3juo
17
6hke
9
6g9x
10
2p33


[15:20:46] Can't kekulize mol.  Unkekulized atoms: 19 20 21 22 23 24 25 26 27


28
5g60
45
4g0k
33
6kdi
64
2c4w
24
1ru2


[15:20:47] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 8


49
4npv
22
4rxe
29
3ck8




77
5abw
36
4uco
23
6f6s


[15:20:48] Can't kekulize mol.  Unkekulized atoms: 18 19 20 21 22


23
4lq3
54
6cnk
12
3zmv
48
1sqp
33
5ory
14
6jad
23
6t1m


[15:20:49] Can't kekulize mol.  Unkekulized atoms: 0 6 7 8 9
[15:20:49] Can't kekulize mol.  Unkekulized atoms: 2 13 14 15 16 17 18 19 20


27
2b1r
23
1v2j
9
4m5g
21
5zma
24
4rh5
96


[15:20:50] Can't kekulize mol.  Unkekulized atoms: 3 4 8 9 10


3bh3
6
6e3n
18
1g4k
23
5c5t
10
5h08
32
5eh0
31
5q0n
38
1ntv
89
5lp1
46
3cyy
78
5nhf


[15:20:51] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 23


30
3qfy
10
2afw
11
2hnx
18
3qts


[15:20:52] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 10


23
2ovy
30
1tkx
21
6ic7
37
4bg1
12
6dvm
34
4ran




43
2vwl
34
4io2
10
5w5j
19
1tyr
22
2c9t


[15:20:54] Can't kekulize mol.  Unkekulized atoms: 64 65 66 67 68 69 70 71 72


91
5wmt
27
2ksb
95
4lh6
14
5un1
17
6czv
21
2gde
44
6cen
42
5agt
63
2zv9


[15:20:55] Explicit valence for atom # 0 B, 4, is greater than permitted
[15:20:55] Can't kekulize mol.  Unkekulized atoms: 69 70 71 72 73 74 75 76 77


21
5eoc
103
4zx8
20
6oh3
72
5lwd
55
1qi0




23
2i0v
24
5zdd




59
4mdt
104
4ke1
41
3ig7
25
2ivz


[15:20:56] Can't kekulize mol.  Unkekulized atoms: 42 43 44 45 46 47 48 49 50


110
6e8k
55
5ntq
31
6ixd
61
4zcw




20
4egh
15
5zw6
40
1d04
54
3qkv




18
4loi
68
6i78
35
2zhd


[15:20:57] Can't kekulize mol.  Unkekulized atoms: 2 6 7 8 9


29
4dkq
27
5u48
19
5mql
36
3fuf


[15:20:58] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 9
[15:20:58] Can't kekulize mol.  Unkekulized atoms: 99 100 101 102 103 104 105 106 107


10
3p0g
27
3iux
153
5wdw
37
1rq2
13
4ynl
46
2a14
26
2gpp
21
5ue4
24
6qls
33
1xh4
37
1k1n
42
3lzb
43
5var
18
3ozt
31
3dcv
21
5zia
141
2qoa
13
6bnt




109
6p1d
24
3rsx
16
4xya


[15:21:00] Can't kekulize mol.  Unkekulized atoms: 0 9 10 11 12 15 16 18 19
[15:21:00] Can't kekulize mol.  Unkekulized atoms: 60 61 62 63 64


20
6iam
100
1kwr
15
4bzo
30
1ydr
20
1rjk
30
5ey8




76
6ud2
56
1q6j




69
1nnb
20
3npa
20
3qps
29
4awj
13
4abb
11
6ayi
22
4g8v
23
6hlb
75
4kju
41
1xog
21
4rrv
74
6pg8
16
4e93
42
6c0r


[15:21:02] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 15


40
3fzc
29
4kcg
34
4ps3
31
3nu9
35
4ysi
48
3uh2
22
6ai9
37
6cvv




47
2f3r




87
3aau
28
3eid
36
6jio
18
3hiv


[15:21:04] Can't kekulize mol.  Unkekulized atoms: 8 43 54 66 69
[15:21:04] Can't kekulize mol.  Unkekulized atoms: 62 63 64 65 66


125
6i5n
163
2fyv
21
4b9h
57
3kbz




30
4qhc
21
6fmf
40
2xne
29
4kv9


[15:21:05] Can't kekulize mol.  Unkekulized atoms: 18 19 20 23 27


42
4yc9
45
4p4b
52
5kbe
7
5nbw
21
3q6z
59
1a30




26
2z3h
30
4ck3
26
4anm
22
4b72
30
2fb8
34
3m3c
35
3b1m


[15:21:06] Can't kekulize mol.  Unkekulized atoms: 0 1 16 23 24


38
3fcl
25
3p3u
20
4kax
47
6ms1


[15:21:07] Can't kekulize mol.  Unkekulized atoms: 22 23 24 27 31


58
4ib5
98
4y2b


[15:21:07] Can't kekulize mol.  Unkekulized atoms: 78 79 80 81 82
[15:21:07] Can't kekulize mol.  Unkekulized atoms: 0 9 11 16 19


22
3myg
30
3pyy
25
3pdq
21
6ggn
38
6q7w
22
4wr7
18
2pv1


[15:21:08] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9 10 11 12 13


66
4yjl
108
5f29
70
1x38
20
1ppl
92
6guh
26
1q91




47
6uwp
27
1uk0
28
4x6k
15
5jr2
98
6hv7


[15:21:09] Can't kekulize mol.  Unkekulized atoms: 76 77 78 79 80 81 82 83 84


41
4jlm
29
4qtn
18
1hef
48
6mlj
12
3mw1
33
2xsb


[15:21:09] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 12 13 14 15 17


15
3g3d
34
3kec




33
3u5l
23
5u7k
27
5yij


[15:21:10] Can't kekulize mol.  Unkekulized atoms: 36 37 41 42 43
[15:21:10] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 10 11 12 13 21


72
5ngt
16
4wkp
26
1h24
78
1wv7
43
2emt


[15:21:10] Can't kekulize mol.  Unkekulized atoms: 30 31 32 33 34 35 36 37 38
[15:21:11] Can't kekulize mol.  Unkekulized atoms: 48 49 50 51 52


132
4jj7
49
6h7f


[15:21:11] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4


25
4k5y
24
2z8e
15
1njb
33
4hmq




49
6hop
14
1urc
46
5dd9
26
4asj
27
5fqv
27
6mj4
63
5t4f
33
3mbp


[15:21:12] Can't kekulize mol.  Unkekulized atoms: 0 3 4 5 8


34
3dp4
13
2ole
30
4bt9
55
5yg4


[15:21:13] Can't kekulize mol.  Unkekulized atoms: 3 4 7 31 32


34
4j2t
51
3nq9
10
6rna
31
1gu3
56
6h50
22
4nrm
13
4cpw
52
5wxh
57
5lvr
15
6f8j
26
4aq4
19
3t01




10
1okx
67
4hy5
43
3ff3
10
3suf
54
6baw
30
1odj
20
2vr0


[15:21:16] Can't kekulize mol.  Unkekulized atoms: 0 2 3 6 7 8 9 10 11


40
3tv5
36
4p0b
34
4gh6
31
6gj5
24
3gbq
86
6h5w


[15:21:16] sanitise [15:21:16] 2vr0_ligand: [15:21:16] Can't kekulize mol.  Unkekulized atoms: 0 5 6 7 8 14 15 16 22


27
3udy
27
3ex6


[15:21:17] Can't kekulize mol.  Unkekulized atoms: 2


34
6asz
51
4o4k
15
5ap2
35
1qaw


[15:21:17] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9 10 11 12 13


15
2qu6
36
4b32
11
4std
20
5mpz
14
6g2r
31
1vj5
17
4ozo
21
6mo1
33
4oeg
41
6djj
13
1fkg
33
3rw9
23
2bw7
21
1iky
27
6n3e
65
4gao
68
4q1x
38
2ydm


[15:21:19] Can't kekulize mol.  Unkekulized atoms: 24 25 26 27 28 29 30 31 32


14
5d1s
19
6jof
28
4oc1
30
6dh4
39
2xah




106
5n2x
66
3l16
33
3dnj
26
3rpr
25
4u71


[15:21:21] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[15:21:21] Can't kekulize mol.  Unkekulized atoms: 0 1 14 17 21 22 23 29 30


29
3kac
14
6eyz
34
4gm3
46
6edr
73
2weq
40
1ugw




12
5qar
21
2ql9
33
5ho6
38
4ocx
33
1zp5
23
3cke


[15:21:22] Can't kekulize mol.  Unkekulized atoms: 14 15 17 18 19


52
4hiq
21
5k00
42
4zsh
22
6bw2
46
3h0z
43
2wq5
33
3f17
21
3ipa
6
4o72
30
5d6p
18
1azx
100
1ogx


[15:21:23] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6


20
2jkk
33
6jtc
16
2y68
35
6mtu
127
1mh5
62
3k5d


[15:21:23] Can't kekulize mol.  Unkekulized atoms: 12 13 14 15 16


37
3bm8
12
4ks3
28
4i9i
33
3iu9
16
4gd6


[15:21:24] Can't kekulize mol.  Unkekulized atoms: 2 5 10 14 15


32
5c3k
13
6o9x
21
5al2


[15:21:24] Can't kekulize mol.  Unkekulized atoms: 0 3 12 13 14


20
2wxh
38
3cpb
27
3qrj
41
3t3d
19
2hzi
29
5i8g
29
4tq3




39
5l8y
42
6eg9
39
6rvl


[15:21:26] Explicit valence for atom # 7 O, 3, is greater than permitted


21
4ygf
13
3s7m
25
4x5y


[15:21:26] Can't kekulize mol.  Unkekulized atoms: 35 36 37 38 39
[15:21:26] Can't kekulize mol.  Unkekulized atoms: 3 4 7 8 9


40
1a9q
10
4gki
25
4umq
30
3sow
52
1o4k
35
6f9g




6
3d9n
94
1hdt
39
1olu




45
1std
20
3hyg
27
3rlr


[15:21:27] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 13 14 15 19


23
4mpe
21
2pnx
89
6euz
26
1gvw
96
2pgj
53
5gjg


[15:21:28] Can't kekulize mol.  Unkekulized atoms: 23 24 25 26 27


42
2uw6
21
4jg1


[15:21:28] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20
[15:21:28] Can't kekulize mol.  Unkekulized atoms: 70 71 72 73 74


149
5cpr
24
3d78
14
3gvb
16
5os7
18
3hw1
16
1jmg




33
3r22
28
6hzd
71
2oym
22
6dkg
37
5moc
197
5bmm




52
3c43
31
1tnl
10
1u71
24
4dwb




36
1a0t
23
6c7f
30
1ljt
17
5aqv
28
1f92
43
5o4f
15
2ow7
11
4mnv
98
5tg4
17
1zsh




54
6qxd
28
3n1v
15
3pd2
25
1kui
32
2psu


[15:21:32] Can't kekulize mol.  Unkekulized atoms: 22 23 24 25 26 27 28 29 30
[15:21:32] Can't kekulize mol.  Unkekulized atoms: 0 1 2 9 10


36
6pge
14
4lkq
11
1lv8


[15:21:32] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 8 9 10


35
4eqj
48
2f80
38
5nwe
20
1jdj


[15:21:33] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 6 7 8 9


11
1t4s
8
3t6r
42
3qnj
69
4aua


[15:21:33] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 14


16
5fsm
14
1ke9
25
4dus
35
2p95
30
6oyh
56
2vci
34
5w6o
52
3prs
50
4wk1
68
3s3m




30
4eo4
29
2gej
63
3lcv
27
3pke


[15:21:35] Can't kekulize mol.  Unkekulized atoms: 2 4 5 8 9


27
1loq
34
6oi8
31




3zsx
34
5lca
23
4f3h


[15:21:36] Can't kekulize mol.  Unkekulized atoms: 13 14 15 18 22


68
1swr
16
4i3z




42
2zp0
38
2cia


[15:21:36] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 12


135
3i02
33
6ow2
34
5my8
37
4jsa
10
4mss
14
3r0h


[15:21:37] Can't kekulize mol.  Unkekulized atoms: 62 63 64 65 66 67 68 69 70
[15:21:37] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 12 13 14 15 16


79
3vbg
28
1bl6
22
4yv8
36
1kll
23
5gmu
25
6bqh
34
5u0e
14
4ibb
33
2l6e
124
4ehm
25
4jx9
17
4jc1
23
5f2f
33
3nzc
31
1fax
33
3qem
25
6hhj
46
1db5
28
2vot
22
3fvk
19
3i7e
39
4deh
27
4iu4


[15:21:39] Explicit valence for atom # 3 C, 6, is greater than permitted


13
3gb2
24
2a29
48
3wqw
56
4c1g
14
1puq
38
6gzm




25
5f00
26
1ct8




56
2v54
41
5eue
29
2zg3
20
5huw
77
4qdk
26
5zjf
24
3ft5
12
5y8y
25
6boy
59
4trw
44
3c0z
8
5ddf
29
6i8b
54
6mja
68
1bq3


[15:21:42] Can't kekulize mol.  Unkekulized atoms: 113 114 115 116 117


54
6q38
119
6ipl
46
3h98
33
6nk0
92
5lm6
21
6b59
26
4acg


[15:21:42] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 18


38
4k3p
45
5fas
21
4dcy
28
1x8b


[15:21:43] Explicit valence for atom # 0 N, 4, is greater than permitted
[15:21:43] Can't kekulize mol.  Unkekulized atoms: 4 5 7 8 9 10 13 17 18


25
5i7u
32
2xv1


[15:21:43] Can't kekulize mol.  Unkekulized atoms: 7 8 9 18 22


42
6hmg
22
1r10


[15:21:44] Explicit valence for atom # 0 B, 4, is greater than permitted


47
5db0
38
5fom
63
5tqu


[15:21:44] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9 10 26 27 28


29
1y91
33
4hyb
65
4r4c
41
6g9f
17
3mna
24
3ui7
24
2uxz
47
4f6v


[15:21:45] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8 9 18 19 20


25
6gjb
47
4aa1


[15:21:45] Can't kekulize mol.  Unkekulized atoms: 25 26 27 28 29


38
1qf5
53
3hcm
18
4oq3
30
3veu
40
2fes
32
3ms9
12
1owi
28
1oba
7
1y2j
21
4rj7
30
5n9l
26
3ggv
63
1vjb
29
5kqd
119
1j4k
131
3lp4




10
3nhi
23
4jt9
25
6hkx
29
5sz4
16
6aqq
27
4o2b
29
4hws
29
6r4s
47
6mt6
80
3dz6




28
5e2o
37
6dha
26
4rcp




90
5a8z
31
2yir


[15:21:48] Can't kekulize mol.  Unkekulized atoms: 1 7 8 9 10 11 12 13 14
[15:21:48] Can't kekulize mol.  Unkekulized atoms: 42 43 44 47 51


30
2rok
83
6npt
30
4z46
9
3nf3
61
6jaw
24
2c9b
21
4uoh
42
4k67




32
3zdg
13
3obq
62
6p5w
34
1d4l
43
2ot1




38
4xhl
18
2qlf
32
6mmo




37
6bky
14
3w37
44
1hte
31
3g4i
28
5khg




34
4gr8
65
5yjo
31
3dla


[15:21:52] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 7
[15:21:52] Can't kekulize mol.  Unkekulized atoms: 3 9 10 11 14 15 16 17 19


71
5nhy
21
1h39
29
5ea5
41
4prn
96
5upz


[15:21:52] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9


47
2q94
19
6ng0
29
1jcx


[15:21:52] Can't kekulize mol.  Unkekulized atoms: 4 12 14 15 16


47
2wqb
28
4v1f
13
6oxv
42
4mw4
23
4ih7
17
3rl3


[15:21:53] Can't kekulize mol.  Unkekulized atoms: 14 15 16 19 23


37
3tdh
37
4ybt
36
2rfn
45
3wdz
174
5j20
31
1ni1




33
1pvn




36
2jbv
90
4ufk
9
4nld
47
3lnj
101
4qlv


[15:21:55] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15 16 17 18 19


42
5t35
69
3zpr
17
4xtw
40
2qci
40
5hff
63
4g11
23
3e37
43
2xyd




89
1g7g
43
3gcv
38
4w9h
33
4piq
26
4gw5
95
2cf9
33
3usx
16
5cil
130
6hgx
28
3st5


[15:21:57] Can't kekulize mol.  Unkekulized atoms: 13 14 15 16 17 18 19 20 21


40
1ujk
125
5d21
18
5hd0




32
6ooy
20
5xsu
22
4ofl


[15:21:57] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 14 17 19
[15:21:57] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9


11
4otf
43
3mvl
30
6rfi
29
5l6p
21
2yer
21
1iep


[15:21:58] Can't kekulize mol.  Unkekulized atoms: 0 3 5 9 16


37
5q14
37
5inh
18
4urv
14
3ert
29
4j1h
30
1wur


[15:21:59] Can't kekulize mol.  Unkekulized atoms: 18


45
4nxo
33
4ncg
29
3lzu
38
2gga
19
2f5t




23
2jgs
16
1hiy




44
5qb0
15
6cvd


[15:22:02] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 8 9 10 11 12


13
2vj8
29
5acb


[15:22:02] Can't kekulize mol.  Unkekulized atoms: 12 23 24 25 26 27 30 31 36


40
1syi
17
5osd
16
3g0i
10
3u2q
92
1c4u
36
1nnu
20
2q54
48
1a52
20
6jn5
18
2pzi


[15:22:04] Explicit valence for atom # 7 C, 5, is greater than permitted


18
4abu
18
4apo
47
2hai
26
3m5e
23
1i9q
23
2hoc
22
1dy4
19
6e91
30
2pj9




59
4n7h
68
6ps1
21
3l9n
26
4uiy


[15:22:07] Can't kekulize mol.  Unkekulized atoms: 1 8 15 17 19 20 21 22 23


32
1l0a
143
3cj4
25
4q0l
28
5a7c
14
3ebi




50
3ljj
27
5i29
22
3zlr


[15:22:08] Can't kekulize mol.  Unkekulized atoms: 12 13 14 15 16


41
5aom
17
6dn6
38
6qha
22
2vo5
20
1fo3
16
3ia6
31
5tz3
29
4irx
12
2w3i
40
2bvr
27
3ral
27
2baj
27
1kzn


[15:22:11] Can't kekulize mol.  Unkekulized atoms: 39 40 41 42 43


49
4a6v
19
3u7n
27
2wks
24
1h46


KeyboardInterrupt: 

In [36]:
file_list

['2lbv',
 '1lt6',
 '4eoh',
 '3zc5',
 '6hqy',
 '1utl',
 '4lwi',
 '3t4p',
 '6oyz',
 '5y13',
 '4i11',
 '5i2e',
 '6n0q',
 '2vvo',
 '4tk1',
 '1p1n',
 '5l2w',
 '2yay',
 '2y7i',
 '1i7g',
 '1nj5',
 '5wei',
 '3l7d',
 '4v04',
 '4qmz',
 '6f3e',
 '3vby',
 '4b7q',
 '6qme',
 '1ft7',
 '2xln',
 '2oz2',
 '5ih9',
 '5u0w',
 '2ga2',
 '6eox',
 '5ypp',
 '5z95',
 '4a4l',
 '4efk',
 '4us3',
 '4alv',
 '5he4',
 '5jrq',
 '3k97',
 '3cvk',
 '3qck',
 '1uyi',
 '3oy0',
 '2w67',
 '5ivz',
 '4c1y',
 '3rr4',
 '1mnc',
 '4gqr',
 '5ur1',
 '5qc4',
 '1apv',
 '6cwh',
 '6qyp',
 '1o3f',
 '1szm',
 '2xxy',
 '1hxw',
 '4dv8',
 '5kly',
 '3hxb',
 '4mvn',
 '3u3u',
 '4pax',
 '5j8x',
 '1c5y',
 '5a14',
 '5cas',
 '5c7c',
 '4a9i',
 '5ukj',
 '5ab0',
 '4zuq',
 '3juo',
 '6hke',
 '6g9x',
 '2p33',
 '5g60',
 '4g0k',
 '6kdi',
 '2c4w',
 '1ru2',
 '4npv',
 '4rxe',
 '3ck8',
 '5abw',
 '4uco',
 '6f6s',
 '4lq3',
 '6cnk',
 '3zmv',
 '1sqp',
 '5ory',
 '6jad',
 '6t1m',
 '2b1r',
 '1v2j',
 '4m5g',
 '5zma',
 '4rh5',
 '3bh3',
 '6e3n',
 '1g4k',
 '5c5t',
 '5h08',
 

In [25]:
df_plifSpecs

Unnamed: 0_level_0,AA_COORDS,FRAGMENT_ATOMS_COORDS,INTERACTION_TYPE,DIST
FRAGMENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
70,"[[(28.507, -14.875, 6.05), (28.507, -14.875, 6...","[[(29.075, -14.822, 3.877), (27.378, -16.857, ...",[metal],"[[2.25, 2.28, 2.35, 2.18, 2.50]]"


In [2]:
import os
os.chdir(f'/groups/cherkasvgrp/share/progressive_docking/hmslati/plif_cnn/general_refined_set/3bbb')

In [10]:
import re
content = open("3bbb_ligand.pdb").read()
print(re.findall("^ATOM (.*)", content, re.M))

['     1  O5*   A A   1     -18.744 -22.299  44.582  1.00  0.00           O  ', '     2  C5*   A A   1     -18.829 -22.566  43.185  1.00  0.00           C  ', '     3  C4*   A A   1     -19.139 -21.309  42.388  1.00  0.00           C  ', '     4  O4*   A A   1     -19.503 -21.667  41.032  1.00  0.00           O  ', '     5  C3*   A A   1     -20.292 -20.451  42.917  1.00  0.00           C  ', '     6  O3*   A A   1     -19.790 -19.271  43.570  1.00  0.00           O  ', '     7  C2*   A A   1     -21.131 -20.137  41.673  1.00  0.00           C  ', '     8  C1*   A A   1     -20.246 -20.587  40.511  1.00  0.00           C  ', '     9  N9    A A   1     -20.916 -21.004  39.269  1.00  0.00           N  ', '    10  C8    A A   1     -20.378 -21.817  38.308  1.00  0.00           C  ', '    11  N7    A A   1     -21.161 -22.033  37.277  1.00  0.00           N  ', '    12  C5    A A   1     -22.300 -21.309  37.576  1.00  0.00           C  ', '    13  C6    A A   1     -23.513 -21.126  36.879 