In [1]:
import os, pickle
import pandas as pd

In [2]:
# Load the pickled feature mapping; its keys are the structure IDs that the cells below iterate over.
# NOTE(review): pickle.load can execute arbitrary code, so only open feature files from a trusted source.
with open("../../../training_data/8.Apos/Extra_set/features.pkl", "rb") as f:
    featuresd = pickle.load(f)

# Display the number of entries followed by the full mapping
len(featuresd), featuresd

(9,
 {'8sgj':     Residues                                                          \
           pdb label_entity_id label_asym_id label_seq_id auth_asym_id   
  0       8sgj               1             A           52            A   
  1       8sgj               1             A           53            A   
  2       8sgj               1             A           54            A   
  3       8sgj               1             A           55            A   
  4       8sgj               1             A           56            A   
  ..       ...             ...           ...          ...          ...   
  746     8sgj               1             A          941            A   
  747     8sgj               1             A          942            A   
  748     8sgj               1             A          943            A   
  749     8sgj               1             A          944            A   
  750     8sgj               1             A          945            A   
  
                       

# "Predict"

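Predictions are run on the AllositePro website; the downloaded results are then placed manually in each structure's directory.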

AllositePro doesn't find any sites for: **8vw5 (errors), 4jqi, 5uak, 6yhr, 7xlq, 8sgj, the AlphaFold model**

<br>

In [3]:
# Any missing
# Make sure every structure has its own results directory and report whether it
# already holds something that could be an AllositePro result.
for pdb in featuresd:
    pdbdir = f"{pdb}"
    # exist_ok=True makes this a no-op for directories that are already there
    os.makedirs(pdbdir, exist_ok=True)
    # Hidden entries (e.g. .ipynb_checkpoints, .DS_Store) are not results, so a
    # directory containing only those is still reported as missing -- the same
    # rule the processing cell applies when it collects the results.
    has_results = any(not entry.startswith(".") for entry in os.listdir(pdbdir))
    print("OK:" if has_results else "Missing:", pdb)

Missing: 8sgj
Missing: AF-A0A1D8PQM9-F1
OK: 7l6r
Missing: 8vw5
Missing: 6yhr
Missing: 7xlq
OK: 5b0u
Missing: 5uak
Missing: 4jqi


In [6]:
import tarfile

In [7]:
# Uncompress manually placed result
# For every structure, extract each downloaded *.tar.gz archive found in its
# directory (unless the target folder already exists) and report structures
# for which no archive is present.
for pdb in featuresd:
    pdbdir = f"{pdb}"

    # A structure directory that does not exist yet simply has no archive;
    # report it below instead of letting os.listdir raise FileNotFoundError.
    if os.path.isdir(pdbdir):
        archives = [f for f in os.listdir(pdbdir) if f.endswith(".tar.gz")]
    else:
        archives = []

    for archive in archives:
        # Folder the archive is expected to unpack into: the archive name
        # without its ".tar.gz" suffix.
        outdir = f"{pdbdir}/{archive[:-len('.tar.gz')]}"
        if os.path.isdir(outdir):
            continue  # already extracted on a previous run

        with tarfile.open(f"{pdbdir}/{archive}", 'r:gz') as tar:
            # The archives come from a web download, so refuse members that
            # would escape pdbdir (absolute paths, "..", unsafe links) when
            # the running Python provides extraction filters.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path=pdbdir, filter="data")
            else:
                tar.extractall(path=pdbdir)

    if not archives:
        print("Missing result .tar.gz:", pdb)

Missing result .tar.gz: 8sgj
Missing result .tar.gz: AF-A0A1D8PQM9-F1
Missing result .tar.gz: 6yhr
Missing result .tar.gz: 7xlq
Missing result .tar.gz: 2ldr
Missing result .tar.gz: 5uak
Missing result .tar.gz: 4jqi


In [8]:
from Bio import PDB

In [9]:
# Sanity check
def _count_residues(structure, standard_only=False):
    """Count the residues of a parsed structure over all its models and chains.

    With ``standard_only=True`` only residues whose hetero flag
    (``residue.id[0]``) is blank are counted, i.e. heteroatom residues are
    left out.
    """
    return sum(
        1
        for model in structure
        for chain in model
        for residue in chain
        if not standard_only or residue.id[0] == ' '
    )


# One parser instance is enough; it is reused for every file below.
pdb_parser = PDB.PDBParser(QUIET=True)

for pdb in featuresd:
    pdbdir = f"{pdb}"
    for d in os.listdir(pdbdir):
        # Only visible sub-directories are result folders
        if not os.path.isdir(f"{pdbdir}/{d}") or d.startswith("."):
            continue

        # The output PDB is named after the folder without its "_download" part
        pdbf = f"{pdbdir}/{d}/{d.replace('_download', '')}.pdb"
        if not os.path.isfile(pdbf):
            print("Missing pdb:", pdb)
            continue

        # Check that the length of the input pdb and output pdb (without
        # heteroatoms of the pocket spheres) are the same
        n_input = _count_residues(
            pdb_parser.get_structure(pdb, f"../structures/{pdb}.pdb")
        )
        n_output = _count_residues(
            pdb_parser.get_structure(pdb, pdbf), standard_only=True
        )
        if n_input != n_output:
            print("Input and output PDBs with different number of residues:", pdb)

# Processing

In [4]:
def process_allositepro_txt(file):
    """Parse an AllositePro ``.txt`` report into per-pocket properties.

    The parser expects a header line starting with ``pocket`` for each pocket,
    followed by ``name: value`` lines holding that pocket's numeric properties.

    Parameters
    ----------
    file : str or path-like
        Path of the report to read.

    Returns
    -------
    dict
        ``{pocket header line: {property name: float}}``. The mapping is built
        through a DataFrame, so a property that is missing from one pocket but
        present in another comes back as NaN for the former.

    Raises
    ------
    ValueError
        If a ``name: value`` line appears before any pocket header, or if a
        value cannot be converted to ``float``.
    """
    with open(file, "r") as f:
        lines = f.read().strip().split('\n')

    pockets = {}
    current_pocket = None

    for line in lines:
        if not line.strip():
            continue  # blank separator line, nothing to parse

        if line.startswith('pocket'):
            current_pocket = line
            pockets[current_pocket] = {}
            continue

        fields = line.split(':')
        if len(fields) != 2:
            # Not a "name: value" line: show it and skip it, so that the values
            # parsed from the previous line are never stored a second time.
            print(line)
            continue

        if current_pocket is None:
            raise ValueError(
                f"Property line found before any pocket header: {line!r}"
            )

        key, value = fields
        pockets[current_pocket][key.strip()] = float(value.strip())

    return pd.DataFrame(pockets).T.to_dict(orient="index")#.sort_values("hitScore", ascending=False)

In [5]:
import re

def process_allositepro_pml(file):
    """Extract the residues of every pocket from an AllositePro PyMOL script.

    Parameters
    ----------
    file : str or path-like
        Path of the ``.pml`` script containing ``cmd.select("pocketN", ...)``
        commands.

    Returns
    -------
    tuple
        ``(pockets_dict, chain_residue_match)`` where ``pockets_dict`` maps
        each pocket name to ``{'auth_asym_id': [...], 'auth_seq_id': [...]}``
        (two parallel lists of strings, one item per residue) and
        ``chain_residue_match`` is the last chain/residue regex match that was
        processed, or ``None`` when the script contains no such selection.
    """
    with open(file, "r") as f:
        pymol_text = f.read().strip()

    ## Lines can look like either:
    ## cmd.select("pocket2","///A/569+619+495+493+534+301+491+567+553+")
    ## cmd.select("pocket0","///A/187.0+15.0+250.0+251.0+186.0+247.0+254.0+") # PDBs with insertion codes

    # Pattern to match each pocket selection command
    pocket_pattern = re.compile(r'cmd\.select\("(?P<pocket>pocket\d+)",".+?"\)')
    # Pattern to match chain and residues within each selection command
    chain_residue_pattern = re.compile(r'///(?P<chain>\w+)/(?P<residues>[\d\.\+]+)')

    pockets_dict = {}
    # Defined up front so that a script without any selection still returns
    # cleanly instead of failing on an unbound name.
    chain_residue_match = None

    for pocket_match in pocket_pattern.finditer(pymol_text):
        pocket = pocket_match.group('pocket')
        pocket_entry = pockets_dict.setdefault(
            pocket, {'auth_asym_id': [], 'auth_seq_id': []}
        )

        for chain_residue_match in chain_residue_pattern.finditer(pocket_match.group(0)):
            chain = chain_residue_match.group('chain')

            for res in chain_residue_match.group('residues').split('+'):
                # Residues of PDBs with insertion codes are written as "187.0";
                # drop only that trailing ".0", never a ".0" inside the token.
                if res.endswith(".0"):
                    res = res[:-2]
                # Skips the empty token after the trailing '+' and anything
                # that is not a plain residue number.
                if res.isdigit():
                    pocket_entry['auth_seq_id'].append(res)
                    pocket_entry['auth_asym_id'].append(chain)

    return pockets_dict, chain_residue_match

In [6]:
# Collect, per structure, the pocket properties (.txt) and pocket residues (.pml)
# of the extracted AllositePro job.
allositepro_results = {}

for pdb in featuresd:
    pdbdir = f"{pdb}"
    # Hidden entries (names starting with ".") never count as results
    entries = [e for e in os.listdir(pdbdir) if not e.startswith(".")]
    if len(entries) == 0:
        print("Missing:", pdb)
        continue

    # The job id is taken from the name of the downloaded archive. A directory
    # that holds other files but no archive is reported and skipped instead of
    # aborting the whole loop with StopIteration.
    archive = next((e for e in entries if e.endswith(".tar.gz")), None)
    if archive is None:
        print("Missing result .tar.gz:", pdb)
        continue

    jobid = archive.replace("_download.tar.gz", "")
    d = f"{pdbdir}/{jobid}_download"

    pockets = process_allositepro_txt(f"{d}/{jobid}.txt")
    pockets_res, chain_residue_match = process_allositepro_pml(f"{d}/{jobid}.pml")
    # Both files must describe exactly the same set of pockets
    assert set(pockets.keys()) == set(pockets_res.keys())
    for p in list(pockets.keys()):
        # Attach the residue table (all columns as strings) to the pocket's properties
        pockets[p]["residues"] = pd.DataFrame(pockets_res[p], dtype=str)

    allositepro_results[pdb] = pockets

len(allositepro_results), allositepro_results

Missing: 8sgj
Missing: AF-A0A1D8PQM9-F1
Missing: 8vw5
Missing: 6yhr
Missing: 7xlq
Missing: 5uak
Missing: 4jqi


(2,
 {'7l6r': {'pocket0': {'Volume': 2222.145,
    'SASA': 1065.897,
    'Druggability Score': 0.576,
    'logitProb': 0.844,
    'nmaScore': 0.971,
    'hitScore': 0.869,
    'residues':    auth_asym_id auth_seq_id
    0             A        6972
    1             A        6935
    2             A        6996
    3             A        6947
    4             A        6932
    5             A        6931
    6             A        6999
    7             A        6913
    8             A        6897
    9             A        6824
    10            A        6946
    11            A        6871
    12            A        6896
    13            A        6872
    14            A        6911
    15            A        6844
    16            A        6823
    17            A        6929
    18            A        6971
    19            A        6822
    20            A        6828
    21            A        6841
    22            A        6912
    23            A        6928
    24          

In [7]:
# Persist the collected results (structure -> pocket -> properties plus residue table)
# as a pickle; the relative path resolves against the current working directory.
allositepro_resultsf = "allositepro_results.pkl"

with open(allositepro_resultsf, "wb") as f:
    pickle.dump(allositepro_results, f)