In [None]:
import os, pickle
import pandas as pd

In [4]:
# Load the feature dictionary (keyed by PDB ID) of the extra set.
# NOTE(review): pickle.load executes arbitrary code from the file; only use on trusted data.
features_path = "../../../training_data/7.Extra_set/features.pkl"
with open(features_path, mode="rb") as handle:
    featuresd = pickle.load(handle)

len(featuresd), featuresd

(16,
 {'7gqu': {'label_asym_id': ['D']},
  '7x6v': {'label_asym_id': ['A']},
  '7yg5': {'label_asym_id': ['V']},
  '8aq6': {'label_asym_id': ['IA']},
  '8cgw': {'label_asym_id': ['P']},
  '8f4s': {'label_asym_id': ['D']},
  '8jp0': {'label_asym_id': ['B']},
  '8qni': {'label_asym_id': ['B']},
  '8qns': {'label_asym_id': ['E']},
  '8r4b': [{'label_asym_id': ['B']}, {'label_asym_id': ['C']}],
  '8uk6': {'label_asym_id': ['C']},
  '8v81': {'label_asym_id': ['L']},
  '8vsd': {'label_asym_id': ['B', 'C']},
  '9ckc': {'label_asym_id': ['E']},
  '9dnm': {'label_asym_id': ['D']},
  '9fvb': {'label_asym_id': ['C']}})

# "Predict"

Predictions were run on the AllositePro website.

AllositePro doesn't find any sites for **9dnm** (nor for 8r4b and 8vsd; protein modulators)

<br>

In [7]:
# Report every entry whose result directory is absent or holds fewer than
# two items. Absent directories are created on the fly so that results can
# be dropped into them afterwards.
for pdb in featuresd:
    pdbdir = f"{pdb}"
    if os.path.isdir(pdbdir):
        if len(os.listdir(pdbdir)) >= 2:
            # Directory already populated: nothing to report.
            continue
    else:
        os.makedirs(pdbdir, exist_ok=True)
    print("Missing:", pdb)

Missing: 8aq6
Missing: 8r4b
Missing: 8vsd
Missing: 9dnm


In [8]:
import tarfile

In [9]:
# Unpack every manually placed result archive that has not been unpacked yet,
# and report the entries that have no archive at all.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# consider an extraction filter if the archives could come from an untrusted source.
for pdb in featuresd:
    pdbdir = f"{pdb}"
    archives = [name for name in os.listdir(pdbdir) if name.endswith(".tar.gz")]
    tar_exists = len(archives) > 0

    for archive in archives:
        outdir = f"{pdbdir}/{archive.replace('.tar.gz', '')}"
        if os.path.isdir(outdir):
            # Already extracted on a previous run.
            continue
        with tarfile.open(f"{pdbdir}/{archive}", 'r:gz') as tar:
            tar.extractall(path=pdbdir)

    if not tar_exists:
        print("Missing result .tar.gz:", pdb)

Missing result .tar.gz: 8r4b
Missing result .tar.gz: 8vsd
Missing result .tar.gz: 9dnm


In [11]:
from Bio import PDB

In [12]:
# Sanity check: for every extracted job directory, the PDB written by the
# tool must contain as many non-hetero residues as the input structure has
# residues in total.
for pdb in featuresd:
    pdbdir = f"{pdb}"
    pdb_exists = False
    for d in os.listdir(pdbdir):
        # Only visible sub-directories are treated as extracted results.
        if d.startswith(".") or not os.path.isdir(f"{pdbdir}/{d}"):
            continue

        pdbf = f"{pdbdir}/{d}/{d.replace('_download', '')}.pdb"
        if not os.path.isfile(pdbf):
            print("Missing pdb:", pdb)
            continue

        input_structure = PDB.PDBParser(QUIET=True).get_structure(pdb, f"../structures/{pdb}.pdb")
        n_input_residues = sum(
            1
            for model in input_structure
            for chain in model
            for residue in chain
        )

        output_structure = PDB.PDBParser(QUIET=True).get_structure(pdb, pdbf)
        # Hetero residues are skipped in the output (per the original note,
        # these are the pocket spheres added by the tool).
        n_output_residues = sum(
            1
            for model in output_structure
            for chain in model
            for residue in chain
            if residue.id[0] == ' '
        )

        if n_input_residues != n_output_residues:
            print("Input and output PDBs with different number of residues:", pdb)

# Processing

In [13]:
def process_allositepro_txt(file):
    """Parse an AllositePro pocket summary text file.

    The file is read as a sequence of sections: a line starting with
    ``pocket`` opens a new section, and every following ``key: value`` line
    is stored as a float property of that pocket.

    Blank lines are ignored. A line that does not split into exactly one key
    and one value around ``:``, or a property line that appears before the
    first pocket header, is printed and skipped (it is never stored).

    Parameters
    ----------
    file : str
        Path to the text file.

    Returns
    -------
    dict
        ``{pocket_line: {property: value}}``. The result is built through a
        DataFrame, so a property missing from one pocket but present in
        another is filled with NaN.

    Raises
    ------
    ValueError
        If the value of a ``key: value`` line cannot be converted to float.
    """
    with open(file, "r") as f:
        lines = f.read().strip().split('\n')

    pockets = {}
    current_pocket = None

    for line in lines:
        if not line.strip():
            continue

        if line.startswith('pocket'):
            current_pocket = line
            pockets[current_pocket] = {}
            continue

        parts = line.split(':')
        if current_pocket is None or len(parts) != 2:
            # Report the unparseable line and move on; storing anything here
            # would reuse the key/value of a previous line.
            print(line)
            continue

        key, value = parts
        pockets[current_pocket][key.strip()] = float(value.strip())

    return pd.DataFrame(pockets).T.to_dict(orient="index")#.sort_values("hitScore", ascending=False)

In [14]:
import re

def process_allositepro_pml(file):
    """Extract the residues of each pocket from an AllositePro PyMOL script.

    Selection commands can look like either of::

        cmd.select("pocket2","///A/569+619+495+493+534+301+491+567+553+")
        cmd.select("pocket0","///A/187.0+15.0+250.0+251.0+186.0+247.0+254.0+")

    The second form (residue numbers with a trailing ``.0``) is produced for
    PDBs with insertion codes; the suffix is stripped so that both forms
    yield plain integer strings. Tokens that are not purely digits after
    that (including the empty token left by the trailing ``+``) are dropped.

    Parameters
    ----------
    file : str
        Path to the ``.pml`` file.

    Returns
    -------
    tuple
        ``(pockets_dict, chain_residue_match)`` where ``pockets_dict`` maps
        each pocket name to ``{'auth_asym_id': [...], 'auth_seq_id': [...]}``
        (two parallel lists of strings) and ``chain_residue_match`` is the
        last chain/residue regex match found, or ``None`` when the file
        contains no such selection.
    """
    with open(file, "r") as f:
        pymol_text = f.read().strip()

    # Pattern to match each pocket selection command
    pocket_pattern = re.compile(r'cmd\.select\("(?P<pocket>pocket\d+)",".+?"\)')
    # Pattern to match chain and residues within each selection command
    chain_residue_pattern = re.compile(r'///(?P<chain>\w+)/(?P<residues>[\d\.\+]+)')

    pockets_dict = {}
    # Initialised up front so that a file without any selection still returns
    # a well-defined second element instead of raising UnboundLocalError.
    chain_residue_match = None

    for pocket_match in pocket_pattern.finditer(pymol_text):
        pocket = pocket_match.group('pocket')
        selection_command = pocket_match.group(0)
        entry = pockets_dict.setdefault(
            pocket,
            {
                'auth_asym_id': [],
                'auth_seq_id': []
            },
        )

        for chain_residue_match in chain_residue_pattern.finditer(selection_command):
            chain = chain_residue_match.group('chain')

            for res in chain_residue_match.group('residues').split('+'):
                if res.endswith(".0"):
                    # Strip only the trailing suffix; str.replace would also
                    # remove a ".0" occurring elsewhere in the token.
                    res = res[:-2]
                if res.isdigit():
                    entry['auth_seq_id'].append(res)
                    entry['auth_asym_id'].append(chain)

    return pockets_dict, chain_residue_match

In [15]:
# Collect, for every entry, the pocket scores (from the job's .txt file) and
# the pocket residues (from the job's .pml file) into a single dictionary.
allositepro_results = {}

for pdb in featuresd:
    pdbdir = f"{pdb}"
    entries = os.listdir(pdbdir)

    # The job id is derived from the name of the downloaded archive. Use a
    # default so that a directory holding other files but no archive is
    # reported as missing instead of raising StopIteration.
    archive = next((f for f in entries if f.endswith(".tar.gz")), None)
    if archive is None or all(e.startswith(".") for e in entries):
        print("Missing:", pdb)
        continue

    jobid = archive.replace("_download.tar.gz", "")
    d = f"{pdbdir}/{jobid}_download"

    pockets = process_allositepro_txt(f"{d}/{jobid}.txt")
    # The second return value is not used in this cell.
    pockets_res, chain_residue_match = process_allositepro_pml(f"{d}/{jobid}.pml")

    # Both files must describe exactly the same set of pockets.
    assert set(pockets) == set(pockets_res)
    for p in pockets:
        pockets[p]["residues"] = pd.DataFrame(pockets_res[p], dtype=str)

    allositepro_results[pdb] = pockets

len(allositepro_results), allositepro_results

Missing: 8r4b
Missing: 8vsd
Missing: 9dnm


{'7gqu': {'pocket0': {'Volume': 425.497,
   'SASA': 211.983,
   'Druggability Score': 0.565,
   'logitProb': 0.564,
   'nmaScore': 0.994,
   'hitScore': 0.65,
   'residues':    auth_asym_id auth_seq_id
   0             A         703
   1             A         849
   2             A         846
   3             A         704
   4             A         845
   5             A         913
   6             A         891
   7             A         916
   8             A         917
   9             A         850
   10            A         728
   11            A         727
   12            A         706
   13            A         895
   14            A         898
   15            A         572
   16            A         570
   17            A         571}},
 '7x6v': {'pocket0': {'Volume': 2655.098,
   'SASA': 1562.978,
   'Druggability Score': 0.78,
   'logitProb': 0.64,
   'nmaScore': 0.121,
   'hitScore': 0.537,
   'residues':    auth_asym_id auth_seq_id
   0             A        1611
   

In [None]:
# Persist the parsed results as a pickle in the current working directory.
allositepro_resultsf = "allositepro_results.pkl"

with open(allositepro_resultsf, mode="wb") as handle:
    pickle.dump(allositepro_results, handle)