# Requisites

- Environment with dependencies in `conda_env.yaml`
- External dependencies to be downloaded and placed in `training_data/utils/external`:
    - predict_ddG.py script from PyRosetta (https://github.com/RosettaCommons/PyRosetta.notebooks/blob/master/notebooks/additional_scripts/predict_ddG.py)
    - DSSP software executable (https://github.com/PDB-REDO/dssp/releases/download/v4.4.0/mkdssp-4.4.0-linux-x64)
- Downloaded UniRef database https://wwwuser.gwdguser.de/~compbiol/uniclust/2023_02

In [1]:
# Working directory: created below and used as the default `path` of the helper functions
path = "predict"

# Location of the UniRef database; assigned to HHBlitsF.uniref_path before computing features
uniref_path = "/data/fnerin/UniRef30_2023_02/UniRef30_2023_02"

# Imports

In [2]:
import sys
sys.path.append("training_data")

In [3]:
from utils.new_pdbs import Pdb
from utils.structure_fixing import get_fixed_structure, CifFileWriter
from utils.utils import Cif

In [4]:
import os, tempfile
import pandas as pd
from tqdm.notebook import tqdm

# Folder

In [5]:
# Make sure the working directory exists (no error if it is already there)
os.makedirs(path, exist_ok=True)

# Both structure classes read and write their files in the working directory
Pdb.path = Pdb.original_cifs_path = path
Cif.path = Cif.original_cifs_path = path

# Functions

In [6]:
def get_cif(
    pdb_id,
    path=path
):
    """
    Build the `Pdb` object for an entry and store its mmCIF under `path`.

    Two files are written: `<entry_id>_updated.cif.gz` with the raw bytes in
    `pdb.cif._cif_content`, and `<entry_id>_updated.cif` with `pdb.cif.text`.

    Parameters
    ----------
    pdb_id : str
        Entry identifier; it is lower-cased before being handed to `Pdb`.
    path : str
        Folder where both files are written.

    Returns
    -------
    The `Pdb` object, after its `cif.data` attribute has been accessed once.
    """
    structure = Pdb(pdb_id.lower())

    # Raw (compressed) content first, then the plain-text version
    with open(f"{path}/{structure.entry_id}_updated.cif.gz", "wb") as gz_out:
        gz_out.write(structure.cif._cif_content)
    with open(f"{path}/{structure.entry_id}_updated.cif", "w") as text_out:
        text_out.write(structure.cif.text)

    # Touch the parsed data so it is cached on the object before returning
    structure.cif.data
    return structure

In [7]:
import pymol2

In [8]:
def get_site(site, only_protein=True, threshold=6): # site.pdb CAN BE PDB OR ASSEMBLY (must have .cif and .residues)
    """
    Function to, given a site, return a standardized table of residues from the parent structure that define the site with the Python interface of open-source PyMOL

    Parameters
    ----------
    site
        Object with `.modulator_residues` (table with the label_asym_id, auth_asym_id, auth_comp_id,
        auth_seq_id and pdbx_PDB_ins_code columns) and `.pdb` (with `.cif.text`, `.residues` and,
        when `only_protein` is set, `._protein_entities`).
    only_protein
        If true, keep only residues whose label_entity_id is in `site.pdb._protein_entities` and
        whose label_asym_id is not one of the modulator's.
    threshold
        Distance cutoff given to PyMOL's `within` operator around the modulator selection.

    Returns
    -------
    Rows of `site.pdb.residues` (model '1' only) that match the residues selected by PyMOL.

    Raises
    ------
    AssertionError
        If the final selection has no residues.
    """
    # Define the PyMOL-style selection of the modulator residues
    sele = " or ".join(
        f"{res['label_asym_id']}/{res['auth_asym_id']}/{res['auth_comp_id']}`{res['auth_seq_id']}{res['pdbx_PDB_ins_code'].replace('?', '')}/*"
        for _, res in site.modulator_residues.iterrows()
    )

    with pymol2.PyMOL() as pymol:
        pymol.cmd.feedback(
            "disable", "executive", "details"
        )  # to silence "ExecutiveLoad-Detail: Detected mmCIF"

        # Load the parent structure of the site to PyMOL (it can only read a "real" file and not from string)
        with tempfile.NamedTemporaryFile("w+", suffix=".cif") as f:
            f.write(site.pdb.cif.text)
            # PyMOL opens the file by name: flush Python's write buffer first,
            # otherwise the end of the structure may not be on disk yet
            f.flush()
            pymol.cmd.load(f.name)

        # Retrieve all atoms within the threshold of the modulator selection ("br." expands to whole residues)
        site_atoms = pymol.cmd.get_model(f"br. all within {threshold} of {sele}")

    # Collapse the atom selection into unique residue identifiers
    site_ids = {
        (
            a.segi, a.chain, a.resn,
            a.resi_number, a.ins_code or '?' # pdbx_PDB_ins_code or "?" if none
        )
        for a in site_atoms.atom
    }

    # Transform the PyMOL-derived residue identifiers into a standard table of residues that can be used to retrieve the rows/residues from the parent structure's .residues table
    site_res = site.pdb.residues.merge(
        pd.DataFrame(
            list(site_ids),  # explicit list: sets are unordered and not accepted by every pandas constructor
            columns=[
                "label_asym_id", "auth_asym_id", "auth_comp_id",
                "auth_seq_id", "pdbx_PDB_ins_code"
            ],
            dtype=str
        )
    ).query("pdbx_PDB_model_num == '1'")

    if only_protein:
        # Keep protein entities only and leave out the chain(s) of the modulator itself
        site_res = site_res.query(f"label_entity_id in {site.pdb._protein_entities} and label_asym_id not in {site.modulator_residues.label_asym_id.unique().tolist()}")

    assert len(site_res) > 0, "Site selection doesn't have any residues"

    return site_res

In [9]:
class Site:
    """
    Set of residues of a structure, defined either by a modulator or by an explicit residue list.

    Parameters
    ----------
    pdb
        Parent structure; must expose `.residues` (and `._protein_entities` when `only_protein` is set).
    modulator_residues
        Table with the residues of the modulator. When given, the site residues are computed
        with `get_site` using `distance_threshold`, and the table is kept as `.modulator_residues`.
    residues
        Alternative definition: records (e.g. a list of dicts) that are merged against
        `pdb.residues` on their shared columns. Only used when `modulator_residues` is None.
    only_protein
        Keep only residues whose label_entity_id is in `pdb._protein_entities`.
    distance_threshold
        Threshold forwarded to `get_site` (modulator-based definition only).

    Raises
    ------
    ValueError
        If neither `modulator_residues` nor `residues` is passed.
    """
    def __init__(self, pdb, modulator_residues=None, residues=None, only_protein=True, distance_threshold=6):
        self.pdb = pdb
        if modulator_residues is not None:
            self.modulator_residues = modulator_residues
            self.residues = get_site(self, only_protein=only_protein, threshold=distance_threshold)
        elif residues is not None:
            # Values are cast to str so they compare equal to the string columns of pdb.residues
            self.residues = pdb.residues.merge(pd.DataFrame(residues, dtype=str)).query("pdbx_PDB_model_num == '1'")
            if only_protein:
                # Use this instance's structure (previously read from an unrelated global named `site`)
                self.residues = self.residues.query(f"label_entity_id in {self.pdb._protein_entities}")
        else:
            raise ValueError("Pass one of 'modulator_residues' or 'residues'")

In [10]:
def get_clean_pdb(pdb, protein_chains, path=path):
    """
    Fix the structure for the selected chains, write it as `<path>/<entry_id>.cif` and reload it.

    Parameters
    ----------
    pdb
        Structure object; passed twice to `get_fixed_structure` and used for its
        `entry_id` and the `_entity_poly` category of `pdb.cif.data`.
    protein_chains
        Iterable of chain identifiers, converted to a list for `get_fixed_structure`.
    path
        Output folder.

    Returns
    -------
    A `Cif` built from `pdb.entry_id`, after accessing `origcif.data` and `cif.data` once.
    """
    chain_ids = list(protein_chains)
    fixed_structure = get_fixed_structure(pdb, pdb, chain_ids, path, save=True)

    with open(f"{path}/{pdb.entry_id}.cif", "w+") as out:
        cif_writer = CifFileWriter(out.name, compress=False)
        # Only the atoms and the polymer description go into the cleaned file
        categories = {
            "_atom_site": fixed_structure.to_dict(orient="list"),
            "_entity_poly": pdb.cif.data["_entity_poly"]
        }
        cif_writer.write({pdb.entry_id.upper(): categories})

    cleaned = Cif(pdb.entry_id)
    # Access both parsed files once so their contents are cached
    cleaned.origcif.data
    cleaned.cif.data
    return cleaned

In [11]:
from ipymolstar import PDBeMolstar

In [12]:
def view_pdb(pdb, **kwargs):
    """
    Return a PDBeMolstar viewer showing the mmCIF text of `pdb`.

    `pdb` only needs a `.cif.text` attribute. Any extra keyword argument is
    forwarded unchanged to `PDBeMolstar`.
    """
    # The structure is handed over as in-memory text rather than as a file/URL
    structure_payload = {
        'data': pdb.cif.text,
        'format': 'cif',
        'binary': False,
    }
    return PDBeMolstar(
        custom_data=structure_payload,
        sequence_panel=True,
        assembly_id='',
        **kwargs
    )

In [13]:
# Named colors accepted by view_pockets; stored as lower-case hex strings
colors = {
    color_name: hex_code.lower()
    for color_name, hex_code in (
        ("orange", "#0FD55E00"),
        ("green", "#0F009E73"),
        ("blue", "#0F0072B2"),
    )
}

def get_pocket(pdb, pocket, path=path):
    """
    Return the atoms of one pocket from the `<pdb>_out.cif` file under `<path>/<pdb>/<pdb>_out/`.

    Parameters
    ----------
    pdb : str
        Entry identifier, used to build the file path and the `Cif` name.
    pocket : str
        Pocket name such as "pocket3"; the text "pocket" is stripped to get the number.
    path : str
        Folder that contains the `<pdb>/<pdb>_out/` output folder.

    Returns
    -------
    The rows of the `.atoms` table with label_comp_id 'STP' and label_seq_id equal to the
    pocket number, relabelled as chain 'ZZZ' and entity '99'.
    """
    pocketn = pocket.replace('pocket', '')
    return (
        Cif(pdb, f"{path}/{pdb}/{pdb}_out/{pdb}_out.cif", name=f"{pdb}_out")
        .atoms
        .query(f"label_comp_id == 'STP' and label_seq_id == '{pocketn}'")
        # assign() returns a new frame, so the filtered slice is never modified in place
        # (avoids pandas' chained-assignment warning)
        .assign(label_asym_id='ZZZ', label_entity_id='99')
    )
    

def view_pockets(
    pdb, 
    pockets:dict, # {"pocketn": {"color": ""}}
    site_residues=None,
    modulator_residues=None,
    path=path
):
    """
    Show a structure together with selected pockets in a PDBeMolstar viewer.

    The atoms of `<path>/<entry_id>_updated.cif` are combined with the pocket atoms returned by
    `get_pocket` into a temporary CIF, which is then displayed with `view_pdb`.

    Parameters
    ----------
    pdb
        Structure object; only `.residues.label_asym_id` and `.entry_id` are read.
    pockets
        Mapping of pocket name (e.g. "pocket1") to a dict with a "color" entry. The color may be a
        key of the module-level `colors` dict or any other value, which is then used as given.
    site_residues
        Optional table of residues (label_asym_id, label_seq_id) to color green.
    modulator_residues
        Optional table of modulator residues. When given, the residues of non-polymer entities
        that are NOT in this table are colored white.
    path
        Folder with the `_updated.cif` file and the pockets output.

    Returns
    -------
    The viewer returned by `view_pdb`.
    """
    # Chains of the input structure, read before `pdb` is rebound to the entry id
    chains = pdb.residues.label_asym_id.unique().tolist()
    pdb = pdb.entry_id
    cif = Cif(pdb, f"{path}/{pdb}_updated.cif")

    # Replace each pocket spec by its atoms and its resolved color (named color or raw value)
    pockets = {
        pocketn: {
            "atoms": get_pocket(pdb, pocketn, path=path),
            "color": colors.get(pocket["color"], pocket["color"])
        }
        for pocketn, pocket in pockets.items()
    }

    # Fake entity data
    entities = pd.concat((
        pd.DataFrame(cif.cif.data["_entity"], dtype=str),#.query(f"id in {minimal_elements('label_entity_id')}"),
        pd.DataFrame([{"id": "99", "type": "branched", "pdbx_description": "pockets"}]) # Fake the pockets as carbohydrates to manage their representation
    )).fillna(".")

    # Keep only the columns shared by all pocket tables
    # NOTE(review): with an empty `pockets` dict, set.intersection() is called without arguments and raises TypeError
    columns = list( set.intersection( *map(set, (pocket["atoms"].columns for pocket in pockets.values())) ) )
    atoms = pd.concat((
        cif.atoms[columns],
        *(pocket["atoms"][columns] for pocket in pockets.values())
    ))

    # Write the combined structure to a temporary file and load it back as a Cif
    with tempfile.NamedTemporaryFile("w+", suffix=".cif") as f:
        writer = CifFileWriter(f.name)
        writer.write({cif.entry_id.upper(): {
            "_entity": entities.to_dict(orient="list"),
            "_atom_site": atoms.to_dict(orient="list"),
        }})
        combined = Cif(pdb, filename=f.name)
        combined.cif.data # to cache it while 'f' exists

    # Coloring/representation instructions, one dict per selection
    data = [
        # Protein
        {"struct_asym_id": asym_id, 'representation': 'cartoon', 'representationColor': '#AEAEAE', 'focus': True}
        for asym_id in chains
    ]

    if site_residues is not None:
        data += [
            {'struct_asym_id': r["label_asym_id"], 'residue_number': int(r["label_seq_id"]), 'representationColor': colors["green"]}
            for i, r in site_residues.iterrows()
        ]

    # Ligands and molecules
    # NOTE(review): inside this `if`, the `else` of the conditional expression below can never be
    # taken; the inline comment suggests other ligands were meant to be shown even without a
    # modulator - confirm the intended behavior
    if modulator_residues is not None:
        data += [
            {'struct_asym_id': r["label_asym_id"], 'color': 'white'}
            for i, r in (
                combined.residues
                # Not modulator residues and only small molecule entities
                .merge(
                    modulator_residues if modulator_residues is not None else pd.DataFrame(columns=combined.residues.columns), # if modulator_residues not passed, empty df
                    how="outer", indicator=True
                )
                .query(f"""_merge == 'left_only' and label_entity_id in {entities.query("type == 'non-polymer'").id.unique().tolist()}""")
                .drop(columns="_merge")
                .iterrows()

            )
        ]

    # Pockets
    # Each pocket gets two entries: its points and a gaussian volume, both in the pocket color
    data += [
        {
            "struct_asym_id": "ZZZ", 'residue_number': int(pocketn.replace('pocket', '')), 'representation': 'point', 'representationColor': pocket["color"]
        }
        for pocketn, pocket in pockets.items()
    ]

    data += [
        {
            "struct_asym_id": "ZZZ", 'residue_number': int(pocketn.replace('pocket', '')), 'representation': 'gaussian-volume', 'representationColor': pocket["color"]
        }
        for pocketn, pocket in pockets.items()
    ]

    return view_pdb(
        combined,

        hide_polymer = True,
        # hide_heteroatoms = True,
        # hide_non_standard = True,
        hide_carbs = True,
        hide_water = True,

        color_data = {
            "data": data,
            "nonSelectedColor": None,
            "keepColors": True,
            "keepRepresentations": False,
        }
    )

# Predict

- Load or download .cif
    - If it is pdb, convert to cif with pymol. It will be working with auth_asym_id! Or fill a fake label_asym_id?
    - We need the sequence, if it is not in records get it from pymol
    - DSSP looks for self._cif._extended_temp_ciff({"_pdbx_poly_seq_scheme": extra})
- Be able to select chains, specify modulator and/or site
- Fix/standardize the structure
- Run feats, run pocket
- Extra: find closest clust rep and if it's in train or test

## Get protein structure

In [14]:
# Build the Pdb object for entry 4or2; get_cif also writes its mmCIF files under `path`
pdb = get_cif(
    pdb_id="4or2"
)

In [15]:
view_pdb(pdb)

PDBeMolstar(bg_color='#F7F7F7', custom_data={'data': "data_4OR2\n#\n_entry.id 4OR2\n#\n_citation.id primary\n_…

In [16]:
pdb.atoms

Unnamed: 0,group_PDB,id,type_symbol,label_atom_id,label_alt_id,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,...,auth_seq_id,auth_comp_id,auth_asym_id,auth_atom_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
0,ATOM,1,N,N,.,ASP,A,1,5,?,...,1002,ASP,A,N,1,5,UNP,P0ABE7,24,D
1,ATOM,2,C,CA,.,ASP,A,1,5,?,...,1002,ASP,A,CA,1,5,UNP,P0ABE7,24,D
2,ATOM,3,C,C,.,ASP,A,1,5,?,...,1002,ASP,A,C,1,5,UNP,P0ABE7,24,D
3,ATOM,4,O,O,.,ASP,A,1,5,?,...,1002,ASP,A,O,1,5,UNP,P0ABE7,24,D
4,ATOM,5,C,CB,.,ASP,A,1,5,?,...,1002,ASP,A,CB,1,5,UNP,P0ABE7,24,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5901,HETATM,5902,O,O,.,HOH,R,7,.,?,...,2003,HOH,B,O,1,2003,?,?,?,?
5902,HETATM,5903,O,O,.,HOH,R,7,.,?,...,2004,HOH,B,O,1,2004,?,?,?,?
5903,HETATM,5904,O,O,.,HOH,R,7,.,?,...,2005,HOH,B,O,1,2005,?,?,?,?
5904,HETATM,5905,O,O,.,HOH,R,7,.,?,...,2006,HOH,B,O,1,2006,?,?,?,?


In [17]:
pdb.residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
0,ASP,A,1,5,?,1002,ASP,A,1,5,UNP,P0ABE7,24,D
8,LEU,A,1,6,?,1003,LEU,A,1,6,UNP,P0ABE7,25,L
13,GLU,A,1,7,?,1004,GLU,A,1,7,UNP,P0ABE7,26,E
22,ASP,A,1,8,?,1005,ASP,A,1,8,UNP,P0ABE7,27,D
30,ASN,A,1,9,?,1006,ASN,A,1,9,UNP,P0ABE7,28,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5901,HOH,R,7,.,?,2003,HOH,B,1,2003,?,?,?,?
5902,HOH,R,7,.,?,2004,HOH,B,1,2004,?,?,?,?
5903,HOH,R,7,.,?,2005,HOH,B,1,2005,?,?,?,?
5904,HOH,R,7,.,?,2006,HOH,B,1,2006,?,?,?,?


## Optional: set a target site

### With a modulator molecule

In [18]:
# Desired modulator is label_asym_id 'C'
pdb.residues.query("label_asym_id == 'C'")

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
5603,FM9,C,2,.,?,1901,FM9,A,1,1901,?,?,?,?


In [19]:
# Define the site from the modulator: the Site constructor calls get_site with the default
# distance threshold (6) around the residues of label_asym_id 'C'
site = Site(
    pdb, 
    modulator_residues=pdb.residues.query("label_asym_id == 'C'"), 
    only_protein=True
)
site

<__main__.Site at 0x7c0eb14bff10>

In [20]:
site.modulator_residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
5603,FM9,C,2,.,?,1901,FM9,A,1,1901,?,?,?,?


In [21]:
site.residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
0,LEU,A,1,177,?,648,LEU,A,1,177,UNP,Q13255,648,L
1,GLN,A,1,189,?,660,GLN,A,1,189,UNP,Q13255,660,Q
2,ARG,A,1,190,?,661,ARG,A,1,190,UNP,Q13255,661,R
3,VAL,A,1,193,?,664,VAL,A,1,193,UNP,Q13255,664,V
4,GLY,A,1,194,?,665,GLY,A,1,194,UNP,Q13255,665,G
5,SER,A,1,197,?,668,SER,A,1,197,UNP,Q13255,668,S
6,TYR,A,1,201,?,672,TYR,A,1,201,UNP,Q13255,672,Y
7,GLN,A,1,246,?,717,GLN,A,1,246,UNP,Q13255,717,Q
8,CYS,A,1,275,?,746,CYS,A,1,275,UNP,Q13255,746,C
9,THR,A,1,277,?,748,THR,A,1,277,UNP,Q13255,748,T


In [22]:
# List of residue numbers of site
resnums = site.residues.label_seq_id.to_list()
resnums

['177',
 '189',
 '190',
 '193',
 '194',
 '197',
 '201',
 '246',
 '275',
 '277',
 '281',
 '282',
 '285',
 '286',
 '287',
 '289',
 '290',
 '293',
 '322',
 '323',
 '324',
 '326',
 '327',
 '328',
 '330',
 '334',
 '340',
 '341',
 '344',
 '347',
 '348',
 '351']

### With a list of residues

In [23]:
# Site can be defined with a list of residues instead of a modulator
# Each dict identifies one residue; Site merges them against pdb.residues on the shared columns
res_site = Site(
    pdb=pdb,
    residues=[{"label_asym_id": "A", "label_seq_id": seqnum} for seqnum in resnums],
    only_protein=True
)
res_site.residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
0,LEU,A,1,177,?,648,LEU,A,1,177,UNP,Q13255,648,L
1,GLN,A,1,189,?,660,GLN,A,1,189,UNP,Q13255,660,Q
2,ARG,A,1,190,?,661,ARG,A,1,190,UNP,Q13255,661,R
3,VAL,A,1,193,?,664,VAL,A,1,193,UNP,Q13255,664,V
4,GLY,A,1,194,?,665,GLY,A,1,194,UNP,Q13255,665,G
5,SER,A,1,197,?,668,SER,A,1,197,UNP,Q13255,668,S
6,TYR,A,1,201,?,672,TYR,A,1,201,UNP,Q13255,672,Y
7,GLN,A,1,246,?,717,GLN,A,1,246,UNP,Q13255,717,Q
8,CYS,A,1,275,?,746,CYS,A,1,275,UNP,Q13255,746,C
9,THR,A,1,277,?,748,THR,A,1,277,UNP,Q13255,748,T


## Process protein structure

In [24]:
# Fix the structure keeping chain A; get_clean_pdb writes it to `{path}/{entry_id}.cif` and reloads it as a Cif
clean_pdb = get_clean_pdb(
    pdb,
    protein_chains=["A"]
)
clean_pdb.residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
0,ASP,A,1,5,?,1002,ASP,A,1,5,UNP,P0ABE7,24,D
8,LEU,A,1,6,?,1003,LEU,A,1,6,UNP,P0ABE7,25,L
16,GLU,A,1,7,?,1004,GLU,A,1,7,UNP,P0ABE7,26,E
25,ASP,A,1,8,?,1005,ASP,A,1,8,UNP,P0ABE7,27,D
33,ASN,A,1,9,?,1006,ASN,A,1,9,UNP,P0ABE7,28,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2779,ILE,A,1,368,?,839,ILE,A,1,368,UNP,Q13255,839,I
2787,ALA,A,1,369,?,840,ALA,A,1,369,UNP,Q13255,840,A
2792,LYS,A,1,370,?,841,LYS,A,1,370,UNP,Q13255,841,K
2801,PRO,A,1,371,?,842,PRO,A,1,371,UNP,Q13255,842,P


## Pockets

In [25]:
from utils.pocket_utils import Pocket, get_pockets_info

In [26]:
# os.system("ln -s /home/fnerin/Desktop/allodb_new/training_data/utils/external/fpocket/bin/fpocket fpocket")

In [27]:
import shutil
import subprocess

# Run fpocket only when its output folder does not exist yet
if not os.path.isdir(f"{path}/{clean_pdb.entry_id}/{clean_pdb.entry_id}_out"):
    os.makedirs(f"{path}/{clean_pdb.entry_id}", exist_ok=True)
    # Copy the cleaned structure into its own folder; fpocket is then run on that copy
    shutil.copy(clean_pdb.filename, f"{path}/{clean_pdb.entry_id}/")
    # Argument list instead of a shell string (safe with spaces/special characters in paths);
    # check=True raises CalledProcessError instead of silently continuing after a failure
    subprocess.run(
        [
            "fpocket", "-m", "3", "-M", "6", "-i", "35",
            "--file", f"{path}/{clean_pdb.entry_id}/{clean_pdb.entry_id}.cif",
        ],
        check=True,
    )

***** POCKET HUNTING BEGINS ***** 
***** POCKET HUNTING ENDS ***** 


In [28]:
# One row per pocket: the name is the part of each .cif file name before the first "_"
pockets = pd.DataFrame({
    "pocket": [
        file_name.split("_")[0]
        for file_name in os.listdir(f"{path}/{clean_pdb.entry_id}/{clean_pdb.entry_id}_out/pockets")
        if file_name.endswith(".cif")
    ]
})

pockets

Unnamed: 0,pocket
0,pocket11
1,pocket7
2,pocket1
3,pocket2
4,pocket4
5,pocket9
6,pocket10
7,pocket5
8,pocket8
9,pocket3


## Optional: labelled pocket

In [29]:
# Number of residues forming the pocket, and pocket-site overlap scores
## site_in_pocket: share of the residues of the site that are part of the pocket (taken from the site of the PDB that gives the maximum value, comparing all sites of the PDB with the pocket)
## pocket_in_site: share of the residues of the pocket that are part of the allosteric site (taken from the site of the PDB that gives the maximum value)

pockets = pd.DataFrame((
    pocket
    for pocket in get_pockets_info(
        clean_pdb, 
        sites = ({
            # "mod": site.modulator_residues,
            "site": site.residues
        },),
        pockets_path = f"{path}"
    )
))

# Display the pockets ordered by the larger of their two overlap scores, highest first
pockets.loc[
    pockets[['site_in_pocket', 'pocket_in_site']].max(axis=1).sort_values(ascending=False).index
]

Unnamed: 0,pdb,pocket,nres,site_in_pocket,pocket_in_site
2,4or2,pocket1,28,0.75,0.857143
8,4or2,pocket8,8,0.1875,0.75
6,4or2,pocket10,10,0.09375,0.3
10,4or2,pocket12,30,0.25,0.266667
3,4or2,pocket2,19,0.0,0.0
1,4or2,pocket7,13,0.0,0.0
0,4or2,pocket11,10,0.0,0.0
4,4or2,pocket4,25,0.0,0.0
7,4or2,pocket5,22,0.0,0.0
5,4or2,pocket9,27,0.0,0.0


In [30]:
# Pocket to inspect in the following cells
pocket_id = "pocket1"

# Load that pocket from its "_atm.cif" file in the pockets output folder
pocket = Pocket(f"{path}/{clean_pdb.entry_id}/{clean_pdb.entry_id}_out/pockets/{pocket_id}_atm.cif")
pocket

<utils.pocket_utils.Pocket at 0x7c0e9e0b4210>

In [31]:
pocket.residues.sort_values("label_seq_id")

Unnamed: 0,label_comp_id,label_asym_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_asym_id
52,LEU,A,177,?,648,A
16,GLN,A,189,?,660,A
0,ARG,A,190,?,661,A
17,VAL,A,193,?,664,A
55,GLY,A,194,?,665,A
27,SER,A,197,?,668,A
58,TYR,A,201,?,672,A
85,GLN,A,246,?,717,A
10,LEU,A,249,?,720,A
1,VAL,A,250,?,721,A


In [32]:
site.residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
0,LEU,A,1,177,?,648,LEU,A,1,177,UNP,Q13255,648,L
1,GLN,A,1,189,?,660,GLN,A,1,189,UNP,Q13255,660,Q
2,ARG,A,1,190,?,661,ARG,A,1,190,UNP,Q13255,661,R
3,VAL,A,1,193,?,664,VAL,A,1,193,UNP,Q13255,664,V
4,GLY,A,1,194,?,665,GLY,A,1,194,UNP,Q13255,665,G
5,SER,A,1,197,?,668,SER,A,1,197,UNP,Q13255,668,S
6,TYR,A,1,201,?,672,TYR,A,1,201,UNP,Q13255,672,Y
7,GLN,A,1,246,?,717,GLN,A,1,246,UNP,Q13255,717,Q
8,CYS,A,1,275,?,746,CYS,A,1,275,UNP,Q13255,746,C
9,THR,A,1,277,?,748,THR,A,1,277,UNP,Q13255,748,T


In [33]:
pocket.feats

{'Pocket Score': 0.5354,
 'Drug Score': 0.9596,
 'Number of alpha spheres': 176.0,
 'Mean alpha-sphere radius': 3.7032,
 'Mean alpha-sphere Solvent Acc.': 0.5048,
 'Mean B-factor of pocket residues': 0.2235,
 'Hydrophobicity Score': 45.5,
 'Polarity Score': 12.0,
 'Amino Acid based volume Score': 4.1786,
 'Pocket volume (Monte Carlo)': 746.6119,
 'Pocket volume (convex hull)': 409.8742,
 'Charge Score': 1.0,
 'Local hydrophobic density Score': 65.9439,
 'Number of apolar alpha sphere': 107.0,
 'Proportion of apolar alpha sphere': 0.608,
 'Total SASA': 154.66,
 'Polar SASA': 38.727,
 'Apolar SASA': 115.932,
 'Proportion of polar atoms': 36.047,
 'Alpha sphere density': 7.006,
 'Cent. of mass - Alpha Sphere max dist': 17.099,
 'Flexibility': 0.223}

### Pocket features

In [34]:
# Append, column-wise, the `.feats` dict of every pocket (one Series per row) to the pockets table
pockets_features = pd.concat(
    (
        pockets,
        pockets.apply(
            lambda row: pd.Series(
                Pocket(f"{path}/{clean_pdb.entry_id}/{clean_pdb.entry_id}_out/pockets/{row['pocket']}_atm.cif").feats
            ), axis=1
        )
    ),
    axis=1
)

pockets_features

Unnamed: 0,pdb,pocket,nres,site_in_pocket,pocket_in_site,Pocket Score,Drug Score,Number of alpha spheres,Mean alpha-sphere radius,Mean alpha-sphere Solvent Acc.,...,Local hydrophobic density Score,Number of apolar alpha sphere,Proportion of apolar alpha sphere,Total SASA,Polar SASA,Apolar SASA,Proportion of polar atoms,Alpha sphere density,Cent. of mass - Alpha Sphere max dist,Flexibility
0,4or2,pocket11,10,0.0,0.0,-0.0533,0.1476,57.0,3.8019,0.5178,...,37.6522,46.0,0.807,140.024,31.338,108.687,32.353,4.356,12.024,0.425
1,4or2,pocket7,13,0.0,0.0,0.1373,0.1119,62.0,3.5579,0.5686,...,31.7273,44.0,0.7097,108.396,11.786,96.61,31.579,4.934,11.649,0.22
2,4or2,pocket1,28,0.75,0.857143,0.5354,0.9596,176.0,3.7032,0.5048,...,65.9439,107.0,0.608,154.66,38.727,115.932,36.047,7.006,17.099,0.223
3,4or2,pocket2,19,0.0,0.0,0.5091,0.7153,92.0,3.4119,0.4119,...,39.0294,68.0,0.7391,68.9,19.308,49.592,29.091,6.157,16.407,0.322
4,4or2,pocket4,25,0.0,0.0,0.2184,0.2519,126.0,3.5027,0.4786,...,48.1136,88.0,0.6984,253.416,84.348,169.068,31.25,7.736,20.583,0.434
5,4or2,pocket9,27,0.0,0.0,-0.0041,0.006,155.0,3.6452,0.4664,...,22.6923,52.0,0.3355,410.017,151.585,258.432,36.471,9.895,24.056,0.326
6,4or2,pocket10,10,0.09375,0.3,-0.0422,0.093,45.0,3.9094,0.643,...,30.1875,32.0,0.7111,109.349,4.286,105.064,32.258,3.571,10.906,0.198
7,4or2,pocket5,22,0.0,0.0,0.201,0.0353,94.0,3.5219,0.4737,...,17.3939,33.0,0.3511,193.05,101.27,91.78,47.368,7.23,20.426,0.373
8,4or2,pocket8,8,0.1875,0.75,0.0493,0.1594,51.0,3.6869,0.5619,...,44.5217,46.0,0.902,85.278,5.357,79.921,16.129,3.369,7.951,0.254
9,4or2,pocket3,18,0.0,0.0,0.3147,0.0476,105.0,3.4779,0.4851,...,31.6066,61.0,0.581,185.133,77.654,107.479,33.824,7.109,16.704,0.674


## Features

In [35]:
from utils.features_classes import * # Each FClass

# Path to the mkdssp executable downloaded from https://github.com/PDB-REDO/dssp/releases/tag/v4.4.0
BiopythonF.dssp_path = "training_data/utils/external/mkdssp-4.4.0-linux-x64" 
# Set the file mode to 0o755 so the downloaded binary can be executed
os.chmod(BiopythonF.dssp_path, 0o755)
# Path to the UniRef (or other protein sequences database) downloaded from https://wwwuser.gwdguser.de/~compbiol/uniclust/2023_02
HHBlitsF.uniref_path = uniref_path



┌──────────────────────────────────────────────────────────────────────────────┐
│                                 PyRosetta-4                                  │
│              Created in JHU by Sergey Lyskov and PyRosetta Team              │
│              (C) Copyright Rosetta Commons Member Institutions               │
│                                                                              │
│ NOTE: USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE │
│         See LICENSE.PyRosetta.md or email license@uw.edu for details         │
└──────────────────────────────────────────────────────────────────────────────┘
PyRosetta-4 2025 [Rosetta PyRosetta4.conda.ubuntu.cxx11thread.serialization.Ubuntu.python311.Release 2025.17+release.356248d2035a0749e09a4a79479678a8f54e7220 2025-04-17T21:51:51] retrieved from: http://www.pyrosetta.org
core.init: Checking for fconfig files in pwd and ./rosetta/flags
core.init: Rosetta version: PyRosetta4.conda.ubuntu.cxx11thread.ser

In [36]:
from utils.features_utils import calculate_features, get_pdb_features

In [37]:
FClasses

[utils.features_classes.GrapheinF,
 utils.features_classes.FreeSASAF,
 utils.features_classes.DSSPF,
 utils.features_classes.MelodiaF,
 utils.features_classes.BiopythonF,
 utils.features_classes.PyRosettaF,
 utils.features_classes.ProDyF,
 utils.features_classes.TransferEntropyF,
 utils.features_classes.HHBlitsF]

In [38]:
# Compute the features of every class in FClasses, one .pkl file per class under
# `{path}/features/{entry_id}/`; classes whose file already exists are skipped
os.makedirs(f"{path}/features/{clean_pdb.entry_id}", exist_ok=True)

progressbar = tqdm(FClasses)
for fc in progressbar:
    # Class names end in "F" (e.g. DSSPF); the last character is dropped for display
    progressbar.set_description(f"Calculating {fc.__name__[:-1]}")
    file = f"{path}/features/{clean_pdb.entry_id}/{fc.__name__}.pkl"
    if not os.path.isfile(file):
        calculated = calculate_features(clean_pdb.entry_id, fc, file, path, path)
        # Stop at the first class whose calculation reports a falsy result
        assert calculated, f"Feature calculation failed: {fc}"

  0%|          | 0/9 [00:00<?, ?it/s]

core.chemical.GlobalResidueTypeSet: Finished initializing fa_standard residue type set.  Created 985 residue types
core.chemical.GlobalResidueTypeSet: Total time to initialize 0.347299 seconds.
core.import_pose.import_pose: File 'predict/4or2.cif' automatically determined to be of type mmCIF from contents.
core.conformation.Conformation: Found disulfide between residues 182 263


2025-05-16 15:12:58,107 - .prody - DEBUG - 2817 atoms and 1 coordinate set(s) were parsed in 0.02s.
2025-05-16 15:12:58,158 - .prody - DEBUG - Hessian was built in 0.05s.
2025-05-16 15:12:59,893 - .prody - DEBUG - 1074 modes were calculated in 1.73s.
2025-05-16 15:12:59,982 - .prody - INFO - Calculating stiffness matrix.
2025-05-16 15:13:00,181 - .prody - DEBUG - Stiffness matrix calculated in 0.20s.
2025-05-16 15:13:00,182 - .prody - INFO - The range of effective force constant is: 4.607130302470655 to 26.14793112720262.
2025-05-16 15:13:00,192 - .prody - DEBUG - Kirchhoff was built in 0.01s.
2025-05-16 15:13:00,212 - .prody - DEBUG - 10 modes were calculated in 0.02s.
2025-05-16 15:13:00,221 - .prody - DEBUG - Kirchhoff was built in 0.01s.
2025-05-16 15:13:00,226 - .prody - DEBUG - 10 modes were calculated in 0.00s.
2025-05-16 15:13:00,234 - .prody - DEBUG - Kirchhoff was built in 0.01s.
2025-05-16 15:13:00,239 - .prody - DEBUG - 10 modes were calculated in 0.00s.
2025-05-16 15:13:00

In [39]:
test = DSSPF(clean_pdb)
test

<utils.features_classes.DSSPF at 0x7c0e5619e890>

In [40]:
test.dssp()

Unnamed: 0,auth_asym_id,auth_seq_id,pdbx_PDB_ins_code,secondary structure,relative ASA,phi,psi,NH_O_1_relidx,NH_O_1_energy,O_NH_1_relidx,O_NH_1_energy,NH_O_2_relidx,NH_O_2_energy,O_NH_2_relidx,O_NH_2_energy
0,A,1002,?,-,0.822086,360.0,132.1,0,0.0,4,-1.3,0,0.0,3,-0.3
1,A,1003,?,H,0.146341,-70.4,-27.1,1,-0.2,4,-1.3,2,-0.2,5,-0.1
2,A,1004,?,H,0.778351,-69.4,-34.0,1,-0.2,-1,-0.2,2,-0.2,4,-0.2
3,A,1005,?,H,0.466258,-89.6,-23.5,-3,-0.3,4,-1.8,2,-0.2,-2,-0.2
4,A,1006,?,H,0.121019,-69.3,-33.6,-4,-1.3,4,-1.4,1,-0.2,-2,-0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,A,839,?,H,0.437870,-92.6,-0.0,-4,-3.6,-3,-0.2,2,-0.1,-2,-0.1
356,A,840,?,T,0.500000,-135.9,-25.3,-4,-1.1,-3,-0.2,-5,-0.5,-4,-0.1
357,A,841,?,-,0.526829,-135.1,69.3,-5,-1.6,2,-2.3,-6,-0.3,-2,-0.1
358,A,842,?,-,0.779412,-75.4,49.9,0,0.0,-4,-0.1,0,0.0,-5,-0.1


In [41]:
# Gather the per-residue features of the cleaned structure
# The only "site" passed is an empty table with the residues' columns
# (presumably so that no residue is labelled as part of a site - confirm in get_pdb_features)
features = get_pdb_features(
    clean_pdb,
    sites = [pd.DataFrame(columns=clean_pdb.residues.columns),],
    features_path = path
)
features

Unnamed: 0_level_0,Residues,Residues,Residues,Residues,Residues,Residues,Label,Amino acids,Amino acids,Amino acids,...,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits
Unnamed: 0_level_1,label_entity_id,label_asym_id,label_seq_id,auth_asym_id,auth_seq_id,pdbx_PDB_ins_code,label,label_comp_id_A,label_comp_id_C,label_comp_id_D,...,M->M,M->I,M->D,I->M,I->I,D->M,D->D,Neff,Neff_I,Neff_D
0,1,A,5,A,1002,?,0,0,0,1,...,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,10.035,0.000,0.000
1,1,A,6,A,1003,?,0,0,0,0,...,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,10.042,0.000,0.000
2,1,A,7,A,1004,?,0,0,0,0,...,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,10.042,0.000,0.000
3,1,A,8,A,1005,?,0,0,0,1,...,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,10.026,0.000,0.000
4,1,A,9,A,1006,?,0,0,0,0,...,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,10.026,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,1,A,368,A,839,?,0,0,0,0,...,0.986917,0.008747,0.004298,0.927231,0.072594,0.415811,0.584389,11.096,1.134,1.363
356,1,A,369,A,840,?,0,1,0,0,...,0.985549,0.007993,0.006448,0.432269,0.567621,0.239816,0.760489,11.083,1.097,1.285
357,1,A,370,A,841,?,0,0,0,0,...,0.968618,0.016770,0.014854,0.506277,0.493800,0.411510,0.588453,11.085,1.154,1.299
358,1,A,371,A,842,?,0,0,0,0,...,0.979420,0.016793,0.003538,0.435275,0.564874,0.739181,0.260978,11.108,1.261,1.359


In [42]:
from utils.pocket_utils import get_mean_pocket_features

In [43]:
# Regroup the columns under a two-level header: the identification/overlap columns go under
# "Pockets" and all remaining (fpocket) columns under "FPocket"
pockets_features = pd.concat(
    (
        pockets_features[["pdb", "pocket", "nres", "site_in_pocket", "pocket_in_site"]],
        # pockets_features["label"],
        pockets_features.drop(columns=["pdb", "pocket", "nres", "site_in_pocket", "pocket_in_site"]) # "label", 
    ),
    axis=1,
    # keys=["Pockets", "Label", "FPocket"]
    keys=["Pockets", "FPocket"]
)

pockets_features

Unnamed: 0_level_0,Pockets,Pockets,Pockets,Pockets,Pockets,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket
Unnamed: 0_level_1,pdb,pocket,nres,site_in_pocket,pocket_in_site,Pocket Score,Drug Score,Number of alpha spheres,Mean alpha-sphere radius,Mean alpha-sphere Solvent Acc.,...,Local hydrophobic density Score,Number of apolar alpha sphere,Proportion of apolar alpha sphere,Total SASA,Polar SASA,Apolar SASA,Proportion of polar atoms,Alpha sphere density,Cent. of mass - Alpha Sphere max dist,Flexibility
0,4or2,pocket11,10,0.0,0.0,-0.0533,0.1476,57.0,3.8019,0.5178,...,37.6522,46.0,0.807,140.024,31.338,108.687,32.353,4.356,12.024,0.425
1,4or2,pocket7,13,0.0,0.0,0.1373,0.1119,62.0,3.5579,0.5686,...,31.7273,44.0,0.7097,108.396,11.786,96.61,31.579,4.934,11.649,0.22
2,4or2,pocket1,28,0.75,0.857143,0.5354,0.9596,176.0,3.7032,0.5048,...,65.9439,107.0,0.608,154.66,38.727,115.932,36.047,7.006,17.099,0.223
3,4or2,pocket2,19,0.0,0.0,0.5091,0.7153,92.0,3.4119,0.4119,...,39.0294,68.0,0.7391,68.9,19.308,49.592,29.091,6.157,16.407,0.322
4,4or2,pocket4,25,0.0,0.0,0.2184,0.2519,126.0,3.5027,0.4786,...,48.1136,88.0,0.6984,253.416,84.348,169.068,31.25,7.736,20.583,0.434
5,4or2,pocket9,27,0.0,0.0,-0.0041,0.006,155.0,3.6452,0.4664,...,22.6923,52.0,0.3355,410.017,151.585,258.432,36.471,9.895,24.056,0.326
6,4or2,pocket10,10,0.09375,0.3,-0.0422,0.093,45.0,3.9094,0.643,...,30.1875,32.0,0.7111,109.349,4.286,105.064,32.258,3.571,10.906,0.198
7,4or2,pocket5,22,0.0,0.0,0.201,0.0353,94.0,3.5219,0.4737,...,17.3939,33.0,0.3511,193.05,101.27,91.78,47.368,7.23,20.426,0.373
8,4or2,pocket8,8,0.1875,0.75,0.0493,0.1594,51.0,3.6869,0.5619,...,44.5217,46.0,0.902,85.278,5.357,79.921,16.129,3.369,7.951,0.254
9,4or2,pocket3,18,0.0,0.0,0.3147,0.0476,105.0,3.4779,0.4851,...,31.6066,61.0,0.581,185.133,77.654,107.479,33.824,7.109,16.704,0.674


In [44]:
# Append, for every pocket row, the result of get_mean_pocket_features computed from the
# per-residue `features` table
pockets_features = pd.concat(
    (
        pockets_features,
        pockets_features.apply(
            lambda row: get_mean_pocket_features(
                row[("Pockets", "pdb")],
                row[("Pockets", "pocket")],
                pdb_features = features,
                pockets_path = path # # f"{pockets_path}/{pdb}/{pdb}_out/pockets/{pocket}_atm.cif"
            ), 
            axis=1 
        )
    ),
    axis=1
)

pockets_features

Unnamed: 0_level_0,Pockets,Pockets,Pockets,Pockets,Pockets,FPocket,FPocket,FPocket,FPocket,FPocket,...,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits
Unnamed: 0_level_1,pdb,pocket,nres,site_in_pocket,pocket_in_site,Pocket Score,Drug Score,Number of alpha spheres,Mean alpha-sphere radius,Mean alpha-sphere Solvent Acc.,...,M->M,M->I,M->D,I->M,I->I,D->M,D->D,Neff,Neff_I,Neff_D
0,4or2,pocket11,10,0.0,0.0,-0.0533,0.1476,57.0,3.8019,0.5178,...,0.975362,0.012005,0.012689,0.361213,0.638922,0.345077,0.654891,10.9634,1.1742,1.4551
1,4or2,pocket7,13,0.0,0.0,0.1373,0.1119,62.0,3.5579,0.5686,...,0.98352,0.011379,0.005167,0.344427,0.578638,0.182173,0.817806,10.867308,1.205615,1.313538
2,4or2,pocket1,28,0.75,0.857143,0.5354,0.9596,176.0,3.7032,0.5048,...,0.973977,0.020165,0.005782,0.34408,0.655868,0.170294,0.829742,11.105929,1.319107,1.502357
3,4or2,pocket2,19,0.0,0.0,0.5091,0.7153,92.0,3.4119,0.4119,...,0.985968,0.008549,0.005442,0.426407,0.521019,0.202248,0.797734,11.105947,1.114316,1.421684
4,4or2,pocket4,25,0.0,0.0,0.2184,0.2519,126.0,3.5027,0.4786,...,0.961205,0.033332,0.005394,0.192687,0.487311,0.209549,0.630429,10.41436,0.89828,1.18488
5,4or2,pocket9,27,0.0,0.0,-0.0041,0.006,155.0,3.6452,0.4664,...,0.936055,0.052766,0.011122,0.237796,0.762216,0.171077,0.828914,10.713889,1.675556,1.489852
6,4or2,pocket10,10,0.09375,0.3,-0.0422,0.093,45.0,3.9094,0.643,...,0.988391,0.008011,0.003656,0.35658,0.643385,0.300527,0.699503,11.0991,1.1122,1.2339
7,4or2,pocket5,22,0.0,0.0,0.201,0.0353,94.0,3.5219,0.4737,...,0.98028,0.006291,0.013395,0.424238,0.57578,0.186306,0.81366,11.147909,1.0985,2.058273
8,4or2,pocket8,8,0.1875,0.75,0.0493,0.1594,51.0,3.6869,0.5619,...,0.990785,0.004884,0.004312,0.377633,0.62226,0.172774,0.827281,11.116875,1.05925,1.420375
9,4or2,pocket3,18,0.0,0.0,0.3147,0.0476,105.0,3.4779,0.4851,...,0.93765,0.047822,0.014487,0.244772,0.421913,0.284378,0.660057,10.769667,0.938778,1.404167


## Predict

In [45]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [46]:
# Load the saved AutoGluon TabularPredictor from disk
# (the path is relative to the current working directory).
model = TabularPredictor.load("models/pockets_physchem_deploy")

In [47]:
def prepare_data(df):
    """
    Function to, given the pockets feature table, return a flat TabularDataset ready to be passed to the model.

    `df` must have two-level columns with a top-level "Pockets" group that contains
    the "pdb" and "pocket" columns (both holding strings).

    The returned table:
    - is indexed by "<pdb>_<pocket>"
    - does not contain any column of the "Pockets" group
    - has single-level column names of the form "<group>_<feature>"

    The input DataFrame is left untouched (its index is not modified).
    """
    # Row identifiers, e.g. "4or2_pocket1"; "_".join raises TypeError on non-string values
    pocket_ids = df["Pockets"][["pdb", "pocket"]].apply(lambda x: "_".join(x), axis=1)

    # drop() returns a new frame, so re-indexing it does not leak into the caller's DataFrame
    features = df.drop(columns=["Pockets"], level=0)
    features.index = pocket_ids

    # Flatten the (group, feature) column tuples into "group_feature" strings
    features.columns = ["_".join(column) for column in features.columns.values]

    return TabularDataset(features)

In [48]:
# Flatten the multi-level feature table into the TabularDataset passed to the model
data = prepare_data(pockets_features)
data  # display the prepared table

Unnamed: 0,FPocket_Pocket Score,FPocket_Drug Score,FPocket_Number of alpha spheres,FPocket_Mean alpha-sphere radius,FPocket_Mean alpha-sphere Solvent Acc.,FPocket_Mean B-factor of pocket residues,FPocket_Hydrophobicity Score,FPocket_Polarity Score,FPocket_Amino Acid based volume Score,FPocket_Pocket volume (Monte Carlo),...,HHBlits_M->M,HHBlits_M->I,HHBlits_M->D,HHBlits_I->M,HHBlits_I->I,HHBlits_D->M,HHBlits_D->D,HHBlits_Neff,HHBlits_Neff_I,HHBlits_Neff_D
4or2_pocket11,-0.0533,0.1476,57.0,3.8019,0.5178,0.4253,30.4,4.0,4.6,402.9479,...,0.975362,0.012005,0.012689,0.361213,0.638922,0.345077,0.654891,10.9634,1.1742,1.4551
4or2_pocket7,0.1373,0.1119,62.0,3.5579,0.5686,0.2205,62.2308,3.0,4.3077,361.7323,...,0.98352,0.011379,0.005167,0.344427,0.578638,0.182173,0.817806,10.867308,1.205615,1.313538
4or2_pocket1,0.5354,0.9596,176.0,3.7032,0.5048,0.2235,45.5,12.0,4.1786,746.6119,...,0.973977,0.020165,0.005782,0.34408,0.655868,0.170294,0.829742,11.105929,1.319107,1.502357
4or2_pocket2,0.5091,0.7153,92.0,3.4119,0.4119,0.3218,43.8421,7.0,4.4211,395.9802,...,0.985968,0.008549,0.005442,0.426407,0.521019,0.202248,0.797734,11.105947,1.114316,1.421684
4or2_pocket4,0.2184,0.2519,126.0,3.5027,0.4786,0.4338,33.12,10.0,4.4,1024.5051,...,0.961205,0.033332,0.005394,0.192687,0.487311,0.209549,0.630429,10.41436,0.89828,1.18488
4or2_pocket9,-0.0041,0.006,155.0,3.6452,0.4664,0.326,13.5556,16.0,4.5926,1406.9862,...,0.936055,0.052766,0.011122,0.237796,0.762216,0.171077,0.828914,10.713889,1.675556,1.489852
4or2_pocket10,-0.0422,0.093,45.0,3.9094,0.643,0.1979,78.8,1.0,4.4,343.2634,...,0.988391,0.008011,0.003656,0.35658,0.643385,0.300527,0.699503,11.0991,1.1122,1.2339
4or2_pocket5,0.201,0.0353,94.0,3.5219,0.4737,0.3731,27.3636,11.0,3.7727,695.938,...,0.98028,0.006291,0.013395,0.424238,0.57578,0.186306,0.81366,11.147909,1.0985,2.058273
4or2_pocket8,0.0493,0.1594,51.0,3.6869,0.5619,0.254,64.125,1.0,4.125,315.2336,...,0.990785,0.004884,0.004312,0.377633,0.62226,0.172774,0.827281,11.116875,1.05925,1.420375
4or2_pocket3,0.3147,0.0476,105.0,3.4779,0.4851,0.674,36.4444,9.0,4.8333,766.5284,...,0.93765,0.047822,0.014487,0.244772,0.421913,0.284378,0.660057,10.769667,0.938778,1.404167


In [49]:
# Per-class probabilities for every pocket (one column per class label)
class_probabilities = model.predict_proba(data)

# Keep only the column of the class labelled 1 (as a one-column DataFrame)
# and rank the pockets from highest to lowest probability
label_1_probabilities = class_probabilities[[1]]
preds = label_1_probabilities.sort_values(by=1, ascending=False)
preds

Unnamed: 0,1
4or2_pocket1,0.920452
4or2_pocket12,0.43281
4or2_pocket2,0.029576
4or2_pocket4,0.004039
4or2_pocket9,0.003933
4or2_pocket11,0.002201
4or2_pocket5,0.001827
4or2_pocket3,0.000979
4or2_pocket10,0.000667
4or2_pocket6,9e-05


### View

In [50]:
# Pockets to highlight in the viewer and the color of each one
# Format: {"pocketn": {"color": ""}}
highlighted_pockets = {
    "pocket1": {"color": "green"},
    "pocket12": {"color": "blue"},
}

# Build the viewer for the structure, passing the site and modulator residues
# alongside the chosen pockets
v = view_pockets(
    clean_pdb,
    pockets=highlighted_pockets,
    site_residues=site.residues,
    modulator_residues=site.modulator_residues,
)
v

PDBeMolstar(bg_color='#F7F7F7', color_data={'data': [{'struct_asym_id': 'A', 'representation': 'cartoon', 'rep…