# Requisites

- Environment with dependencies in `conda_env.yaml`
- External dependencies to be downloaded and placed in `training_data/utils/external`:
    - predict_ddG.py script from PyRosetta (https://github.com/RosettaCommons/PyRosetta.notebooks/blob/master/notebooks/additional_scripts/predict_ddG.py)
    - DSSP software executable (https://github.com/PDB-REDO/dssp/releases/download/v4.4.0/mkdssp-4.4.0-linux-x64)
- Downloaded UniRef database https://wwwuser.gwdguser.de/~compbiol/uniclust/2023_02

In [1]:
# Working directory: downloaded structures, fpocket output and feature files are written under it
path = "predict"

# Location of the local UniRef database (later assigned to HHBlitsF.uniref_path); adjust to your machine
uniref_path = "/data/fnerin/UniRef30_2023_02/UniRef30_2023_02"

# Imports

In [2]:
import sys
sys.path.append("training_data")

In [3]:
from utils.new_pdbs import Pdb
from utils.structure_fixing import get_fixed_structure, CifFileWriter
from utils.utils import Cif

In [4]:
import os, tempfile
import pandas as pd
from tqdm.notebook import tqdm

# Folder

In [5]:
# Create the working directory if it does not exist yet
os.makedirs(path, exist_ok=True)

# Point the project classes at the working directory through their class-level attributes
# NOTE(review): presumably these are the folders they read/write structure files from - confirm in utils
Pdb.path = path
Pdb.original_cifs_path = path
Cif.path = path
Cif.original_cifs_path = path

# Functions

In [6]:
def get_cif(
    pdb_id,
    path=path
):
    """
    Build a `Pdb` for `pdb_id` (lower-cased) and store its CIF under `path`.

    Two files are written:
    - `<entry_id>_updated.cif.gz`: the raw content held by the entry (`cif._cif_content`)
    - `<entry_id>_updated.cif`: its text form (`cif.text`)

    Returns the `Pdb` object.
    """
    entry = Pdb(pdb_id.lower())
    plain_cif = f"{path}/{entry.entry_id}_updated.cif"

    # Raw content first, exactly as held by the entry...
    with open(f"{plain_cif}.gz", "wb") as raw_file:
        raw_file.write(entry.cif._cif_content)
    # ...then the uncompressed text version
    with open(plain_cif, "w") as text_file:
        text_file.write(entry.cif.text)

    # Touch .data once so the contents of the file are cached on the object
    entry.cif.data
    return entry

In [7]:
import pymol2

In [8]:
def get_site(site, only_protein=True, threshold=6): # site.pdb CAN BE PDB OR ASSEMBLY (must have .cif and .residues)
    """
    Given a site, return a standardized table of the residues of the parent structure that
    define it, computed with the Python interface of open-source PyMOL.

    Parameters
    ----------
    site
        Object exposing `.modulator_residues` (table with label_asym_id, auth_asym_id,
        auth_comp_id, auth_seq_id and pdbx_PDB_ins_code columns) and `.pdb` (with
        `.cif.text`, `.residues` and, when `only_protein` is set, `._protein_entities`).
    only_protein
        If true, keep only residues whose entity is listed in `site.pdb._protein_entities`
        and whose label_asym_id is not one of the modulator's.
    threshold
        Cutoff handed to PyMOL's `within ... of` selection operator.

    Returns
    -------
    Rows of `site.pdb.residues` (model '1' only) that fall inside the selection.

    Raises
    ------
    AssertionError
        If the resulting selection has no residues.
    """
    # Define the PyMOL-style selection of the modulator residues
    sele = " or ".join(
        f"{res['label_asym_id']}/{res['auth_asym_id']}/{res['auth_comp_id']}`{res['auth_seq_id']}{res['pdbx_PDB_ins_code'].replace('?', '')}/*"
        for _, res in site.modulator_residues.iterrows()
    )

    with pymol2.PyMOL() as pymol:
        pymol.cmd.feedback(
            "disable", "executive", "details"
        )  # to silence "ExecutiveLoad-Detail: Detected mmCIF"

        # Load the parent structure of the site to PyMOL (it can only read a "real" file and not from string)
        with tempfile.NamedTemporaryFile("w+", suffix=".cif") as f:
            f.write(site.pdb.cif.text)
            # PyMOL opens the file by name, so anything still sitting in Python's
            # write buffer would be invisible to it: push it to disk first
            f.flush()
            pymol.cmd.load(f.name)

        # Retrieve all atoms within the threshold of the modulator selection ("br." expands to whole residues)
        site_atoms = pymol.cmd.get_model(f"br. all within {threshold} of {sele}")

    # Collapse the atom selection into unique residue identifiers
    site_ids = {
        (
            a.segi, a.chain, a.resn,
            a.resi_number, a.ins_code or '?'  # pdbx_PDB_ins_code or "?" if none
        )
        for a in site_atoms.atom
    }

    # Transform the PyMOL-derived identifiers into a table that can be merged with the
    # parent structure's .residues table (a list is passed: sets are unordered containers)
    site_res = site.pdb.residues.merge(
        pd.DataFrame(
            list(site_ids),
            columns=[
                "label_asym_id", "auth_asym_id", "auth_comp_id",
                "auth_seq_id", "pdbx_PDB_ins_code"
            ],
            dtype=str
        )
    ).query("pdbx_PDB_model_num == '1'")

    if only_protein:
        site_res = site_res.query(f"label_entity_id in {site.pdb._protein_entities} and label_asym_id not in {site.modulator_residues.label_asym_id.unique().tolist()}")

    assert len(site_res) > 0, "Site selection doesn't have any residues"

    return site_res

In [9]:
class Site:
    """
    A site of a structure, defined either by a modulator or by an explicit list of residues.

    Attributes
    ----------
    pdb : the parent structure (must expose `.residues`; `._protein_entities` when `only_protein`)
    modulator_residues : the table passed as `modulator_residues`, or None when the site was
        defined through `residues`
    residues : rows of the parent structure's residue table (model '1') that form the site
    """

    def __init__(self, pdb, modulator_residues=None, residues=None, only_protein=True, distance_threshold=6):
        """
        Parameters
        ----------
        pdb : parent structure
        modulator_residues : residue table of the modulator; the site is then computed by `get_site`
            using `distance_threshold`
        residues : records (e.g. list of dicts) identifying residues, merged against `pdb.residues`
            on their shared columns; only used when `modulator_residues` is None
        only_protein : keep only residues whose entity is in `pdb._protein_entities`
        distance_threshold : cutoff forwarded to `get_site`

        Raises
        ------
        ValueError
            If neither `modulator_residues` nor `residues` is given.
        """
        self.pdb = pdb
        # Always defined, so callers can read the attribute whichever way the site was built
        self.modulator_residues = modulator_residues
        if modulator_residues is not None:
            self.residues = get_site(self, only_protein=only_protein, threshold=distance_threshold)
        elif residues is not None:
            self.residues = pdb.residues.merge(pd.DataFrame(residues, dtype=str)).query("pdbx_PDB_model_num == '1'")
            if only_protein:
                # Use this site's own structure (not a global `site` variable) for the entity filter
                self.residues = self.residues.query(f"label_entity_id in {pdb._protein_entities}")
        else:
            raise ValueError("Pass one of 'modulator_residues' or 'residues'")

In [10]:
def get_clean_pdb(pdb, protein_chains, path=path):
    """
    Fix the structure for the requested chains, write it to `<path>/<entry_id>.cif`
    and return it loaded as a `Cif`.

    The written file holds the fixed `_atom_site` table plus the original `_entity_poly`
    category, under a data block named after the upper-cased entry id.
    """
    fixed = get_fixed_structure(pdb, pdb, list(protein_chains), path, save=True)

    out_path = f"{path}/{pdb.entry_id}.cif"
    with open(out_path, "w+") as handle:
        block = {
            "_atom_site": fixed.to_dict(orient="list"),
            "_entity_poly": pdb.cif.data["_entity_poly"]
        }
        # The writer is given the file name, not the open handle
        CifFileWriter(handle.name, compress=False).write({pdb.entry_id.upper(): block})

    clean = Cif(pdb.entry_id)
    # Touch both .data attributes so the contents of the files are cached
    clean.origcif.data
    clean.cif.data
    return clean

In [11]:
from ipymolstar import PDBeMolstar

In [12]:
def view_pdb(pdb, **kwargs):
    """
    Return a `PDBeMolstar` widget showing the CIF text of `pdb`.

    The structure is passed as non-binary 'cif' custom data, with the sequence panel
    enabled and an empty assembly id; extra keyword arguments go straight to `PDBeMolstar`.
    """
    structure = {
        'data': pdb.cif.text,
        'format': 'cif',
        'binary': False,
    }
    widget = PDBeMolstar(
        custom_data=structure,
        sequence_panel=True,
        assembly_id='',
        **kwargs
    )
    return widget

In [13]:
# Named colours understood by view_pockets (any other value is used as given)
# NOTE(review): values are 8-digit hex strings; the leading "0f" presumably is an
# alpha/opacity byte ahead of the RGB part - confirm against the viewer
colors = {
    "orange": "#0FD55E00".lower(),
    "green": "#0F009E73".lower(),
    "blue": "#0F0072B2".lower()
}

def get_pocket(pdb, pocket, path=path):
    """
    Return the atoms of one pocket from the fpocket output of `pdb`.

    Parameters
    ----------
    pdb : entry id (string); the output is read from `<path>/<pdb>/<pdb>_out/<pdb>_out.cif`
    pocket : pocket name such as "pocket3"; the text "pocket" is removed to get its number
    path : base folder of the fpocket runs

    Returns
    -------
    The rows of the atom table with label_comp_id 'STP' and label_seq_id equal to the pocket
    number, relabelled to chain 'ZZZ' and entity '99' so they can be told apart from the
    structure's own atoms.
    """
    pocketn = pocket.replace('pocket', '')
    pocket_atoms = (
        Cif(pdb, f"{path}/{pdb}/{pdb}_out/{pdb}_out.cif", name=f"{pdb}_out")
        .atoms
        .query(f"label_comp_id == 'STP' and label_seq_id == '{pocketn}'")
        # Explicit copy: the columns below are overwritten, and assigning on a filtered
        # selection is the chained-assignment pattern pandas warns about
        .copy()
    )
    pocket_atoms["label_asym_id"] = 'ZZZ'
    pocket_atoms["label_entity_id"] = '99'

    return pocket_atoms
    

def view_pockets(
    pdb, 
    pockets:dict, # {"pocketn": {"color": ""}}
    site_residues=None,
    modulator_residues=None,
    path=path
):
    """
    Show a structure together with selected fpocket pockets in the viewer returned by `view_pdb`.

    Parameters
    ----------
    pdb : structure object exposing `.residues` and `.entry_id`; its atoms are read from
        `<path>/<entry_id>_updated.cif`
    pockets : mapping of pocket name ("pocketN") to a dict with a "color" entry, either a
        key of `colors` or a value used as given
    site_residues : optional residue table; each residue (label_asym_id, label_seq_id) is
        coloured green
    modulator_residues : optional residue table; when given, residues of 'non-polymer'
        entities that are NOT in this table are coloured white
    path : base folder holding the CIF and the fpocket output

    Returns
    -------
    The widget built by `view_pdb` for the combined structure + pockets CIF.
    """
    chains = pdb.residues.label_asym_id.unique().tolist()
    entry_id = pdb.entry_id
    cif = Cif(entry_id, f"{path}/{entry_id}_updated.cif")

    pockets = {
        pocketn: {
            "atoms": get_pocket(entry_id, pocketn, path=path),
            "color": colors.get(pocket["color"], pocket["color"])
        }
        for pocketn, pocket in pockets.items()
    }

    # Fake entity data
    entities = pd.concat((
        pd.DataFrame(cif.cif.data["_entity"], dtype=str),
        pd.DataFrame([{"id": "99", "type": "branched", "pdbx_description": "pockets"}]) # Fake the pockets as carbohydrates to manage their representation
    )).fillna(".")

    # Keep only the atom columns shared by every pocket table so the tables can be stacked
    if pockets:
        columns = list(set.intersection(*(set(pocket["atoms"].columns) for pocket in pockets.values())))
    else:
        # No pockets requested: nothing to intersect, keep the structure's atoms as they are
        columns = list(cif.atoms.columns)
    atoms = pd.concat((
        cif.atoms[columns],
        *(pocket["atoms"][columns] for pocket in pockets.values())
    ))

    with tempfile.NamedTemporaryFile("w+", suffix=".cif") as f:
        writer = CifFileWriter(f.name)
        writer.write({cif.entry_id.upper(): {
            "_entity": entities.to_dict(orient="list"),
            "_atom_site": atoms.to_dict(orient="list"),
        }})
        combined = Cif(entry_id, filename=f.name)
        combined.cif.data # to cache it while 'f' exists

    data = [
        # Protein
        {"struct_asym_id": asym_id, 'representation': 'cartoon', 'representationColor': '#AEAEAE', 'focus': True}
        for asym_id in chains
    ]

    if site_residues is not None:
        data += [
            {'struct_asym_id': r["label_asym_id"], 'residue_number': int(r["label_seq_id"]), 'representationColor': colors["green"]}
            for _, r in site_residues.iterrows()
        ]

    # Ligands and molecules: small-molecule residues that are not part of the modulator
    # NOTE(review): this only runs when a modulator is passed - confirm whether other
    # ligands should also be coloured when modulator_residues is None
    if modulator_residues is not None:
        small_molecule_entities = entities.query("type == 'non-polymer'").id.unique().tolist()
        other_ligands = (
            combined.residues
            # The outer merge with indicator marks rows found only in the structure as 'left_only'
            .merge(modulator_residues, how="outer", indicator=True)
            .query(f"_merge == 'left_only' and label_entity_id in {small_molecule_entities}")
            .drop(columns="_merge")
        )
        data += [
            {'struct_asym_id': r["label_asym_id"], 'color': 'white'}
            for _, r in other_ligands.iterrows()
        ]

    # Pockets: every pocket is drawn twice, first all as points and then all as gaussian volumes
    data += [
        {
            "struct_asym_id": "ZZZ", 'residue_number': int(pocketn.replace('pocket', '')), 'representation': representation, 'representationColor': pocket["color"]
        }
        for representation in ('point', 'gaussian-volume')
        for pocketn, pocket in pockets.items()
    ]

    return view_pdb(
        combined,

        hide_polymer = True,
        # hide_heteroatoms = True,
        # hide_non_standard = True,
        hide_carbs = True,
        hide_water = True,

        color_data = {
            "data": data,
            "nonSelectedColor": None,
            "keepColors": True,
            "keepRepresentations": False,
        }
    )

# Predict

- Load or download .cif
    - If it is pdb, convert to cif with pymol. It will be working with auth_asym_id! Or fill a fake label_asym_id?
    - We need the sequence, if it is not in records get it from pymol
    - DSSP looks for self._cif._extended_temp_ciff({"_pdbx_poly_seq_scheme": extra})
- Be able to select chains, specify modulator and/or site
- Fix/standardize the structure
- Run feats, run pocket
- Extra: find closest clust rep and if it's in train or test

## Get protein structure

In [14]:
pdb = get_cif(
    pdb_id="6t4k"
)

In [15]:
view_pdb(pdb)

PDBeMolstar(bg_color='#F7F7F7', custom_data={'data': "data_6T4K\n#\n_entry.id 6T4K\n#\n_citation.abstract ?\n_…

In [16]:
pdb.atoms

Unnamed: 0,group_PDB,id,type_symbol,label_atom_id,label_alt_id,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,...,auth_seq_id,auth_comp_id,auth_asym_id,auth_atom_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
0,ATOM,1,N,N,.,LEU,A,1,23,?,...,267,LEU,A,N,1,23,UNP,P51449,267,L
1,ATOM,2,C,CA,.,LEU,A,1,23,?,...,267,LEU,A,CA,1,23,UNP,P51449,267,L
2,ATOM,3,C,C,.,LEU,A,1,23,?,...,267,LEU,A,C,1,23,UNP,P51449,267,L
3,ATOM,4,O,O,.,LEU,A,1,23,?,...,267,LEU,A,O,1,23,UNP,P51449,267,L
4,ATOM,5,C,CB,.,LEU,A,1,23,?,...,267,LEU,A,CB,1,23,UNP,P51449,267,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2170,HETATM,2171,O,O,.,HOH,F,5,.,?,...,767,HOH,A,O,1,767,?,?,?,?
2171,HETATM,2172,O,O,.,HOH,F,5,.,?,...,768,HOH,A,O,1,768,?,?,?,?
2172,HETATM,2173,O,O,.,HOH,F,5,.,?,...,769,HOH,A,O,1,769,?,?,?,?
2173,HETATM,2174,O,O,.,HOH,F,5,.,?,...,770,HOH,A,O,1,770,?,?,?,?


In [17]:
pdb.residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
0,LEU,A,1,23,?,267,LEU,A,1,23,UNP,P51449,267,L
8,THR,A,1,24,?,268,THR,A,1,24,UNP,P51449,268,T
15,GLU,A,1,25,?,269,GLU,A,1,25,UNP,P51449,269,E
24,ILE,A,1,26,?,270,ILE,A,1,26,UNP,P51449,270,I
32,GLU,A,1,27,?,271,GLU,A,1,27,UNP,P51449,271,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2170,HOH,F,5,.,?,767,HOH,A,1,767,?,?,?,?
2171,HOH,F,5,.,?,768,HOH,A,1,768,?,?,?,?
2172,HOH,F,5,.,?,769,HOH,A,1,769,?,?,?,?
2173,HOH,F,5,.,?,770,HOH,A,1,770,?,?,?,?


## Optional: set a target site

### With a modulator molecule

In [18]:
# Desired modulator is label_asym_id 'C'
pdb.residues.query("label_asym_id == 'C'")

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
2061,4F1,C,3,.,?,602,4F1,A,1,602,?,?,?,?


In [19]:
site = Site(
    pdb, 
    modulator_residues=pdb.residues.query("label_asym_id == 'C'"), 
    only_protein=True
)
site

<__main__.Site at 0x78a68eab0dd0>

In [20]:
site.modulator_residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
2061,4F1,C,3,.,?,602,4F1,A,1,602,?,?,?,?


In [21]:
site.residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
0,TRP,A,1,73,?,317,TRP,A,1,73,UNP,P51449,317,W
1,ALA,A,1,77,?,321,ALA,A,1,77,UNP,P51449,321,A
2,HIS,A,1,78,?,322,HIS,A,1,78,UNP,P51449,322,H
3,LEU,A,1,80,?,324,LEU,A,1,80,UNP,P51449,324,L
4,THR,A,1,81,?,325,THR,A,1,81,UNP,P51449,325,T
5,ILE,A,1,84,?,328,ILE,A,1,84,UNP,P51449,328,I
6,GLN,A,1,85,?,329,GLN,A,1,85,UNP,P51449,329,Q
7,VAL,A,1,88,?,332,VAL,A,1,88,UNP,P51449,332,V
8,LEU,A,1,109,?,353,LEU,A,1,109,UNP,P51449,353,L
9,LYS,A,1,110,?,354,LYS,A,1,110,UNP,P51449,354,K


In [22]:
# List of residue numbers of site
resnums = site.residues.label_seq_id.to_list()
resnums

['73',
 '77',
 '78',
 '80',
 '81',
 '84',
 '85',
 '88',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '232',
 '235',
 '236',
 '238',
 '239',
 '240',
 '241',
 '243',
 '248',
 '250',
 '251',
 '252',
 '253',
 '254',
 '255',
 '257',
 '258',
 '261',
 '262']

### With a list of residues

In [23]:
# Site can be defined with a list of residues instead of a modulator
res_site = Site(
    pdb=pdb,
    residues=[{"label_asym_id": "A", "label_seq_id": seqnum} for seqnum in resnums],
    only_protein=True
)
res_site.residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
0,TRP,A,1,73,?,317,TRP,A,1,73,UNP,P51449,317,W
1,ALA,A,1,77,?,321,ALA,A,1,77,UNP,P51449,321,A
2,HIS,A,1,78,?,322,HIS,A,1,78,UNP,P51449,322,H
3,LEU,A,1,80,?,324,LEU,A,1,80,UNP,P51449,324,L
4,THR,A,1,81,?,325,THR,A,1,81,UNP,P51449,325,T
5,ILE,A,1,84,?,328,ILE,A,1,84,UNP,P51449,328,I
6,GLN,A,1,85,?,329,GLN,A,1,85,UNP,P51449,329,Q
7,VAL,A,1,88,?,332,VAL,A,1,88,UNP,P51449,332,V
8,LEU,A,1,109,?,353,LEU,A,1,109,UNP,P51449,353,L
9,LYS,A,1,110,?,354,LYS,A,1,110,UNP,P51449,354,K


## Process protein structure

In [24]:
clean_pdb = get_clean_pdb(
    pdb,
    protein_chains=["A"]
)
clean_pdb.residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
0,LEU,A,1,23,?,267,LEU,A,1,23,UNP,P51449,267,L
8,THR,A,1,24,?,268,THR,A,1,24,UNP,P51449,268,T
15,GLU,A,1,25,?,269,GLU,A,1,25,UNP,P51449,269,E
24,ILE,A,1,26,?,270,ILE,A,1,26,UNP,P51449,270,I
32,GLU,A,1,27,?,271,GLU,A,1,27,UNP,P51449,271,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1930,LYS,A,1,259,?,503,LYS,A,1,259,UNP,P51449,503,K
1939,GLU,A,1,260,?,504,GLU,A,1,260,UNP,P51449,504,E
1948,LEU,A,1,261,?,505,LEU,A,1,261,UNP,P51449,505,L
1956,PHE,A,1,262,?,506,PHE,A,1,262,UNP,P51449,506,F


## Pockets

In [25]:
from utils.pocket_utils import Pocket, get_pockets_info

In [26]:
import subprocess

# fpocket works on a copy of the cleaned structure inside its own folder and writes its
# results to "<entry_id>_out" next to it
pocket_dir = f"{path}/{clean_pdb.entry_id}"
pocket_out_dir = f"{pocket_dir}/{clean_pdb.entry_id}_out"

# Run only once: an existing output folder is reused
if not os.path.isdir(pocket_out_dir):
    os.makedirs(pocket_dir, exist_ok=True)
    # Argument lists instead of shell strings: paths with spaces or shell metacharacters
    # are passed through untouched, and a failed copy raises instead of going unnoticed
    subprocess.run(["cp", clean_pdb.filename, f"{pocket_dir}/"], check=True)
    subprocess.run(
        ["fpocket", "-m", "3", "-M", "6", "-i", "35", "--file", f"{pocket_dir}/{clean_pdb.entry_id}.cif"]
    )
    # Judge success by the expected output folder rather than by fpocket's exit status
    if not os.path.isdir(pocket_out_dir):
        raise RuntimeError(f"fpocket did not produce {pocket_out_dir}")

In [27]:
# One row per pocket found by fpocket: the pocket name is the part of each .cif file
# name (in the "pockets" output folder) that precedes the first underscore
pockets = pd.DataFrame({
    "pocket": [
        file_name.split("_")[0]
        for file_name in os.listdir(f"{path}/{clean_pdb.entry_id}/{clean_pdb.entry_id}_out/pockets")
        if file_name.endswith(".cif")
    ]
})

pockets

Unnamed: 0,pocket
0,pocket7
1,pocket1
2,pocket2
3,pocket4
4,pocket9
5,pocket10
6,pocket5
7,pocket8
8,pocket3
9,pocket6


## Optional: labelled pocket

In [28]:
# Number of residues forming each pocket, and pocket-site overlap (shown as fractions between 0 and 1)
## site_in_pocket: share of the residues of the site that are part of the pocket (taking the site of the PDB that gives the maximum, comparing all sites of the PDB with the pocket)
## pocket_in_site: share of the residues of the pocket that are part of the allosteric site (taking the site of the PDB that gives the maximum)

pockets = pd.DataFrame((
    pocket
    for pocket in get_pockets_info(
        clean_pdb, 
        sites = ({
            # "mod": site.modulator_residues,
            "site": site.residues
        },),
        pockets_path = f"{path}"
    )
))

# Display the pockets ordered by their best overlap value (the larger of the two columns), highest first
pockets.loc[
    pockets[['site_in_pocket', 'pocket_in_site']].max(axis=1).sort_values(ascending=False).index
]

Unnamed: 0,pdb,pocket,nres,site_in_pocket,pocket_in_site
2,6t4k,pocket2,24,0.727273,1.0
4,6t4k,pocket9,12,0.121212,0.333333
1,6t4k,pocket1,32,0.212121,0.21875
0,6t4k,pocket7,18,0.0,0.0
3,6t4k,pocket4,11,0.0,0.0
5,6t4k,pocket10,8,0.0,0.0
6,6t4k,pocket5,18,0.0,0.0
7,6t4k,pocket8,9,0.0,0.0
8,6t4k,pocket3,12,0.0,0.0
9,6t4k,pocket6,12,0.0,0.0


In [29]:
pocket_id = "pocket1"

pocket = Pocket(f"{path}/{clean_pdb.entry_id}/{clean_pdb.entry_id}_out/pockets/{pocket_id}_atm.cif")
pocket

<utils.pocket_utils.Pocket at 0x78a68d0a96d0>

In [30]:
pocket.residues.sort_values("label_seq_id")

Unnamed: 0,label_comp_id,label_asym_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_asym_id
54,MET,A,114,?,358,A
1,VAL,A,117,?,361,A
6,LEU,A,118,?,362,A
3,ARG,A,120,?,364,A
20,MET,A,121,?,365,A
30,ARG,A,123,?,367,A
31,ALA,A,124,?,368,A
68,VAL,A,132,?,376,A
82,PHE,A,133,?,377,A
15,PHE,A,134,?,378,A


In [31]:
site.residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
0,TRP,A,1,73,?,317,TRP,A,1,73,UNP,P51449,317,W
1,ALA,A,1,77,?,321,ALA,A,1,77,UNP,P51449,321,A
2,HIS,A,1,78,?,322,HIS,A,1,78,UNP,P51449,322,H
3,LEU,A,1,80,?,324,LEU,A,1,80,UNP,P51449,324,L
4,THR,A,1,81,?,325,THR,A,1,81,UNP,P51449,325,T
5,ILE,A,1,84,?,328,ILE,A,1,84,UNP,P51449,328,I
6,GLN,A,1,85,?,329,GLN,A,1,85,UNP,P51449,329,Q
7,VAL,A,1,88,?,332,VAL,A,1,88,UNP,P51449,332,V
8,LEU,A,1,109,?,353,LEU,A,1,109,UNP,P51449,353,L
9,LYS,A,1,110,?,354,LYS,A,1,110,UNP,P51449,354,K


In [32]:
pocket.feats

{'Pocket Score': 0.6382,
 'Drug Score': 0.9502,
 'Number of alpha spheres': 149.0,
 'Mean alpha-sphere radius': 3.6986,
 'Mean alpha-sphere Solvent Acc.': 0.521,
 'Mean B-factor of pocket residues': 0.0775,
 'Hydrophobicity Score': 64.7188,
 'Polarity Score': 8.0,
 'Amino Acid based volume Score': 4.875,
 'Pocket volume (Monte Carlo)': 971.2631,
 'Pocket volume (convex hull)': 617.746,
 'Charge Score': 5.0,
 'Local hydrophobic density Score': 68.7478,
 'Number of apolar alpha sphere': 115.0,
 'Proportion of apolar alpha sphere': 0.7718,
 'Total SASA': 187.486,
 'Polar SASA': 32.277,
 'Apolar SASA': 155.21,
 'Proportion of polar atoms': 20.879,
 'Alpha sphere density': 8.34,
 'Cent. of mass - Alpha Sphere max dist': 20.956,
 'Flexibility': 0.077}

### Pocket features

In [33]:
# Extend the pockets table with one column per entry of each pocket's `.feats` dict,
# read from the pocket's "<pocket>_atm.cif" file in the fpocket output folder
pockets_features = pd.concat(
    (
        pockets,
        pockets.apply(
            lambda row: pd.Series(
                Pocket(f"{path}/{clean_pdb.entry_id}/{clean_pdb.entry_id}_out/pockets/{row['pocket']}_atm.cif").feats
            ), axis=1
        )
    ),
    axis=1
)

pockets_features

Unnamed: 0,pdb,pocket,nres,site_in_pocket,pocket_in_site,Pocket Score,Drug Score,Number of alpha spheres,Mean alpha-sphere radius,Mean alpha-sphere Solvent Acc.,...,Local hydrophobic density Score,Number of apolar alpha sphere,Proportion of apolar alpha sphere,Total SASA,Polar SASA,Apolar SASA,Proportion of polar atoms,Alpha sphere density,Cent. of mass - Alpha Sphere max dist,Flexibility
0,6t4k,pocket7,18,0.0,0.0,0.1011,0.0012,75.0,3.6407,0.5436,...,14.1176,17.0,0.2267,167.041,50.713,116.328,48.0,5.768,14.737,0.203
1,6t4k,pocket1,32,0.212121,0.21875,0.6382,0.9502,149.0,3.6986,0.521,...,68.7478,115.0,0.7718,187.486,32.277,155.21,20.879,8.34,20.956,0.077
2,6t4k,pocket2,24,0.727273,1.0,0.4838,0.942,171.0,3.6925,0.5051,...,96.219,137.0,0.8012,159.964,24.71,135.254,26.316,5.959,13.859,0.26
3,6t4k,pocket4,11,0.0,0.0,0.2847,0.0029,49.0,3.4688,0.5294,...,12.125,16.0,0.3265,83.951,35.646,48.305,48.571,5.228,10.871,0.08
4,6t4k,pocket9,12,0.121212,0.333333,0.0635,0.0188,49.0,3.6567,0.5735,...,18.0769,26.0,0.5306,156.814,66.895,89.919,40.0,6.13,15.602,0.324
5,6t4k,pocket10,8,0.0,0.0,-0.0422,0.0012,38.0,3.6787,0.6109,...,17.7895,19.0,0.5,111.598,45.178,66.42,41.379,3.528,8.134,0.276
6,6t4k,pocket5,18,0.0,0.0,0.1913,0.0119,63.0,3.5531,0.5207,...,25.5152,33.0,0.5238,125.998,57.164,68.835,39.13,5.278,12.785,0.178
7,6t4k,pocket8,9,0.0,0.0,0.0946,0.0053,52.0,3.486,0.496,...,22.9167,24.0,0.4615,97.08,35.491,61.589,45.161,3.845,10.852,0.15
8,6t4k,pocket3,12,0.0,0.0,0.3352,0.0126,48.0,3.2805,0.3951,...,26.5517,29.0,0.6042,35.684,4.286,31.398,29.032,4.061,8.514,0.206
9,6t4k,pocket6,12,0.0,0.0,0.1324,0.002,61.0,3.5795,0.5048,...,25.3333,42.0,0.6885,151.707,57.274,94.433,37.778,5.922,14.17,0.23


## Features

In [34]:
from utils.features_classes import * # Each FClass

# Path to the mkdssp executable downloaded from https://github.com/PDB-REDO/dssp/releases/tag/v4.4.0
BiopythonF.dssp_path = "training_data/utils/external/mkdssp-4.4.0-linux-x64" 
os.chmod(BiopythonF.dssp_path, 0o755)
# Path to the UniRef (or other protein sequences database) downloaded from https://wwwuser.gwdguser.de/~compbiol/uniclust/2023_02
HHBlitsF.uniref_path = uniref_path



┌──────────────────────────────────────────────────────────────────────────────┐
│                                 PyRosetta-4                                  │
│              Created in JHU by Sergey Lyskov and PyRosetta Team              │
│              (C) Copyright Rosetta Commons Member Institutions               │
│                                                                              │
│ NOTE: USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE │
│         See LICENSE.PyRosetta.md or email license@uw.edu for details         │
└──────────────────────────────────────────────────────────────────────────────┘
PyRosetta-4 2025 [Rosetta PyRosetta4.conda.ubuntu.cxx11thread.serialization.Ubuntu.python311.Release 2025.19+release.1354d05daa4c339d591afeecef3c94ca2d38680e 2025-05-07T12:36:04] retrieved from: http://www.pyrosetta.org
core.init: Checking for fconfig files in pwd and ./rosetta/flags
core.init: Rosetta version: PyRosetta4.conda.ubuntu.cxx11thread.ser

In [35]:
from utils.features_utils import calculate_features, get_pdb_features

In [36]:
FClasses

[utils.features_classes.GrapheinF,
 utils.features_classes.FreeSASAF,
 utils.features_classes.DSSPF,
 utils.features_classes.MelodiaF,
 utils.features_classes.BiopythonF,
 utils.features_classes.PyRosettaF,
 utils.features_classes.ProDyF,
 utils.features_classes.TransferEntropyF,
 utils.features_classes.HHBlitsF]

In [37]:
# One sub-folder per structure for the feature files
os.makedirs(f"{path}/features/{clean_pdb.entry_id}", exist_ok=True)

progressbar = tqdm(FClasses)
for fc in progressbar:
    # Drop the last character of the class name (the trailing "F") for the progress-bar label
    progressbar.set_description(f"Calculating {fc.__name__[:-1]}")
    file = f"{path}/features/{clean_pdb.entry_id}/{fc.__name__}.pkl"
    # Feature classes whose .pkl file already exists are skipped, so the cell can be re-run
    if not os.path.isfile(file):
        calculated = calculate_features(clean_pdb.entry_id, fc, file, path, path)
        assert calculated, f"Feature calculation failed: {fc}"

  0%|          | 0/9 [00:00<?, ?it/s]

In [38]:
# Per-residue feature table of the cleaned structure.
# An empty sites table (same columns as the residue table, no rows) is passed
# NOTE(review): presumably this leaves every residue unlabelled (Label == 0) - confirm in get_pdb_features
features = get_pdb_features(
    clean_pdb,
    sites = [pd.DataFrame(columns=clean_pdb.residues.columns),],
    features_path = path
)
features

Unnamed: 0_level_0,Residues,Residues,Residues,Residues,Residues,Residues,Label,Amino acids,Amino acids,Amino acids,...,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits
Unnamed: 0_level_1,label_entity_id,label_asym_id,label_seq_id,auth_asym_id,auth_seq_id,pdbx_PDB_ins_code,label,label_comp_id_A,label_comp_id_C,label_comp_id_D,...,M->M,M->I,M->D,I->M,I->I,D->M,D->D,Neff,Neff_I,Neff_D
0,1,A,23,A,267,?,0,0,0,0,...,0.997231,0.000000,0.002488,0.000000,0.000000,0.871154,0.128693,10.988,0.000,1.095
1,1,A,24,A,268,?,0,0,0,0,...,1.000000,0.000000,0.000000,0.000000,0.000000,0.455651,0.544499,10.967,0.000,1.018
2,1,A,25,A,269,?,0,0,0,0,...,0.982140,0.011049,0.006597,0.404441,0.595428,0.000000,1.000000,10.979,1.108,1.004
3,1,A,26,A,270,?,0,0,0,0,...,0.948027,0.029544,0.022483,0.089871,0.910039,0.643049,0.357001,10.920,1.232,1.058
4,1,A,27,A,271,?,0,0,0,0,...,0.967276,0.029935,0.002522,1.000000,0.000000,0.603740,0.396392,10.944,1.225,1.203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,1,A,259,A,503,?,0,0,0,0,...,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,11.081,0.000,0.000
236,1,A,260,A,504,?,0,0,0,0,...,0.993781,0.004900,0.001596,0.500000,0.500000,0.000000,0.000000,11.076,1.028,0.000
237,1,A,261,A,505,?,0,0,0,0,...,0.998615,0.001072,0.000000,0.166662,0.833353,1.000000,0.000000,11.056,1.001,1.001
238,1,A,262,A,506,?,0,0,0,0,...,0.997231,0.002639,0.000000,0.500000,0.500000,0.000000,0.000000,11.028,1.000,0.000


In [39]:
from utils.pocket_utils import get_mean_pocket_features

In [40]:
# Put the columns under a two-level header: identification/overlap columns under
# "Pockets", every remaining (fpocket descriptor) column under "FPocket"
pockets_features = pd.concat(
    {
        "Pockets": pockets_features[["pdb", "pocket", "nres", "site_in_pocket", "pocket_in_site"]],
        "FPocket": pockets_features.drop(columns=["pdb", "pocket", "nres", "site_in_pocket", "pocket_in_site"]),
    },
    axis=1
)

pockets_features

Unnamed: 0_level_0,Pockets,Pockets,Pockets,Pockets,Pockets,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket,FPocket
Unnamed: 0_level_1,pdb,pocket,nres,site_in_pocket,pocket_in_site,Pocket Score,Drug Score,Number of alpha spheres,Mean alpha-sphere radius,Mean alpha-sphere Solvent Acc.,...,Local hydrophobic density Score,Number of apolar alpha sphere,Proportion of apolar alpha sphere,Total SASA,Polar SASA,Apolar SASA,Proportion of polar atoms,Alpha sphere density,Cent. of mass - Alpha Sphere max dist,Flexibility
0,6t4k,pocket7,18,0.0,0.0,0.1011,0.0012,75.0,3.6407,0.5436,...,14.1176,17.0,0.2267,167.041,50.713,116.328,48.0,5.768,14.737,0.203
1,6t4k,pocket1,32,0.212121,0.21875,0.6382,0.9502,149.0,3.6986,0.521,...,68.7478,115.0,0.7718,187.486,32.277,155.21,20.879,8.34,20.956,0.077
2,6t4k,pocket2,24,0.727273,1.0,0.4838,0.942,171.0,3.6925,0.5051,...,96.219,137.0,0.8012,159.964,24.71,135.254,26.316,5.959,13.859,0.26
3,6t4k,pocket4,11,0.0,0.0,0.2847,0.0029,49.0,3.4688,0.5294,...,12.125,16.0,0.3265,83.951,35.646,48.305,48.571,5.228,10.871,0.08
4,6t4k,pocket9,12,0.121212,0.333333,0.0635,0.0188,49.0,3.6567,0.5735,...,18.0769,26.0,0.5306,156.814,66.895,89.919,40.0,6.13,15.602,0.324
5,6t4k,pocket10,8,0.0,0.0,-0.0422,0.0012,38.0,3.6787,0.6109,...,17.7895,19.0,0.5,111.598,45.178,66.42,41.379,3.528,8.134,0.276
6,6t4k,pocket5,18,0.0,0.0,0.1913,0.0119,63.0,3.5531,0.5207,...,25.5152,33.0,0.5238,125.998,57.164,68.835,39.13,5.278,12.785,0.178
7,6t4k,pocket8,9,0.0,0.0,0.0946,0.0053,52.0,3.486,0.496,...,22.9167,24.0,0.4615,97.08,35.491,61.589,45.161,3.845,10.852,0.15
8,6t4k,pocket3,12,0.0,0.0,0.3352,0.0126,48.0,3.2805,0.3951,...,26.5517,29.0,0.6042,35.684,4.286,31.398,29.032,4.061,8.514,0.206
9,6t4k,pocket6,12,0.0,0.0,0.1324,0.002,61.0,3.5795,0.5048,...,25.3333,42.0,0.6885,151.707,57.274,94.433,37.778,5.922,14.17,0.23


In [41]:
# Append, for every pocket, the columns returned by get_mean_pocket_features,
# computed from the per-residue `features` table
pockets_features = pd.concat(
    (
        pockets_features,
        pockets_features.apply(
            lambda row: get_mean_pocket_features(
                row[("Pockets", "pdb")],
                row[("Pockets", "pocket")],
                pdb_features = features,
                pockets_path = path # base folder; pocket file presumably resolved as f"{pockets_path}/{pdb}/{pdb}_out/pockets/{pocket}_atm.cif"
            ), 
            axis=1 
        )
    ),
    axis=1
)

pockets_features

Unnamed: 0_level_0,Pockets,Pockets,Pockets,Pockets,Pockets,FPocket,FPocket,FPocket,FPocket,FPocket,...,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits,HHBlits
Unnamed: 0_level_1,pdb,pocket,nres,site_in_pocket,pocket_in_site,Pocket Score,Drug Score,Number of alpha spheres,Mean alpha-sphere radius,Mean alpha-sphere Solvent Acc.,...,M->M,M->I,M->D,I->M,I->I,D->M,D->D,Neff,Neff_I,Neff_D
0,6t4k,pocket7,18,0.0,0.0,0.1011,0.0012,75.0,3.6407,0.5436,...,0.961363,0.028005,0.010576,0.474704,0.525247,0.156638,0.843338,11.458333,1.514556,2.2745
1,6t4k,pocket1,32,0.212121,0.21875,0.6382,0.9502,149.0,3.6986,0.521,...,0.968954,0.020083,0.011013,0.284121,0.684639,0.178734,0.821316,11.441063,1.327094,1.956344
2,6t4k,pocket2,24,0.727273,1.0,0.4838,0.942,171.0,3.6925,0.5051,...,0.968146,0.007305,0.024513,0.344764,0.613628,0.287439,0.629219,11.264625,1.025542,1.790083
3,6t4k,pocket4,11,0.0,0.0,0.2847,0.0029,49.0,3.4688,0.5294,...,0.936394,0.058865,0.004742,0.292798,0.707195,0.199877,0.800076,11.429636,1.776364,1.718091
4,6t4k,pocket9,12,0.121212,0.333333,0.0635,0.0188,49.0,3.6567,0.5735,...,0.982837,0.012367,0.004887,0.231718,0.768295,0.213525,0.703135,11.434167,1.223,1.386417
5,6t4k,pocket10,8,0.0,0.0,-0.0422,0.0012,38.0,3.6787,0.6109,...,0.969307,0.027487,0.00326,0.244588,0.755453,0.234273,0.765662,11.445875,1.511375,1.680125
6,6t4k,pocket5,18,0.0,0.0,0.1913,0.0119,63.0,3.5531,0.5207,...,0.927347,0.053829,0.018836,0.420397,0.579574,0.178947,0.821046,11.448111,1.582111,1.830111
7,6t4k,pocket8,9,0.0,0.0,0.0946,0.0053,52.0,3.486,0.496,...,0.936657,0.044453,0.019009,0.230809,0.76919,0.185328,0.814739,11.415,1.804,2.987111
8,6t4k,pocket3,12,0.0,0.0,0.3352,0.0126,48.0,3.2805,0.3951,...,0.991789,0.003775,0.004393,0.368985,0.547733,0.159149,0.840835,11.43075,0.9835,1.384667
9,6t4k,pocket6,12,0.0,0.0,0.1324,0.002,61.0,3.5795,0.5048,...,0.989298,0.005863,0.004753,0.221352,0.361977,0.15524,0.844794,11.353583,0.6415,1.319917


## Predict

In [42]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [43]:
model = TabularPredictor.load("models/pockets_physchem_deploy")

In [44]:
def prepare_data(df):
    """
    Turn the two-level pocket feature table into the flat table given to the model.

    Rows are indexed by "<pdb>_<pocket>" (from the "Pockets" group), the "Pockets" group is
    dropped, and every remaining column is named "<group>_<feature>". The input `df` is
    left untouched.

    Returns a `TabularDataset`.
    """
    # Build the row labels from the caller's table without writing them back into it
    row_ids = df["Pockets"][["pdb", "pocket"]].apply(lambda x: "_".join(x), axis=1)

    # drop() returns a new table, so the assignments below do not touch `df`
    flat = df.drop(columns=["Pockets"], level=0)
    flat.index = row_ids.tolist()
    flat.columns = ["_".join(column) for column in flat.columns.values]
    return TabularDataset(flat)

In [45]:
data = prepare_data(pockets_features)
data

Unnamed: 0,FPocket_Pocket Score,FPocket_Drug Score,FPocket_Number of alpha spheres,FPocket_Mean alpha-sphere radius,FPocket_Mean alpha-sphere Solvent Acc.,FPocket_Mean B-factor of pocket residues,FPocket_Hydrophobicity Score,FPocket_Polarity Score,FPocket_Amino Acid based volume Score,FPocket_Pocket volume (Monte Carlo),...,HHBlits_M->M,HHBlits_M->I,HHBlits_M->D,HHBlits_I->M,HHBlits_I->I,HHBlits_D->M,HHBlits_D->D,HHBlits_Neff,HHBlits_Neff_I,HHBlits_Neff_D
6t4k_pocket7,0.1011,0.0012,75.0,3.6407,0.5436,0.2026,30.4444,10.0,4.2222,587.1933,...,0.961363,0.028005,0.010576,0.474704,0.525247,0.156638,0.843338,11.458333,1.514556,2.2745
6t4k_pocket1,0.6382,0.9502,149.0,3.6986,0.521,0.0775,64.7188,8.0,4.875,971.2631,...,0.968954,0.020083,0.011013,0.284121,0.684639,0.178734,0.821316,11.441063,1.327094,1.956344
6t4k_pocket2,0.4838,0.942,171.0,3.6925,0.5051,0.2603,61.2083,7.0,4.5,714.6551,...,0.968146,0.007305,0.024513,0.344764,0.613628,0.287439,0.629219,11.264625,1.025542,1.790083
6t4k_pocket4,0.2847,0.0029,49.0,3.4688,0.5294,0.0798,17.0,8.0,4.2727,423.3977,...,0.936394,0.058865,0.004742,0.292798,0.707195,0.199877,0.800076,11.429636,1.776364,1.718091
6t4k_pocket9,0.0635,0.0188,49.0,3.6567,0.5735,0.324,12.8333,6.0,4.3333,503.6429,...,0.982837,0.012367,0.004887,0.231718,0.768295,0.213525,0.703135,11.434167,1.223,1.386417
6t4k_pocket10,-0.0422,0.0012,38.0,3.6787,0.6109,0.2755,11.75,6.0,3.75,284.0205,...,0.969307,0.027487,0.00326,0.244588,0.755453,0.234273,0.765662,11.445875,1.511375,1.680125
6t4k_pocket5,0.1913,0.0119,63.0,3.5531,0.5207,0.1777,33.6111,8.0,4.7778,470.7274,...,0.927347,0.053829,0.018836,0.420397,0.579574,0.178947,0.821046,11.448111,1.582111,1.830111
6t4k_pocket8,0.0946,0.0053,52.0,3.486,0.496,0.1498,29.1111,4.0,4.1111,342.7868,...,0.936657,0.044453,0.019009,0.230809,0.76919,0.185328,0.814739,11.415,1.804,2.987111
6t4k_pocket3,0.3352,0.0126,48.0,3.2805,0.3951,0.2059,50.5,5.0,4.25,197.7561,...,0.991789,0.003775,0.004393,0.368985,0.547733,0.159149,0.840835,11.43075,0.9835,1.384667
6t4k_pocket6,0.1324,0.002,61.0,3.5795,0.5048,0.2302,6.9167,8.0,4.25,627.7432,...,0.989298,0.005863,0.004753,0.221352,0.361977,0.15524,0.844794,11.353583,0.6415,1.319917


In [46]:
# Predicted probability of the class labelled 1 for every pocket, highest first
preds = model.predict_proba(data)[[1]].sort_values(1, ascending=False)
preds

Unnamed: 0,1
6t4k_pocket2,0.911848
6t4k_pocket1,0.650117
6t4k_pocket5,0.022494
6t4k_pocket7,0.001133
6t4k_pocket6,0.000772
6t4k_pocket4,0.000538
6t4k_pocket3,0.000361
6t4k_pocket9,0.000175
6t4k_pocket8,0.000146
6t4k_pocket10,6.7e-05


### View

In [47]:
v = view_pockets(
    clean_pdb,
    pockets={"pocket2": {"color": "green"}, "pocket1": {"color": "blue"}}, # {"pocketn": {"color": ""}}
    site_residues=site.residues,
    modulator_residues=site.modulator_residues,
)
v

PDBeMolstar(bg_color='#F7F7F7', color_data={'data': [{'struct_asym_id': 'A', 'representation': 'cartoon', 'rep…