This notebook is a demonstration of PyBioPAX that contains a mix of functions that have been pre-implemented in `pybiopax.tools` as well as *ad-hoc* traversal tools implemented in the notebook.

In [None]:
import gzip
import pickle
from collections import Counter
from typing import Any, Iterable, Optional, Set, Tuple

import pybiopax
import pystow
from IPython.display import HTML
from lxml import etree
from pybiopax.biopax import *
from tabulate import tabulate
from tqdm.auto import tqdm

In [None]:
def ensure_pc_detailed(version: Optional[str] = None, force: bool = False):
    """Download (if needed) and parse the Pathway Commons "Detailed" BioPAX export.

    :param version: Pathway Commons release to load, e.g. ``"12"``. When ``None``,
        the version is looked up via :func:`bioversions.get_version`.
    :param force: Forwarded to :func:`pystow.ensure`; if true, the archive is
        downloaded again even if it is already present locally.
    :returns: The model produced by :func:`pybiopax.model_from_owl_gz`.
    """
    if version is None:
        # Imported lazily so bioversions is only needed when no version is pinned.
        import bioversions

        version = bioversions.get_version("pathwaycommons")

    url = f"https://www.pathwaycommons.org/archives/PC2/v{version}/PathwayCommons{version}.Detailed.BIOPAX.owl.gz"
    # ``force`` used to be accepted but silently ignored; pass it through so
    # callers can actually trigger a fresh download.
    path = pystow.ensure("bio", "pathwaycommons", version, url=url, force=force)
    return pybiopax.model_from_owl_gz(path)

# Load Pathway Commons release 12; the resulting model is queried by the cells below.
pc12 = ensure_pc_detailed(version="12")

In [None]:
# Tally how many objects of each class the loaded model contains,
# then print the tally from most to least frequent.
type_counter = Counter(type(obj).__name__ for obj in pc12.objects.values())
print(
    tabulate(
        type_counter.most_common(),
        headers=["Type", "Count"],
    )
)

## Which enzymes need to be phosphorylated to catalyze a reaction?

There are a few different perspectives on the concept of "active states", but this is a quick-and-dirty way of identifying them.

In [None]:
def iter_modifications(entity: PhysicalEntity, query: str) -> Iterable[ModificationFeature]:
    """Yield the entity's modification features whose type has a term containing ``query``.

    :param entity: The physical entity whose ``feature`` collection is scanned.
    :param query: Substring to look for in the terms of each modification type.
    """
    for candidate in entity.feature or []:
        # Only modification features are of interest here.
        if not isinstance(candidate, ModificationFeature):
            continue
        vocabulary = candidate.modification_type
        # Skip features whose modification type is missing.
        if not vocabulary:
            continue
        # Keep the feature if at least one term mentions the query string.
        if any(query in term for term in vocabulary.term):
            yield candidate

def iter_phosphosites(protein: Protein):
    """Yield the modification features of ``protein`` whose type mentions "phospho"."""
    for site in iter_modifications(protein, "phospho"):
        yield site

# Build one table row per (catalysis, controller) pair in which the controller
# is a protein carrying at least one phosphosite.
rows = []
for catalysis in tqdm(pc12.get_objects_by_type(Catalysis)):
    phospho_enzymes = [
        controller
        for controller in catalysis.controller
        if isinstance(controller, Protein) and list(iter_phosphosites(controller))
    ]
    for enzyme in phospho_enzymes:
        reactants = ", ".join(
            participant.display_name for participant in catalysis.controlled.left
        )
        products = ", ".join(
            participant.display_name for participant in catalysis.controlled.right
        )
        rows.append((catalysis.display_name or "", enzyme.display_name, reactants, products))

HTML(
    tabulate(
        rows,
        headers=["Name", "Enzyme", "Reactants", "Products"],
        tablefmt="html",
    )
)

## Which catalyses of biochemical reactions require a cofactor?

It turns out Pathway Commons has a few more than 80 with this granularity.

In [None]:
def iter_cofactored_catalyses(model: BioPaxModel) -> Iterable[Catalysis]:
    """Yield every catalysis in ``model`` that has a cofactor and controls a biochemical reaction."""
    for catalysis in model.get_objects_by_type(Catalysis):
        # Both conditions must hold: a non-empty cofactor and a
        # BiochemicalReaction as the controlled process.
        if catalysis.cofactor and isinstance(catalysis.controlled, BiochemicalReaction):
            yield catalysis

def _join_display_names(entities) -> str:
    """Render one entity, a collection of entities, or ``None`` as comma-separated display names.

    Entities without a display name contribute an empty string instead of
    breaking the join.
    """
    if entities is None:
        return ""
    if not isinstance(entities, (list, tuple, set, frozenset)):
        # A single object was supplied rather than a collection.
        entities = [entities]
    return ", ".join(entity.display_name or "" for entity in entities)


# One row per catalysis that requires a cofactor. BioPAX permits several
# cofactors (and controllers) per catalysis, so both are rendered through the
# list-aware joiner instead of being accessed as a single object.
rows = [
    (
        obj.display_name,
        _join_display_names(obj.controller),
        _join_display_names(obj.cofactor),
        _join_display_names(obj.controlled.left),
        _join_display_names(obj.controlled.right),
    )
    for obj in iter_cofactored_catalyses(pc12)
]

HTML(tabulate(rows, tablefmt="html", headers=["Name", "Controller", "Cofactor", "Reactants", "Products"]))

## Find Phosphorylation Reactions

Later, this can be generalized to find the addition of any kind of modification.

In [None]:
def get_simple_physical_entity_xrefs(obj: SimplePhysicalEntity) -> Set[Tuple[str, str]]:
    """Return the ``(db, id)`` pairs of all xrefs on the entity's reference.

    An empty set is returned when the entity has no entity reference.
    """
    reference = obj.entity_reference
    if reference:
        return {(xref.db, xref.id) for xref in reference.xref or []}
    return set()

def is_modification_reaction(obj: Any) -> bool:
    """Tell whether ``obj`` is a one-to-one protein reaction on the same underlying entity.

    The object must be a biochemical reaction with exactly one participant on
    each side, both proteins, whose entity references share at least one xref.
    """
    if not isinstance(obj, BiochemicalReaction):
        return False
    if not (len(obj.left) == 1 and len(obj.right) == 1):
        return False
    reactant = obj.left[0]
    product = obj.right[0]
    if not (isinstance(reactant, Protein) and isinstance(product, Protein)):
        return False
    reactant_xrefs = get_simple_physical_entity_xrefs(reactant)
    product_xrefs = get_simple_physical_entity_xrefs(product)
    # A shared xref is taken as evidence that both sides are the same protein.
    return not reactant_xrefs.isdisjoint(product_xrefs)

def iter_modification_reactions(model: BioPaxModel) -> Iterable[BiochemicalReaction]:
    """Yield the biochemical reactions in the model that satisfy
    :func:`is_modification_reaction`.
    """
    reactions = model.get_objects_by_type(BiochemicalReaction)
    yield from filter(is_modification_reaction, reactions)

def iter_phosphorylations(m):
    """Yield modification reactions whose reactant has no phosphosite but whose product has one."""
    for reaction in iter_modification_reactions(m):
        reactant, product = reaction.left[0], reaction.right[0]
        # A reactant that already carries a phosphosite disqualifies the reaction.
        if list(iter_phosphosites(reactant)):
            continue
        if list(iter_phosphosites(product)):
            yield reaction

# One row per detected phosphorylation: the reaction name and its single reactant.
rows = [
    (reaction.display_name, reaction.left[0])
    for reaction in iter_phosphorylations(pc12)
]

HTML(
    tabulate(
        rows,
        headers=["Name", "Reactant"],
        tablefmt="html",
    )
)

## Get Proteins with Bound Small Molecules 

In general, complexes are easy to retrieve by calling `get_objects_by_type()` and then iterating over each complex's `component` attribute. The following functions iterate over complexes that consist of exactly one protein and one or more small molecules.

In [None]:
def head(it, n=10):
    """Yield at most the first ``n`` items of the iterable ``it``."""
    iterator = iter(it)
    for _ in range(n):
        try:
            item = next(iterator)
        except StopIteration:
            # The source ran out before ``n`` items were produced.
            return
        yield item

def iter_bound(m):
    """Yield complexes made of exactly one protein plus only small molecules.

    Components are classified by their exact class, so a complex qualifies only
    if its component classes are precisely ``Protein`` and ``SmallMolecule``.
    """
    for cplx in m.get_objects_by_type(Complex):
        kinds = Counter(type(part) for part in cplx.component)
        if kinds.get(Protein) == 1 and set(kinds) == {SmallMolecule, Protein}:
            yield cplx

# Show the first few protein/small-molecule complexes with their components indented below.
for bound_complex in head(iter_bound(pc12)):
    print(bound_complex)
    for member in bound_complex.component:
        print(f"  {member}")

Get proteins with multiple bound small molecules

In [None]:
def iter_bound_multiple(m, minimum: int = 2):
    """Yield single-protein complexes bound to at least ``minimum`` small molecules.

    The protein/small-molecule filtering is delegated to :func:`iter_bound`
    rather than duplicated here.

    :param m: The model to search.
    :param minimum: Smallest number of small-molecule components a complex must
        have to be yielded. Defaults to 2.
    """
    for obj in iter_bound(m):
        # iter_bound classifies components by exact class, so count the same way.
        small_molecule_count = sum(
            1 for part in obj.component if type(part) is SmallMolecule
        )
        if small_molecule_count >= minimum:
            yield obj

# Show the first few complexes with several bound small molecules, components indented below.
for bound_complex in head(iter_bound_multiple(pc12)):
    print(bound_complex)
    for member in bound_complex.component:
        print(f"  {member}")

In [None]:
# Count the xref classes attached to the entity references of all simple
# physical entities; entities without an entity reference are skipped.
counter = Counter(
    type(xref).__name__
    for entity in pc12.get_objects_by_type(SimplePhysicalEntity)
    if entity.entity_reference
    for xref in entity.entity_reference.xref or []
)
counter.most_common()