# Database setup

The cell below creates the database and all associated tables. Existing tables will not be cleared. You should re-run this cell if there are changes to the database tables that should be applied.

In [1]:
import pickle
import logging

from itertools import chain
from typing import List, Set, Generator, Dict

from src import settings
from src.database import models
from src.constants import Paths


# Shared logger, looked up under the name configured in the project settings.
logger = logging.getLogger(settings.LOGGER_NAME)

# Attach every model class to the configured database (bind_refs and
# bind_backrefs are both enabled), then create the tables for those models.
settings.DATABASE.bind(
    models.MODELS,
    bind_refs=True,
    bind_backrefs=True,
)
settings.DATABASE.create_tables(models.MODELS)

# REST client setup

Clients for Uniprot and Kegg are also created so downloads can be cached during this session and re-used if something goes wrong.

In [2]:
from src.clients import Kegg, UniprotClient

# Both clients are kept as notebook-level globals so anything they have
# cached can be re-used if a later cell fails part-way through.

# Kegg: toggle the download cache, then build the client.
cache_kegg = True
kegg_client = Kegg(use_cache=cache_kegg)

# UniProt: same pattern, with verbose output switched on.
cache_uniprot = True
uniprot_client = UniprotClient(use_cache=cache_uniprot, verbose=True)

# Annotations

The cell directly below will populate the annotation tables, which include Pfam, InterPro and Gene Ontology terms. This is optional but if performed will result in less time being required to populate the protein and interaction tables.

In [3]:
from src.parsers import go, interpro, pfam
from src.database.utilities import create_terms
from src.database.models import GeneOntologyTerm, PfamTerm, InterproTerm


# Populate the three annotation tables within the database's `atomic()`
# context (presumably a single transaction -- confirm against the database
# library in use). The tables are filled in the order Pfam, GO, InterPro.
with settings.DATABASE.atomic():
    # Pfam terms, read from the clans file.
    create_terms(terms=pfam.parse_clans_file(path=Paths.pfam_clans), model=PfamTerm)

    # Gene Ontology terms, read from the OBO file.
    create_terms(terms=go.parse_go_obo(path=Paths.go_obo), model=GeneOntologyTerm)

    # InterPro terms, read from the entry list.
    create_terms(terms=interpro.parse_entry_list(path=Paths.interpro_entries), model=InterproTerm)

[INFO] 2019-09-05 12:43:38,574 go Unzipping '/home/daniel/.tamago/data/go.obo.gz' into '/tmp/uzippedleq4f4ky.obo'


/tmp/uzippedleq4f4ky.obo: fmt(1.2) rel(2019-06-01) 47,444 GO Terms


# Proteins and Interactions

The cell below will parse interactions and update the accessions of all `source` and `target` nodes to the most recent
UniProt accessions. Interactions which cannot be mapped will be discarded.

In [4]:
from src.parsers import kegg, hprd, bioplex, pina, innate
from src.parsers.types import InteractionData
from src.clients.uniprot import UniprotEntry
from src.database.utilities import update_accessions


logger.info("Parsing interactions.")

# Every parser's output is chained into one stream. Each interaction is
# normalized (formatting its identifiers/fields) and collecting the results
# in a set drops the duplicates.
interactions: Set[InteractionData] = {
    parsed.normalize()
    for parsed in chain(
        kegg.parse_interactions(client=kegg_client),
        hprd.parse_interactions(
            ptms=hprd.parse_ptm(path=Paths.hprd_ptms),
            xrefs=hprd.parse_xref_mapping(path=Paths.hprd_xref),
        ),
        pina.parse_interactions(path=Paths.pina2_mitab),
        innate.parse_interactions(Paths.innate_all),
        innate.parse_interactions(Paths.innate_curated),
        bioplex.parse_interactions(path=Paths.bioplex),
    )
}

[INFO] 2019-09-05 12:44:34,349 <ipython-input-4-c619f0ce3991> Parsing interactions.
[INFO] 2019-09-05 12:44:36,781 kegg Downloading Kegg to UniProt mapping.
[INFO] 2019-09-05 12:44:36,783 kegg Downloading HSA pathways.
100%|██████████| 335/335 [00:07<00:00, 46.25it/s]
[INFO] 2019-09-05 12:44:47,478 kegg Generating interactions.


In [5]:
logger.info("Fetching updated UniProt identifiers.")

# Gather the accession of every source and target node across all parsed
# interactions; the set keeps one copy of each.
uniprot_ids: Set[str] = {
    accession
    for edge in interactions
    for accession in (edge.source, edge.target)
}

# Request the mapping from each collected identifier to its most recent
# accession(s), sent in batches of 500 identifiers.
uniprot_id_mapping: Dict[str, List[str]] = uniprot_client.get_mapping_table(
    identifiers=uniprot_ids,
    fr="ACC+ID",
    to="ACC",
    batch_size=500,
)

[INFO] 2019-09-05 12:45:55,433 <ipython-input-5-50afc3efc0e9> Fetching updated UniProt identifiers.
[INFO] 2019-09-05 12:45:55,884 uniprot Requesting 41 batches of size 500.
100%|██████████| 41/41 [03:51<00:00,  5.64s/it]


In [6]:
logger.info("Mapping interactions to updated UniProt identifiers.")

# Rewrite every source/target node to its most recent accession using the
# mapping fetched above. Interactions that become equal after the rewrite
# are aggregated into a single instance by this call.
mapped_interactions: List[InteractionData] = update_accessions(
    interactions=interactions,
    mapping=uniprot_id_mapping,
)

[INFO] 2019-09-05 12:49:47,310 <ipython-input-6-3a47dc86cc0b> Mapping interactions to updated UniProt identifiers.


# Querying UniProt

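The cell below queries UniProt for the XML flat files for each UniProt identifier. This can take a while if this is a first-time run, or there are a lot of cache misses. Note that the entries are returned as a generator, so the requests are only issued once the entries are consumed in the database population step below.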

In [7]:
# Set up the entry download for every collected identifier, in batches of 500.
# The result is annotated as a generator: it can be consumed only once, so
# re-run this cell before re-running any cell that iterates over it.
uniprot_entries: Generator[UniprotEntry, None, None] = uniprot_client.get_entries(
    identifiers=uniprot_ids,
    batch_size=500,
)

# Database population

The cell below will parse and validate all interactions, including UniProt identifiers and associated metadata, and then commit these to the database. Additional metadata will be associated with interactions such as PubMed/PSIMI identifiers, labels and the source databases. Directionality information is retained such that separate rows will be added for the interactions `(A, B)` and `(B, A)` where `A` is the source node and `B` is the target node. This information is particularly useful for analysing Kegg interactions, which have directionality information in the labels.

In [8]:
from src.database.utilities import create_interactions, create_proteins

# Feed the UniProt entries to the protein-table builder, skipping any entry
# that came back as None. This consumes the `uniprot_entries` generator.
proteins: List[models.Protein]
proteins = create_proteins(
    entry for entry in uniprot_entries if entry is not None
)
logger.info(f"Created {len(proteins)} protein rows")

[INFO] 2019-09-05 12:50:17,922 uniprot Requesting 41 batches of size 500.
100%|██████████| 41/41 [1:00:17<00:00, 88.24s/it]
[INFO] 2019-09-05 13:51:10,170 uniprot Parsing XML into UniprotEntry instances.
[INFO] 2019-09-05 14:05:59,851 utilities Populating annotation tables.
[INFO] 2019-09-05 14:06:33,153 utilities Populating gene symbol table.
[INFO] 2019-09-05 14:06:35,144 utilities Populating uniprot identifier table.
[INFO] 2019-09-05 14:06:39,314 utilities Updating Protein table.
[INFO] 2019-09-05 14:20:58,687 <ipython-input-8-2d88604cf55f> Created 19552 protein rows


In [9]:
# Create the interaction rows from the accession-mapped interactions.
# NOTE(review): this rebinds `interactions`, which earlier in the notebook
# held the set of parsed InteractionData. Cells that iterate over the parsed
# set must not be re-run after this one without re-running the parsing cell.
interactions: List[models.Interaction]
interactions = create_interactions(mapped_interactions)
logger.info(f"Created {len(interactions)} interaction rows")

[INFO] 2019-09-05 14:21:10,954 utilities Populating interaction evidence and metadata tables.
[INFO] 2019-09-05 14:23:51,478 utilities Populating interaction table.








































[INFO] 2019-09-05 14:53:19,380 <ipython-input-9-bb2367705e0a> Created 372232 interaction rows


In [2]:
# Accession pairs used to spot-check the Interaction table.
identifiers = [("P08670", "Q8TF68"), ("Q9P2W3", "P48051")]

# Only the first pair is queried; the bare expression displays the row count.
models.Interaction.filter_by_edge(identifiers[:1]).count()

OperationalError: no such column: t1.index

In [3]:
from src.parsers import types

# Build one evidence record from a bare PubMed id and a bare PSI-MI id, then
# keep the result of normalizing it in a one-element list.
e = types.InteractionEvidenceData("1", "0001")
es = [e.normalize()]

In [4]:
# Display the evidence instance's repr, as it stands after normalize() was
# called on it in the previous cell.
e

InteractionEvidenceData(pubmed='PUBMED:1', psimi='MI:0001')