# Database setup

The cell below creates the database and all associated tables. Existing tables will not be cleared. You should re-run this cell if there are changes to the database tables that need to be applied.

In [1]:
%load_ext autoreload
%autoreload 2

import pickle
import logging

from itertools import chain
from typing import List, Set, Generator, Dict, Tuple

from src import settings
from src.database import models
from src.constants import Paths


# Logger used by every cell in this notebook.
logger = logging.getLogger(settings.LOGGER_NAME)

# Attach all model classes (including their references and back-references)
# to the configured database, then request table creation for each of them.
database = settings.DATABASE
database.bind(
    models.MODELS,
    bind_refs=True,
    bind_backrefs=True,
)
database.create_tables(models.MODELS)

# REST client setup

Clients for Uniprot and Kegg are also created so downloads can be cached during this session and re-used if something goes wrong.

In [2]:
from src.clients import Kegg, UniprotClient

# KEGG organism code used when downloading pathways and the gene-to-UniProt
# conversion table.
kegg_organism = 'hsa'

# Switches for the download cache of each REST client.
cache_kegg = True
cache_uniprot = True

# The clients are kept at notebook level so that cached downloads can be
# re-used if a later cell fails and has to be run again.
kegg_client = Kegg(use_cache=cache_kegg)
uniprot_client = UniprotClient(
    use_cache=cache_uniprot,
    verbose=True,
)

# Annotations

The cell directly below will populate the annotation tables, which include Pfam, InterPro and Gene Ontology terms. This step is optional, but performing it will reduce the time required to populate the protein and interaction tables.

In [3]:
from src.parsers import go, interpro, pfam
from src.database.utilities import create_terms
from src.database.models import GeneOntologyTerm, PfamTerm, InterproTerm


# Each row pairs a parser and its input file with the model class that is
# handed to `create_terms` for the parsed terms.
term_sources = (
    (pfam.parse_clans_file, Paths.pfam_clans, PfamTerm),
    (go.parse_go_obo, Paths.go_obo, GeneOntologyTerm),
    (interpro.parse_entry_list, Paths.interpro_entries, InterproTerm),
)

# All three annotation sources are processed inside one atomic block, in the
# order Pfam, Gene Ontology, InterPro. Each file is parsed immediately before
# its terms are created.
with settings.DATABASE.atomic():
    for parse_terms, terms_path, term_model in term_sources:
        create_terms(
            terms=parse_terms(path=terms_path),
            model=term_model,
        )

[INFO] 2019-09-08 11:21:26,024 go Unzipping '/home/daniel/.tamago/data/go.obo.gz' into '/tmp/uzippedjpddpzfm.obo'


/tmp/uzippedjpddpzfm.obo: fmt(1.2) rel(2019-06-01) 47,444 GO Terms


# Proteins and Interactions

The cells below will:

1. Parse all interaction data sources.

2. Collect a list of UniProt accessions to download XML records for.

3. Download a mapping table for the accessions collected in step **(2)**, which maps them to their most recent **primary** accessions on UniProt, or to nothing if the accession is no longer valid.

4. Update the accessions of all `source` and `target` nodes to the most recent UniProt accessions. Interactions which cannot be mapped will be discarded.

5. Normalize interactions such that all fields will have the same format and can be combined seamlessly.

6. Aggregate all interactions by their `source` and `target` nodes. Metadata fields (labels, databases etc) will be combined into a single data instance.

In [4]:
from src.parsers import kegg, hprd, bioplex, pina, innate
from src.parsers.types import InteractionData
from src.clients.uniprot import UniprotEntry
from src.clients.kegg import KeggPathway
from src.database.utilities import update_accessions


# Fetch and parse every pathway of the configured organism from KEGG.
pathways: List[KeggPathway] = kegg_client.parse_all_pathways(
    organism=kegg_organism,
    verbose=True,
)

# Fetch the KEGG gene -> UniProt conversion table for the same organism.
logger.info(f"Downloading '{kegg_organism}' gene to UniProt conversion table.")
kegg_gene_to_uniprot: Dict[str, List[str]] = kegg_client.convert(
    source=kegg_organism
)

# Labels handed to the KEGG parser as `exclude_labels`.
excluded_kegg_labels = (
    "indirect effect",
    "compound",
    "hidden compound",
    "state change",
    "missing interaction",
)

logger.info("Parsing interactions.")

# One stream of interaction records over all data sources, in the order
# KEGG, HPRD, PINA2, InnateDB (all, then curated) and BioPlex.
interaction_sources = chain(
    kegg.parse_interactions(
        pathways=pathways,
        kegg_gene_to_uniprot=kegg_gene_to_uniprot,
        exclude_labels=excluded_kegg_labels,
    ),
    hprd.parse_interactions(
        ptms=hprd.parse_ptm(path=Paths.hprd_ptms),
        xrefs=hprd.parse_xref_mapping(path=Paths.hprd_xref),
    ),
    pina.parse_interactions(path=Paths.pina2_mitab),
    innate.parse_interactions(Paths.innate_all),
    innate.parse_interactions(Paths.innate_curated),
    bioplex.parse_interactions(path=Paths.bioplex),
)

# `normalize()` brings the identifiers/fields of each record into a common
# format; collecting the normalized records into a set drops duplicates.
interactions: Set[InteractionData] = {
    record.normalize() for record in interaction_sources
}

[INFO] 2019-09-08 11:21:48,819 kegg Downloading and parsing 335 'hsa' pathways.
100%|██████████| 335/335 [00:05<00:00, 63.12it/s] 
[INFO] 2019-09-08 11:21:54,138 <ipython-input-4-0061b4f498ec> Downloading 'hsa' gene to UniProt conversion table.
[INFO] 2019-09-08 11:21:54,139 <ipython-input-4-0061b4f498ec> Parsing interactions.
[INFO] 2019-09-08 11:21:59,147 kegg Parsing pathway XML files for interactions.














































































IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

















































































IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

















































































IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

















































































IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

















































































IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

















































































IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

















































































IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)













In [5]:
# Look up the most recent UniProt accession for every identifier in use.
logger.info("Fetching updated UniProt identifiers.")

# Every accession that appears as the source or the target of an interaction.
uniprot_ids: Set[str] = set(
    chain.from_iterable(
        (interaction.source, interaction.target)
        for interaction in interactions
    )
)

uniprot_id_mapping: Dict[str, List[str]] = uniprot_client.get_mapping_table(
    identifiers=uniprot_ids,
    fr='ACC+ID',
    to="ACC",
    batch_size=500,
)

# Isoform identifiers (exactly one '-' in the accession) can come back with
# no mapping: the isoform suffix may have been deleted or changed, in which
# case UniProt cannot map it to a primary accession. The XML record of the
# base accession might still exist, so fall back to the accession with the
# isoform suffix clipped and keep the interactions that use it.
for accession, mapped in uniprot_id_mapping.items():
    is_isoform = accession.count('-') == 1
    if is_isoform and len(mapped) == 0:
        # Only the value of an existing key is replaced, so the dictionary
        # does not change size while it is being iterated.
        uniprot_id_mapping[accession] = [accession.partition('-')[0]]

[INFO] 2019-09-08 11:23:37,212 <ipython-input-5-412b911e4ebc> Fetching updated UniProt identifiers.
[INFO] 2019-09-08 11:23:37,664 uniprot Requesting 48 batches.
100%|██████████| 48/48 [06:15<00:00,  7.82s/it]


In [6]:
# Rewrite the source and target node of every interaction to its most recent
# accession, using the mapping table built in the previous cell. Isoform
# accessions are kept (`keep_isoforms=True`).
logger.info("Mapping interactions to updated UniProt identifiers.")
mapped_interactions: List[InteractionData] = update_accessions(
    interactions=interactions,
    mapping=uniprot_id_mapping,
    keep_isoforms=True,
)

# Normalize the mapped records once more and aggregate them by their source
# and target nodes.
logger.info("Aggregating interactions by souce and target nodes.")
renormalized = (record.normalize() for record in mapped_interactions)
aggregated_interactions: List[InteractionData] = InteractionData.aggregate(
    renormalized
)

[INFO] 2019-09-08 11:29:53,070 <ipython-input-6-141f2b885137> Mapping interactions to updated UniProt identifiers.
[INFO] 2019-09-08 11:29:56,400 <ipython-input-6-141f2b885137> Aggregating interactions by souce and target nodes.


# Querying UniProt

The cell below queries UniProt for the XML flat file of each UniProt identifier. This can take a while on a first run, or when there are a lot of cache misses. Try again with a lower `batch_size` if you see a `Connection reset` or similar error.

In [11]:
# Stream of `(identifier, entry)` pairs for every requested identifier,
# skipping those for which no entry came back (`entry is None`).
#
# NOTE: this is a generator expression, so it can be consumed only once.
# If a cell that iterates over `uniprot_entries` fails part-way through,
# re-run this cell first; otherwise the retry sees an exhausted stream.
#
# NOTE(review): the identifier half of each pair is presumably an accession
# string, as in `uniprot_ids` -- confirm against `UniprotClient.get_entries`.
uniprot_entries: Generator[Tuple[str, UniprotEntry], None, None]
uniprot_entries = (
    (identifier, entry)
    for identifier, entry in uniprot_client.get_entries(
        identifiers=uniprot_ids,
        batch_size=500,
    )
    if entry is not None
)

# Database population

The cell below will parse and validate all interactions, including UniProt identifiers and associated metadata, and then commit these to the database. Additional metadata will be associated with interactions such as PubMed/PSIMI identifiers, labels and the source databases. Directionality information is retained such that separate rows will be added for the interactions `(A, B)` and `(B, A)` where `A` is the source node and `B` is the target node. This information is particularly useful for analysing Kegg interactions, which have directionality information in the labels.

In [12]:
from src.database.utilities import create_proteins


# Hand the stream of UniProt entries to `create_proteins` and report how many
# protein rows came back.
proteins: List[models.Protein] = create_proteins(uniprot_entries)
logger.info(
    f"Created {len(proteins)} protein rows"
)

[INFO] 2019-09-08 13:12:38,757 uniprot Requesting 1 batches.
100%|██████████| 1/1 [00:06<00:00,  6.92s/it]
[INFO] 2019-09-08 13:12:45,679 uniprot Parsing XML into UniprotEntry instances.




[INFO] 2019-09-08 13:31:48,956 utilities Populating annotation tables.
[INFO] 2019-09-08 13:32:17,439 utilities Populating gene symbol table.
[INFO] 2019-09-08 13:32:19,125 utilities Populating uniprot identifier table.
[INFO] 2019-09-08 13:32:24,553 utilities Updating Protein data table.


{'go_terms': {GeneOntologyTermData(identifier='GO:0043025', category='Cellular component', obsolete=False, name='neuronal cell body', description=None), GeneOntologyTermData(identifier='GO:0035145', category='Cellular component', obsolete=False, name='exon-exon junction complex', description=None), GeneOntologyTermData(identifier='GO:0008380', category='Biological process', obsolete=False, name='RNA splicing', description=None)}, 'interpro_terms': {InterproTermData(identifier='IPR004023', name='Mago_nashi', description=None, entry_type=None), InterproTermData(identifier='IPR036605', name='Mago_nashi_sf', description=None, entry_type=None)}, 'pfam_terms': {PfamTermData(identifier='PF02792', name='Mago_nashi', description=None)}, 'keywords': set(), 'genes': {GeneData(symbol='HCG_1773848', relation='orf'), GeneData(symbol='FLJ10292', relation='primary')}, 'accessions': {'A0A023T6R1'}, 'taxonomy': 9606, 'reviewed': False, 'version': '44', 'sequence': 'MAVASDFYLRYYVGHKGKFGHEFLEFEFRPDGKLRYAN

IntegrityError: NOT NULL constraint failed: proteindata.created

In [None]:
from src.database.utilities import create_interactions


# The created rows get their own name. Re-using `interactions` here would
# replace the set of parsed `InteractionData` records that the accession
# mapping cells read, so re-running those cells afterwards would silently
# operate on database rows instead.
#
# NOTE(review): `aggregated_interactions` is computed earlier in the notebook
# but is not used here; confirm whether it, rather than
# `mapped_interactions`, is the intended input.
interaction_rows: List[models.Interaction]
interaction_rows = create_interactions(mapped_interactions)
logger.info(f"Created {len(interaction_rows)} interaction rows")