# Database setup

The cell below creates the database and all associated tables. Existing tables will not be cleared. You should re-run this cell if there are changes to the database tables that should be applied.

In [1]:
import pickle
from itertools import chain

from src import settings
from src.database.models import MODELS
from src.clients import Kegg, UniprotClient
from src.constants import Paths


# Flags controlling whether the Kegg and UniProt clients use their download
# caches. Both are read when the clients are constructed.
cache_kegg, cache_uniprot = True, True

# Attach every model in MODELS (including foreign-key references and
# back-references) to the configured database, then create the tables.
settings.DATABASE.bind(
    MODELS,
    bind_refs=True,
    bind_backrefs=True,
)
settings.DATABASE.create_tables(MODELS)

# REST client setup

Clients for UniProt and Kegg are also created so that downloads can be cached during this session and re-used if something goes wrong.

In [2]:
# Notebook-level clients: keeping a single instance of each alive for the
# session lets their cached results be re-used if a later cell errors out.
kegg_client = Kegg(
    use_cache=cache_kegg,
)
uniprot_client = UniprotClient(
    use_cache=cache_uniprot,
)

# Annotations

The cell directly below will populate the annotation tables, which include Pfam, InterPro and Gene Ontology terms. This step is optional, but if performed it will reduce the time required to populate the protein and interaction tables.

In [None]:
from src.parsers import go, interpro, pfam
from src.database.utilities import create_terms
from src.database.models import GeneOntologyTerm, PfamTerm, InterproTerm


# Populate the Pfam, Gene Ontology and InterPro term tables inside a single
# atomic block. Each entry pairs a parser with its input file and the model
# that receives the parsed terms; sources are processed in the order listed.
with settings.DATABASE.atomic():
    for parse, path, model in (
        (pfam.parse_clans_file, Paths.pfam_clans, PfamTerm),
        (go.parse_go_obo, Paths.go_obo, GeneOntologyTerm),
        (interpro.parse_entry_list, Paths.interpro_entries, InterproTerm),
    ):
        create_terms(terms=parse(path=path), model=model)

# Proteins and Interactions

The cell below will parse and validate all interactions, including UniProt identifiers and associated metadata, and then commit these to the database. Additional metadata will be associated with interactions such as PubMed/PSIMI identifiers, labels and the source databases. Directionality information is retained such that separate rows will be added for the interactions (A, B) and (B, A) where A is the source node and B is the target node. This information is particularly useful for analysing Kegg interactions, which have directionality information in the labels.

In [3]:
from src.parsers import kegg, hprd, bioplex, pina, innate
from src.database.utilities import create_interactions


# Feed the interactions from every source parser, in the order listed, to
# create_interactions as one continuous stream. The result is bound to `_`
# so that it is not echoed as the cell's output.
_ = create_interactions(
    chain.from_iterable(
        [
            kegg.parse_interactions(client=kegg_client),
            hprd.parse_interactions(
                ptms=hprd.parse_ptm(path=Paths.hprd_ptms),
                xrefs=hprd.parse_xref_mapping(path=Paths.hprd_xref),
            ),
            pina.parse_interactions(path=Paths.pina2_mitab),
            innate.parse_interactions(Paths.innate_all),
            innate.parse_interactions(Paths.innate_curated),
            bioplex.parse_interactions(path=Paths.bioplex),
        ]
    ),
    client=uniprot_client,
)

[INFO] 2019-09-02 19:31:38,465 kegg Downloading UniProt mapping. Please stand by.
[INFO] 2019-09-02 19:31:38,466 kegg Downloading pathways. Please stand by.
100%|██████████| 335/335 [00:10<00:00, 31.36it/s]
[INFO] 2019-09-02 19:31:53,792 kegg Generating interactions.


IntegrityError: NOT NULL constraint failed: pubmedidentifier.created