In [1]:
"""
This script runs classifier training over the entire training data and then
outputs predictions over the interactome.

Usage:
  build_data.py [--clear_cache] [--n_jobs=J]
  build_data.py -h | --help

Options:
  -h --help  Show this screen.
  --n_jobs=J  Number of processes to run in parallel [default: 1]
  --clear_cache  Delete previous bioservices KEGG/UniProt cache
"""

import os
import pandas as pd
import logging
from Bio import SwissProt
from joblib import Parallel, delayed
from docopt import docopt

from pyppi.base.utilities import delete_cache, is_null
from pyppi.base.utilities import generate_interaction_tuples
from pyppi.base.arg_parsing import parse_args
from pyppi.base.constants import SOURCE, TARGET, LABEL
from pyppi.base.constants import PUBMED, EXPERIMENT_TYPE
from pyppi.base.log import create_logger

from pyppi.base.file_paths import bioplex_network_path, pina2_network_path
from pyppi.base.file_paths import innate_i_network_path, innate_c_network_path
from pyppi.base.file_paths import interactome_network_path, full_training_network_path
from pyppi.base.file_paths import kegg_network_path, hprd_network_path
from pyppi.base.file_paths import testing_network_path, training_network_path
from pyppi.base.file_paths import default_db_path

from pyppi.base.io import save_uniprot_accession_map, save_network_to_path
from pyppi.base.io import bioplex_v4, pina2_mitab, innate_curated, innate_imported
from pyppi.base.io import uniprot_sprot, uniprot_trembl

from pyppi.database import delete_database, db_session
from pyppi.database.models import Protein, Interaction
from pyppi.database.models import Pubmed, Psimi, Reference
from pyppi.database.utilities import create_interaction, uniprotid_entry_map

from pyppi.data_mining.uniprot import parse_record_into_protein
from pyppi.data_mining.uniprot import batch_map
from pyppi.data_mining.generic import bioplex_func
from pyppi.data_mining.generic import pina_mitab_func, innate_mitab_func
from pyppi.data_mining.generic import generic_to_dataframe
from pyppi.data_mining.hprd import hprd_to_dataframe
from pyppi.data_mining.tools import process_interactions, make_interaction_frame
from pyppi.data_mining.tools import remove_common_ppis, remove_labels
from pyppi.data_mining.tools import map_network_accessions
from pyppi.data_mining.kegg import download_pathway_ids, pathways_to_dataframe
from pyppi.data_mining.ontology import get_active_instance
from pyppi.data_mining.psimi import get_active_instance as load_mi_ontology
from pyppi.data_mining.features import compute_interaction_features


logger = create_logger("scripts", logging.INFO)


# Command-line parsing (docopt + parse_args under a __main__ guard) is
# switched off in this notebook; the run options are set by hand instead.
n_jobs = 16  # stands in for args['n_jobs']
clear_cache = False  # stands in for args['clear_cache']

# Setup the protein table in the database
# ----------------------------------------------------------------------- #
if clear_cache:
    logger.info("Clearing Biopython/Bioservices cache.")
    delete_cache()

logger.info("Clearing existing database tables.")
delete_database(db_session)

logger.info("Parsing UniProt and PSI-MI into database.")
# Swiss-Prot records are parsed first, then the TrEMBL records are appended.
records = list(SwissProt.parse(uniprot_sprot()))
records.extend(SwissProt.parse(uniprot_trembl()))
proteins = list(map(parse_record_into_protein, records))

# One Psimi row for every term held by the PSI-MI ontology instance.
mi_ont = load_mi_ontology()
psimi_objects = [
    Psimi(accession=accession, description=mi_term.name)
    for accession, mi_term in mi_ont.items()
]

# Save everything in a single transaction; undo it if anything goes wrong
# and let the error propagate.
try:
    db_session.add_all(proteins + psimi_objects)
    db_session.commit()
except:
    db_session.rollback()
    raise

2018-03-06 20:47:09 scripts  INFO     Clearing existing database tables.
2018-03-06 20:47:11 scripts  INFO     Parsing UniProt and PSI-MI into database.


In [2]:
# Construct all the networks
# ----------------------------------------------------------------------- #
# The raw-file loaders are called through the module instead of through the
# bare names imported at the top of the notebook. This cell binds the name
# `pina2_mitab` to a DataFrame (later cells rely on that), which used to
# shadow the loader function of the same name and made the cell fail with
# "'DataFrame' object is not callable" whenever it was run a second time.
import pyppi.base.io as pyppi_io


def _load_generic_network(f_input, parsing_func):
    """Parse one raw interaction file with the options shared by every
    interactome source (rows lacking a source/target are dropped)."""
    return generic_to_dataframe(
        f_input=f_input,
        parsing_func=parsing_func,
        drop_nan=[SOURCE, TARGET],
        allow_self_edges=True,
        allow_duplicates=False
    )


def _remap_network(network, accession_map, drop_nan):
    """Map a network onto `accession_map` with the options shared by every
    network; only `drop_nan` differs between the labelled and unlabelled
    sources."""
    return map_network_accessions(
        interactions=network, accession_map=accession_map,
        drop_nan=drop_nan, allow_self_edges=True,
        allow_duplicates=False, min_counts=None, merge=False
    )


logger.info("Building KEGG interactions.")
kegg = pathways_to_dataframe(
    pathway_ids=None,
    map_to_uniprot=True,
    drop_nan='default',
    allow_self_edges=True,
    allow_duplicates=False,
    cache=True,
    org='hsa'
)

logger.info("Building HPRD interactions.")
hprd = hprd_to_dataframe(
    drop_nan='default',
    allow_self_edges=True,
    allow_duplicates=False
)

logger.info("Building Interactome interactions.")
bioplex = _load_generic_network(pyppi_io.bioplex_v4(), bioplex_func)
pina2_mitab = _load_generic_network(pyppi_io.pina2_mitab(), pina_mitab_func)
innate_c = _load_generic_network(
    pyppi_io.innate_curated(), innate_mitab_func)
innate_i = _load_generic_network(
    pyppi_io.innate_imported(), innate_mitab_func)

logger.info("Mapping to most recent uniprot accessions.")
# Get a set of all the unique uniprot accessions. Columns are addressed via
# the SOURCE/TARGET constants, as everywhere else in this notebook.
networks = [kegg, hprd, bioplex, pina2_mitab, innate_i, innate_c]
sources = set(p for df in networks for p in df[SOURCE].values)
targets = set(p for df in networks for p in df[TARGET].values)
accessions = list(sources | targets)
accession_mapping = batch_map(
    session=db_session,
    allow_download=False,
    accessions=accessions,
    keep_unreviewed=True,
    match_taxon_id=9606,
    cache=True,
    verbose=True
)
save_uniprot_accession_map(accession_mapping)

logger.info("Mapping each network to the most recent uniprot accessions.")
kegg = _remap_network(kegg, accession_mapping, drop_nan='default')
hprd = _remap_network(hprd, accession_mapping, drop_nan='default')
pina2_mitab = _remap_network(
    pina2_mitab, accession_mapping, drop_nan=[SOURCE, TARGET])
bioplex = _remap_network(
    bioplex, accession_mapping, drop_nan=[SOURCE, TARGET])
innate_c = _remap_network(
    innate_c, accession_mapping, drop_nan=[SOURCE, TARGET])
innate_i = _remap_network(
    innate_i, accession_mapping, drop_nan=[SOURCE, TARGET])
networks = [hprd, kegg, bioplex, pina2_mitab, innate_i, innate_c]

logger.info("Saving raw networks.")
save_network_to_path(kegg, kegg_network_path)
save_network_to_path(hprd, hprd_network_path)
save_network_to_path(pina2_mitab, pina2_network_path)
save_network_to_path(bioplex, bioplex_network_path)
save_network_to_path(innate_i, innate_i_network_path)
save_network_to_path(innate_c, innate_c_network_path)

2018-03-06 20:51:49 scripts  INFO     Building KEGG interactions.
2018-03-06 20:58:54 scripts  INFO     Building HPRD interactions.


       source  target              label  \
0        None    None    Phosphorylation   
1        None  P12814    Phosphorylation   
2        None    None    Phosphorylation   
3        None  Q05397    Phosphorylation   
4      P12814  Q05397    Phosphorylation   
5        None  Q05397    Phosphorylation   
6        None    None    Phosphorylation   
7        None  P12814    Phosphorylation   
8        None    None    Phosphorylation   
9        None  P18031  Dephosphorylation   
10     P12814  P18031  Dephosphorylation   
11       None  P18031  Dephosphorylation   
12       None    None  Dephosphorylation   
13       None  P12814  Dephosphorylation   
14       None    None  Dephosphorylation   
15       None  P29590        Sumoylation   
16       None    None        Sumoylation   
17       None    None        Sumoylation   
18     P29590  P63165        Sumoylation   
19       None  P63165        Sumoylation   
20       None  P63165        Sumoylation   
21       None  P29590        Sum

[23581 rows x 5 columns]


2018-03-06 20:59:33 scripts  INFO     Building Interactome interactions.
2018-03-06 21:00:32 scripts  INFO     Mapping to most recent uniprot accessions.
2018-03-06 21:02:17 scripts  INFO     Mapping each network to the most recent uniprot accessions.
2018-03-06 21:02:54 scripts  INFO     Saving raw networks.


In [3]:
logger.info("Building and saving processed networks.")
# HPRD rows carrying these two labels become the testing set; every other
# HPRD label goes to training together with KEGG.
hprd_test_labels = ['Dephosphorylation', 'Phosphorylation']
hprd_train_labels = {
    label for label in hprd[LABEL] if label not in hprd_test_labels
}
train_hprd = remove_labels(hprd, hprd_test_labels)
testing = remove_labels(hprd, hprd_train_labels)
training = pd.concat([kegg, train_hprd], ignore_index=True)
training = training.reset_index(drop=True, inplace=False)

# A ppi can occur in both sets under different labels. Such shared ppis are
# split off into `common`, leaving `training` and `testing` disjoint.
training, testing, common = remove_common_ppis(
    df_1=training,
    df_2=testing
)
full_training = pd.concat([training, testing, common], ignore_index=True)
full_training = full_training.reset_index(drop=True, inplace=False)

# Options shared by every labelled network; only `min_counts` varies.
_labelled_options = dict(
    drop_nan='default',
    allow_duplicates=False,
    allow_self_edges=True,
    exclude_labels=None,
    merge=True,
)
testing = process_interactions(
    interactions=testing, min_counts=5, **_labelled_options
)
training = process_interactions(
    interactions=training, min_counts=5, **_labelled_options
)
full_training = process_interactions(
    interactions=full_training, min_counts=5, **_labelled_options
)
common = process_interactions(
    interactions=common, min_counts=None, **_labelled_options
)

# The interactome is the union of the four unlabelled source networks.
interactome_networks = [bioplex, pina2_mitab, innate_i, innate_c]
interactome = process_interactions(
    interactions=pd.concat(interactome_networks, ignore_index=True),
    drop_nan=[SOURCE, TARGET],
    allow_duplicates=False,
    allow_self_edges=True,
    exclude_labels=None,
    min_counts=None,
    merge=True,
)

for network, path in [
    (interactome, interactome_network_path),
    (training, training_network_path),
    (testing, testing_network_path),
    (full_training, full_training_network_path),
]:
    save_network_to_path(network, path)

2018-03-06 21:02:55 scripts  INFO     Building and saving processed networks.


In [4]:
logger.info("Saving Interaction records to database.")
protein_map = uniprotid_entry_map()
ppis = [
    (protein_map[a], protein_map[b])
    for network in [full_training, interactome]
    for (a, b) in zip(network[SOURCE], network[TARGET])
]

feature_map = {}
logger.info("Computing features.")
# Feature computation is currently switched off: every pair gets an empty
# feature dict. The parallel computation it replaces is kept for reference.
features_ls = [{} for _ in ppis]
# features_ls = Parallel(n_jobs=n_jobs, backend='multiprocessing')(
#     delayed(compute_interaction_features)(source, target)
#     for (source, target) in ppis
# )
for (source, target), features in zip(ppis, features_ls):
    # The cells that consume feature_map always look a pair up in sorted
    # order, so store it under the sorted key; otherwise a pair listed as
    # (B, A) in a network would raise KeyError on lookup.
    feature_key = tuple(sorted([source.uniprot_id, target.uniprot_id]))
    feature_map[feature_key] = features

# Create and save all the psimi and pubmed objects if they don't already
# exist in the database.
logger.info("Updating Pubmed/PSI-MI database entries.")
mi_ont = load_mi_ontology()
networks = [full_training, interactome]
combined_networks = pd.concat(networks, ignore_index=True)
pmids = set([
    p.upper()
    for ls in combined_networks[PUBMED]
    for p in str(ls).split(',') if not is_null(p)
])
# Each comma-separated item may itself hold several '|'-separated
# accessions. Flatten them into one set so that an accession occurring in
# several groups yields a single new row. The existence check below only
# sees rows already in the database, so without this the same accession
# could be queued more than once.
psimi_groups = set([
    p.upper()
    for ls in combined_networks[EXPERIMENT_TYPE]
    for p in str(ls).split(',') if not is_null(p)
])
psimis = set([
    p
    for psimi_group in psimi_groups
    for p in psimi_group.split('|') if not is_null(p)
])

# One query per table instead of one COUNT query per accession.
known_pmids = {row.accession for row in Pubmed.query.all()}
known_psimis = {row.accession for row in Psimi.query.all()}

# Sorted so that rows are inserted in a reproducible order across runs.
objects = [Pubmed(accession=pmid) for pmid in sorted(pmids - known_pmids)]
for p in sorted(psimis - known_psimis):
    # Raises KeyError if the accession is missing from the ontology.
    objects.append(Psimi(accession=p, description=mi_ont[p].name))

try:
    db_session.add_all(objects)
    db_session.commit()
except:
    db_session.rollback()
    raise

2018-03-06 21:03:31 scripts  INFO     Saving Interaction records to database.
2018-03-06 21:03:34 scripts  INFO     Computing features.
2018-03-06 21:03:35 scripts  INFO     Updating Pubmed/PSI-MI database entries.


In [5]:
logger.info("Creating Interaction database entries.")
# Maps a sorted (uniprot_a, uniprot_b) pair to an
# (Interaction, pmids, psimis) tuple, which is the shape the later cells
# index into (entry[0], entry[1], entry[2]).
interactions = {}
for interaction in Interaction.query.all():
    a = Protein.query.get(interaction.source)
    b = Protein.query.get(interaction.target)
    # The accession attribute on Protein is `uniprot_id` (it is what
    # feature_map is keyed on); `uniprot_a`/`uniprot_b` raised
    # AttributeError as soon as the table held a row.
    uniprot_a, uniprot_b = sorted([a.uniprot_id, b.uniprot_id])
    # Rows already in the database start with no pending references.
    interactions[(uniprot_a, uniprot_b)] = (interaction, [], [])

# Training should only update the is_training to true and leave other
# boolean fields alone.
logger.info("Creating training interaction entries.")
generator = generate_interaction_tuples(training)
for (uniprot_a, uniprot_b, label, pmids, psimis) in generator:
    uniprot_a, uniprot_b = sorted([uniprot_a, uniprot_b])
    entry = interactions.get((uniprot_a, uniprot_b), None)
    if entry is None:
        source = protein_map[uniprot_a]
        target = protein_map[uniprot_b]
        # Copy so the flag set below does not leak into feature_map.
        class_kwargs = dict(feature_map[(uniprot_a, uniprot_b)])
        class_kwargs["is_training"] = True
        entry = create_interaction(
            source, target, label, session=db_session, save=False,
            commit=False, verbose=False, **class_kwargs
        )
        interactions[(uniprot_a, uniprot_b)] = (entry, pmids, psimis)
    else:
        # Pair already known: flag it and merge label/references instead of
        # replacing the existing entry, mirroring the holdout, common and
        # interactome cells.
        entry[0].is_training = True
        entry[0].add_label(label)
        # list(...) so the merge works whether either side is a list or a
        # tuple -- assumes both are sequences; TODO confirm against
        # generate_interaction_tuples.
        pmids = list(entry[1]) + list(pmids)
        psimis = list(entry[2]) + list(psimis)
        interactions[(uniprot_a, uniprot_b)] = (entry[0], pmids, psimis)

2018-03-06 21:05:01 scripts  INFO     Creating Interaction database entries.
2018-03-06 21:05:01 scripts  INFO     Creating training interaction entries.


In [6]:
# Testing should only update the is_holdout to true and leave other
# boolean fields alone.
logger.info("Creating holdout interaction entries.")
generator = generate_interaction_tuples(testing)
for (uniprot_a, uniprot_b, label, pmids, psimis) in generator:
    # `interactions` and `feature_map` are both looked up by the sorted pair.
    uniprot_a, uniprot_b = sorted([uniprot_a, uniprot_b])
    entry = interactions.get((uniprot_a, uniprot_b), None)
    if entry is None:
        source = protein_map[uniprot_a]
        target = protein_map[uniprot_b]
        # Copy the stored features: writing the flag into the dict held by
        # feature_map would leave it set for any later run that reuses
        # feature_map.
        class_kwargs = dict(feature_map[(uniprot_a, uniprot_b)])
        class_kwargs["is_holdout"] = True
        entry = create_interaction(
            source, target, label, session=db_session, save=False,
            commit=False, verbose=False, **class_kwargs
        )
        interactions[(uniprot_a, uniprot_b)] = (entry, pmids, psimis)
    else:
        # Pair already has an entry: flag it, add this label and extend its
        # reference lists.
        entry[0].is_holdout = True
        entry[0].add_label(label)
        # list(...) so the merge works whether either side is a list or a
        # tuple -- assumes both are sequences; TODO confirm.
        pmids = list(entry[1]) + list(pmids)
        psimis = list(entry[2]) + list(psimis)
        interactions[(uniprot_a, uniprot_b)] = (entry[0], pmids, psimis)

2018-03-06 21:05:32 scripts  INFO     Creating holdout interaction entries.


In [7]:
# Common are in both kegg and hprd so should only update the is_training
# and is_holdout to true and leave other boolean fields alone.
logger.info("Creating training/holdout interaction entries.")
generator = generate_interaction_tuples(common)
for (uniprot_a, uniprot_b, label, pmids, psimis) in generator:
    # `interactions` and `feature_map` are both looked up by the sorted pair.
    uniprot_a, uniprot_b = sorted([uniprot_a, uniprot_b])
    entry = interactions.get((uniprot_a, uniprot_b), None)
    if entry is None:
        source = protein_map[uniprot_a]
        target = protein_map[uniprot_b]
        # Copy the stored features: writing the flags into the dict held by
        # feature_map would leave them set for any later run that reuses
        # feature_map.
        class_kwargs = dict(feature_map[(uniprot_a, uniprot_b)])
        class_kwargs["is_holdout"] = True
        class_kwargs["is_training"] = True
        entry = create_interaction(
            source, target, label, session=db_session, save=False,
            commit=False, verbose=False, **class_kwargs
        )
        interactions[(uniprot_a, uniprot_b)] = (entry, pmids, psimis)
    else:
        # Pair already has an entry: set both flags, add this label and
        # extend its reference lists.
        entry[0].is_training = True
        entry[0].is_holdout = True
        entry[0].add_label(label)
        # list(...) so the merge works whether either side is a list or a
        # tuple -- assumes both are sequences; TODO confirm.
        pmids = list(entry[1]) + list(pmids)
        psimis = list(entry[2]) + list(psimis)
        interactions[(uniprot_a, uniprot_b)] = (entry[0], pmids, psimis)

2018-03-06 21:05:35 scripts  INFO     Creating training/holdout interaction entries.


In [8]:
# Interactome should only update the is_interactome to true and leave other
# boolean fields alone.
logger.info("Creating interactome interaction entries.")
generator = generate_interaction_tuples(interactome)
for (uniprot_a, uniprot_b, label, pmids, psimis) in generator:
    # `interactions` and `feature_map` are both looked up by the sorted pair.
    uniprot_a, uniprot_b = sorted([uniprot_a, uniprot_b])
    entry = interactions.get((uniprot_a, uniprot_b), None)
    if entry is None:
        source = protein_map[uniprot_a]
        target = protein_map[uniprot_b]
        # Copy the stored features: writing the flag into the dict held by
        # feature_map would leave it set for any later run that reuses
        # feature_map.
        class_kwargs = dict(feature_map[(uniprot_a, uniprot_b)])
        class_kwargs["is_interactome"] = True
        entry = create_interaction(
            source, target, label, session=db_session, save=False,
            commit=False, verbose=False, **class_kwargs
        )
        interactions[(uniprot_a, uniprot_b)] = (entry, pmids, psimis)
    else:
        # Pair already has an entry: flag it and extend its reference lists.
        # Unlike the other branches, no label is added here.
        entry[0].is_interactome = True
        # list(...) so the merge works whether either side is a list or a
        # tuple -- assumes both are sequences; TODO confirm.
        pmids = list(entry[1]) + list(pmids)
        psimis = list(entry[2]) + list(psimis)
        interactions[(uniprot_a, uniprot_b)] = (entry[0], pmids, psimis)

# Batch commit might be quicker than calling save on each interaction.
logger.info("Commiting interactions to database.")
try:
    entries = [tup[0] for tup in interactions.values()]
    db_session.add_all(entries)
    db_session.commit()
except:
    db_session.rollback()
    raise

2018-03-06 21:05:35 scripts  INFO     Creating interactome interaction entries.
2018-03-06 21:10:15 scripts  INFO     Commiting interactions to database.


In [10]:
logger.info("Linking Pubmed/Psimi references.")
pubmed_map = {p.accession: p for p in Pubmed.query.all()}
psimi_map = {p.accession: p for p in Psimi.query.all()}
references = []
for entry, pmid_ls, psimi_ls in interactions.values():
    # (pmid, psimi) pairs already linked to this entry, so that repeated
    # publications or experiment types produce a single Reference each.
    linked = set()
    for pmid, psimis in zip(pmid_ls, psimi_ls):
        if pmid is None:
            continue
        # A publication with no experiment types (None or empty) still gets
        # one reference, with no Psimi attached.
        if psimis is None or len(psimis) == 0:
            psimis = [None]
        for psimi in psimis:
            if (pmid, psimi) in linked:
                continue
            linked.add((pmid, psimi))
            psimi_entry = None if psimi is None else psimi_map[psimi]
            # Collect every reference as it is built. Appending once after
            # the inner loop kept only the last experiment type per
            # publication, and re-used a stale `ref` when the loop body
            # never ran.
            references.append(
                Reference(entry, pubmed_map[pmid], psimi_entry)
            )

try:
    db_session.add_all(references)
    db_session.commit()
    db_session.close()
except:
    db_session.rollback()
    raise

2018-03-06 21:24:50 scripts  INFO     Linking Pubmed/Psimi references.


In [16]:
# Spot check: show the experiment types attached to the interaction fetched
# with Interaction.query.get(56).
spot_check_interaction = Interaction.query.get(56)
spot_check_interaction.experiment_types().all()

[<Psimi(id=454, accession=MI:0493, desc=in vivo)>]

In [25]:
# Import only the helper this cell uses; a wildcard import could silently
# rebind names defined earlier in the notebook.
from pyppi.database.utilities import training_interactions

# Count the rows returned by training_interactions() whose is_holdout flag
# is False.
training_interactions().filter_by(is_holdout=False).count()

26445