In [11]:
%reload_ext autoreload

"""
This script runs classifier training over the entire training data and then
outputs predictions over the interactome.

Usage:
  predict_ppis.py [--interpro] [--pfam] [--mf] [--cc] [--bp]
                  [--use_cache] [--retrain] [--induce] [--verbose]
                  [--model=M] [--n_jobs=J] [--n_splits=S] [--n_iterations=I]
                  [--input=FILE] [--output=FILE] [--directory=DIR]
  predict_ppis.py -h | --help

Options:
  -h --help     Show this screen.
  --interpro    Use interpro domains in features.
  --pfam        Use Pfam domains in features.
  --mf          Use Molecular Function Gene Ontology in features.
  --cc          Use Cellular Compartment Gene Ontology in features.
  --bp          Use Biological Process Gene Ontology in features.
  --induce      Use ULCA inducer over Gene Ontology.
  --verbose     Print intermediate output for debugging.
  --binary      Use binary feature encoding instead of ternary.
  --use_cache   Use cached features if available.
  --retrain     Re-train classifier instead of loading previous version. If
                using a previous version, you must use the same selection of
                features along with the same induce setting.
  --model=M         A binary classifier from Scikit-Learn implementing fit,
                    predict and predict_proba [default: LogisticRegression].
                    Only used when a classifier is trained (see --retrain).
  --n_jobs=J        Number of processes to run in parallel [default: 1]
  --n_splits=S      Number of cross-validation splits used during randomized
                    grid search [default: 5]
  --n_iterations=I  Number of randomized grid search iterations [default: 60]
  --input=FILE      Uniprot edge-list, with a path that is either absolute or
                    relative to this script. Entries must be tab separated with
                    header columns 'source' and 'target'. [default: 'default']
  --output=FILE     Output file name [default: predictions.tsv]
  --directory=DIR   Absolute or relative output directory [default: ./results/]
"""

import os
import json
import logging
import numpy as np
from datetime import datetime

from pyppi.base import parse_args, su_make_dir
from pyppi.base import P1, P2, G1, G2
from pyppi.data import load_network_from_path, load_ptm_labels
from pyppi.data import full_training_network_path, generic_io
from pyppi.data import interactome_network_path, classifier_path

from pyppi.models import make_classifier, get_parameter_distribution_for_model

from pyppi.data_mining.features import AnnotationExtractor
from pyppi.data_mining.uniprot import UniProt, get_active_instance
from pyppi.data_mining.tools import xy_from_interaction_frame
from pyppi.data_mining.generic import edgelist_func, generic_to_dataframe
from pyppi.data_mining.tools import map_network_accessions

from sklearn.externals import joblib
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multiclass import OneVsRestClassifier

# Logging configuration for this session.
# The date format uses a 24-hour clock (%H): the previous 12-hour %I without
# an AM/PM marker produced ambiguous timestamps. This also matches the %H
# used when naming the per-run output folder.
LOG_FORMAT = '[%(asctime)s] %(levelname)s: %(message)s'
LOG_DATE_FORMAT = '%m-%d-%Y %H:%M:%S'

# Do not redirect messages from the `warnings` module into logging.
logging.captureWarnings(False)
logging.basicConfig(
    format=LOG_FORMAT,
    datefmt=LOG_DATE_FORMAT,
    level=logging.DEBUG,
)
logger = logging.getLogger(__name__)

# Run settings. In the command-line script these come from the docopt usage
# string; here they are fixed by hand for interactive use.
args = {
    'n_jobs': 3,
    'n_splits': 2,
    'rcv_iter': 10,
    'induce': True,
    'verbose': True,
    # Annotation sources to use as features, looked up by member name on the
    # UniProt data-type enumeration.
    'selection': [
        getattr(UniProt.data_types(), member).value
        for member in ('GO_MF', 'GO_BP', 'GO_CC', 'INTERPRO', 'PFAM')
    ],
    'model': 'LogisticRegression',
    'use_cache': True,
    'output': 'predictions.tsv',
    'input': 'test_network.tsv',
    'direc': './results/',
    'retrain': True,
    'use_binary': False,
}

# Unpack the settings into the module-level names used by the later cells.
n_jobs, n_splits, rcv_iter = args['n_jobs'], args['n_splits'], args['rcv_iter']
induce, verbose = args['induce'], args['verbose']
selection, model = args['selection'], args['model']
use_feature_cache = args['use_cache']
out_file, input_file, direc = args['output'], args['input'], args['direc']
retrain, use_binary = args['retrain'], args['use_binary']

In [2]:
# Set up the folder for each experiment run named after the current time
# -------------------------------------------------------------------- #
# The base directory and output name are read from `args` rather than from
# the `direc` / `out_file` variables this cell rebinds, so re-running the cell
# does not nest run folders or format an open file object into a path.
folder = datetime.now().strftime("pred_%y-%m-%d_%H-%M-%S")
direc = "{}/{}/".format(args['direc'], folder)
su_make_dir(direc)

# Record the settings used for this run; the context manager guarantees the
# settings file is flushed and closed.
with open("{}/settings.json".format(direc), 'w') as settings_fp:
    json.dump(args, fp=settings_fp, indent=4, sort_keys=True)

# Kept open on purpose: the final cell writes the predictions to this handle
# and closes it.
out_file = open("{}/{}".format(direc, args['output']), "w")

In [3]:
# Load features from feature cache, or create an empty annotation extractor
# ------------------------------------------------------------------------ #
logger.info("Loading feature data...")
uniprot = get_active_instance(verbose=verbose)
data_types = UniProt.data_types()
labels = load_ptm_labels()

# The extractor is configured entirely from the run settings.
extractor_settings = {
    'induce': induce,
    'selection': selection,
    'n_jobs': n_jobs,
    'verbose': verbose,
    'cache': use_feature_cache,
}
annotation_ex = AnnotationExtractor(**extractor_settings)

[11-17-2017 05:58:39] INFO: Loading feature data...


First time loading on UniProt instance. Make take a few moments


In [9]:
# Get the input edge-list ready
# -------------------------------------------------------------------- #
# A custom edge-list is parsed and its accessions are mapped through the
# UniProt instance; the 'default' sentinel loads the packaged interactome.
if input_file != 'default':
    logger.info("Loading custom ppi data...")
    custom_input = generic_io(input_file)
    testing = generic_to_dataframe(
        f_input=custom_input,
        parsing_func=edgelist_func,
        drop_nan=True,
        allow_self_edges=True,
        allow_duplicates=True
    )

    # Every distinct accession appearing at either end of an edge.
    sources = set(testing.source.values)
    targets = set(testing.target.values)
    accessions = list(sources.union(targets))

    accession_mapping = uniprot.batch_map(accessions)
    testing = map_network_accessions(
        interactions=testing,
        accession_map=accession_mapping,
        drop_nan=True,
        allow_self_edges=True,
        allow_duplicates=False,
        min_counts=None,
        merge=False
    )
else:
    logger.info("Loading interactome data...")
    testing = load_network_from_path(interactome_network_path)

In [15]:
# Get the features into X, and multilabel y indicator format
# -------------------------------------------------------------------- #
logger.info("Preparing training and testing data...")
training = load_network_from_path(full_training_network_path)
X_train_ppis, y_train = xy_from_interaction_frame(training)
X_test_ppis, _ = xy_from_interaction_frame(testing)

# Without the cache the extractor is always fitted. With the cache, a direct
# transform is attempted first and the extractor is only fitted when that
# transform raises ValueError.
refit_features = not use_feature_cache
if refit_features:
    logger.info("Computing feature cache.")
else:
    try:
        X_train = annotation_ex.transform(X_train_ppis)
        X_test = annotation_ex.transform(X_test_ppis)
    except ValueError:
        logger.info("Found new PPIs. Re-computing feature cache.")
        refit_features = True

if refit_features:
    annotation_ex.fit(X_train_ppis + X_test_ppis)
    X_train = annotation_ex.transform(X_train_ppis)
    X_test = annotation_ex.transform(X_test_ppis)

# Get all annotations used during training
# -------------------------------------------------------------------- #
# Each entry of X_train is split on ',' into individual annotation terms,
# which are grouped by a case-insensitive substring test on the term.
# The loop variable is `gs`; the previous code split an unrelated name `x`,
# which either raised NameError or silently used a stale global.
training_go = {
    g.strip() for gs in X_train
    for g in gs.split(',')
    if 'go' in g.strip().lower()
}
training_pfam = {
    g.strip() for gs in X_train
    for g in gs.split(',')
    if 'pf' in g.strip().lower()
}
training_ipr = {
    g.strip() for gs in X_train
    for g in gs.split(',')
    if 'ipr' in g.strip().lower()
}

# Encode the label lists as a multilabel indicator matrix with a fixed,
# sorted class order.
mlb = MultiLabelBinarizer(classes=sorted(labels))
mlb.fit(y_train)
y_train = mlb.transform(y_train)

# Vectorize the annotation strings. The vocabulary is learned from the
# training data only; the test data is then projected onto that vocabulary so
# the classifier receives the same feature space at prediction time.
# Previously X_test was never vectorized.
vectorizer = CountVectorizer(binary=bool(use_binary))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

Acquiring features for PPIs...
Acquiring data for accession 1/4029..
Acquiring data for accession 2/4029..
Acquiring data for accession 3/4029..
Acquiring data for accession 4/4029..
Acquiring data for accession 5/4029..
Acquiring data for accession 6/4029..
Acquiring data for accession 7/4029..
Acquiring data for accession 8/4029..
Acquiring data for accession 9/4029..
Acquiring data for accession 10/4029..
Acquiring data for accession 11/4029..
Acquiring data for accession 12/4029..
Acquiring data for accession 13/4029..
Acquiring data for accession 14/4029..
Acquiring data for accession 15/4029..
Acquiring data for accession 16/4029..
Acquiring data for accession 17/4029..
Acquiring data for accession 18/4029..
Acquiring data for accession 19/4029..
Acquiring data for accession 20/4029..
Acquiring data for accession 21/4029..
Acquiring data for accession 22/4029..
Acquiring data for accession 23/4029..
Acquiring data for accession 24/4029..
Acquiring data for accession 25/4029..
Acq

Acquiring data for accession 207/4029..
Acquiring data for accession 208/4029..
Acquiring data for accession 209/4029..
Acquiring data for accession 210/4029..
Acquiring data for accession 211/4029..
Acquiring data for accession 212/4029..
Acquiring data for accession 213/4029..
Acquiring data for accession 214/4029..
Acquiring data for accession 215/4029..
Acquiring data for accession 216/4029..
Acquiring data for accession 217/4029..
Acquiring data for accession 218/4029..
Acquiring data for accession 219/4029..
Acquiring data for accession 220/4029..
Acquiring data for accession 221/4029..
Acquiring data for accession 222/4029..
Acquiring data for accession 223/4029..
Acquiring data for accession 224/4029..
Acquiring data for accession 225/4029..
Acquiring data for accession 226/4029..
Acquiring data for accession 227/4029..
Acquiring data for accession 228/4029..
Acquiring data for accession 229/4029..
Acquiring data for accession 230/4029..
Acquiring data for accession 231/4029..


Acquiring data for accession 411/4029..
Acquiring data for accession 412/4029..
Acquiring data for accession 413/4029..
Acquiring data for accession 414/4029..
Acquiring data for accession 415/4029..
Acquiring data for accession 416/4029..
Acquiring data for accession 417/4029..
Acquiring data for accession 418/4029..
Acquiring data for accession 419/4029..
Acquiring data for accession 420/4029..
Acquiring data for accession 421/4029..
Acquiring data for accession 422/4029..
Acquiring data for accession 423/4029..
Acquiring data for accession 424/4029..
Acquiring data for accession 425/4029..
Acquiring data for accession 426/4029..
Acquiring data for accession 427/4029..
Acquiring data for accession 428/4029..
Acquiring data for accession 429/4029..
Acquiring data for accession 430/4029..
Acquiring data for accession 431/4029..
Acquiring data for accession 432/4029..
Acquiring data for accession 433/4029..
Acquiring data for accession 434/4029..
Acquiring data for accession 435/4029..


Acquiring data for accession 615/4029..
Acquiring data for accession 616/4029..
Acquiring data for accession 617/4029..
Acquiring data for accession 618/4029..
Acquiring data for accession 619/4029..
Acquiring data for accession 620/4029..
Acquiring data for accession 621/4029..
Acquiring data for accession 622/4029..
Acquiring data for accession 623/4029..
Acquiring data for accession 624/4029..
Acquiring data for accession 625/4029..
Acquiring data for accession 626/4029..
Acquiring data for accession 627/4029..
Acquiring data for accession 628/4029..
Acquiring data for accession 629/4029..
Acquiring data for accession 630/4029..
Acquiring data for accession 631/4029..
Acquiring data for accession 632/4029..
Acquiring data for accession 633/4029..
Acquiring data for accession 634/4029..
Acquiring data for accession 635/4029..
Acquiring data for accession 636/4029..
Acquiring data for accession 637/4029..
Acquiring data for accession 638/4029..
Acquiring data for accession 639/4029..


Acquiring data for accession 819/4029..
Acquiring data for accession 820/4029..
Acquiring data for accession 821/4029..
Acquiring data for accession 822/4029..
Acquiring data for accession 823/4029..
Acquiring data for accession 824/4029..
Acquiring data for accession 825/4029..
Acquiring data for accession 826/4029..
Acquiring data for accession 827/4029..
Acquiring data for accession 828/4029..
Acquiring data for accession 829/4029..
Acquiring data for accession 830/4029..
Acquiring data for accession 831/4029..
Acquiring data for accession 832/4029..
Acquiring data for accession 833/4029..
Acquiring data for accession 834/4029..
Acquiring data for accession 835/4029..
Acquiring data for accession 836/4029..
Acquiring data for accession 837/4029..
Acquiring data for accession 838/4029..
Acquiring data for accession 839/4029..
Acquiring data for accession 840/4029..
Acquiring data for accession 841/4029..
Acquiring data for accession 842/4029..
Acquiring data for accession 843/4029..


Acquiring data for accession 1023/4029..
Acquiring data for accession 1024/4029..
Acquiring data for accession 1025/4029..
Acquiring data for accession 1026/4029..
Acquiring data for accession 1027/4029..
Acquiring data for accession 1028/4029..
Acquiring data for accession 1029/4029..
Acquiring data for accession 1030/4029..
Acquiring data for accession 1031/4029..
Acquiring data for accession 1032/4029..
Acquiring data for accession 1033/4029..
Acquiring data for accession 1034/4029..
Acquiring data for accession 1035/4029..
Acquiring data for accession 1036/4029..
Acquiring data for accession 1037/4029..
Acquiring data for accession 1038/4029..
Acquiring data for accession 1039/4029..
Acquiring data for accession 1040/4029..
Acquiring data for accession 1041/4029..
Acquiring data for accession 1042/4029..
Acquiring data for accession 1043/4029..
Acquiring data for accession 1044/4029..
Acquiring data for accession 1045/4029..
Acquiring data for accession 1046/4029..
Acquiring data f

Acquiring data for accession 1222/4029..
Acquiring data for accession 1223/4029..
Acquiring data for accession 1224/4029..
Acquiring data for accession 1225/4029..
Acquiring data for accession 1226/4029..
Acquiring data for accession 1227/4029..
Acquiring data for accession 1228/4029..
Acquiring data for accession 1229/4029..
Acquiring data for accession 1230/4029..
Acquiring data for accession 1231/4029..
Acquiring data for accession 1232/4029..
Acquiring data for accession 1233/4029..
Acquiring data for accession 1234/4029..
Acquiring data for accession 1235/4029..
Acquiring data for accession 1236/4029..
Acquiring data for accession 1237/4029..
Acquiring data for accession 1238/4029..
Acquiring data for accession 1239/4029..
Acquiring data for accession 1240/4029..
Acquiring data for accession 1241/4029..
Acquiring data for accession 1242/4029..
Acquiring data for accession 1243/4029..
Acquiring data for accession 1244/4029..
Acquiring data for accession 1245/4029..
Acquiring data f

Acquiring data for accession 1421/4029..
Acquiring data for accession 1422/4029..
Acquiring data for accession 1423/4029..
Acquiring data for accession 1424/4029..
Acquiring data for accession 1425/4029..
Acquiring data for accession 1426/4029..
Acquiring data for accession 1427/4029..
Acquiring data for accession 1428/4029..
Acquiring data for accession 1429/4029..
Acquiring data for accession 1430/4029..
Acquiring data for accession 1431/4029..
Acquiring data for accession 1432/4029..
Acquiring data for accession 1433/4029..
Acquiring data for accession 1434/4029..
Acquiring data for accession 1435/4029..
Acquiring data for accession 1436/4029..
Acquiring data for accession 1437/4029..
Acquiring data for accession 1438/4029..
Acquiring data for accession 1439/4029..
Acquiring data for accession 1440/4029..
Acquiring data for accession 1441/4029..
Acquiring data for accession 1442/4029..
Acquiring data for accession 1443/4029..
Acquiring data for accession 1444/4029..
Acquiring data f

Acquiring data for accession 1620/4029..
Acquiring data for accession 1621/4029..
Acquiring data for accession 1622/4029..
Acquiring data for accession 1623/4029..
Acquiring data for accession 1624/4029..
Acquiring data for accession 1625/4029..
Acquiring data for accession 1626/4029..
Acquiring data for accession 1627/4029..
Acquiring data for accession 1628/4029..
Acquiring data for accession 1629/4029..
Acquiring data for accession 1630/4029..
Acquiring data for accession 1631/4029..
Acquiring data for accession 1632/4029..
Acquiring data for accession 1633/4029..
Acquiring data for accession 1634/4029..
Acquiring data for accession 1635/4029..
Acquiring data for accession 1636/4029..
Acquiring data for accession 1637/4029..
Acquiring data for accession 1638/4029..
Acquiring data for accession 1639/4029..
Acquiring data for accession 1640/4029..
Acquiring data for accession 1641/4029..
Acquiring data for accession 1642/4029..
Acquiring data for accession 1643/4029..
Acquiring data f

Acquiring data for accession 1819/4029..
Acquiring data for accession 1820/4029..
Acquiring data for accession 1821/4029..
Acquiring data for accession 1822/4029..
Acquiring data for accession 1823/4029..
Acquiring data for accession 1824/4029..
Acquiring data for accession 1825/4029..
Acquiring data for accession 1826/4029..
Acquiring data for accession 1827/4029..
Acquiring data for accession 1828/4029..
Acquiring data for accession 1829/4029..
Acquiring data for accession 1830/4029..
Acquiring data for accession 1831/4029..
Acquiring data for accession 1832/4029..
Acquiring data for accession 1833/4029..
Acquiring data for accession 1834/4029..
Acquiring data for accession 1835/4029..
Acquiring data for accession 1836/4029..
Acquiring data for accession 1837/4029..
Acquiring data for accession 1838/4029..
Acquiring data for accession 1839/4029..
Acquiring data for accession 1840/4029..
Acquiring data for accession 1841/4029..
Acquiring data for accession 1842/4029..
Acquiring data f

Acquiring data for accession 2018/4029..
Acquiring data for accession 2019/4029..
Acquiring data for accession 2020/4029..
Acquiring data for accession 2021/4029..
Acquiring data for accession 2022/4029..
Acquiring data for accession 2023/4029..
Acquiring data for accession 2024/4029..
Acquiring data for accession 2025/4029..
Acquiring data for accession 2026/4029..
Acquiring data for accession 2027/4029..
Acquiring data for accession 2028/4029..
Acquiring data for accession 2029/4029..
Acquiring data for accession 2030/4029..
Acquiring data for accession 2031/4029..
Acquiring data for accession 2032/4029..
Acquiring data for accession 2033/4029..
Acquiring data for accession 2034/4029..
Acquiring data for accession 2035/4029..
Acquiring data for accession 2036/4029..
Acquiring data for accession 2037/4029..
Acquiring data for accession 2038/4029..
Acquiring data for accession 2039/4029..
Acquiring data for accession 2040/4029..
Acquiring data for accession 2041/4029..
Acquiring data f

Acquiring data for accession 2217/4029..
Acquiring data for accession 2218/4029..
Acquiring data for accession 2219/4029..
Acquiring data for accession 2220/4029..
Acquiring data for accession 2221/4029..
Acquiring data for accession 2222/4029..
Acquiring data for accession 2223/4029..
Acquiring data for accession 2224/4029..
Acquiring data for accession 2225/4029..
Acquiring data for accession 2226/4029..
Acquiring data for accession 2227/4029..
Acquiring data for accession 2228/4029..
Acquiring data for accession 2229/4029..
Acquiring data for accession 2230/4029..
Acquiring data for accession 2231/4029..
Acquiring data for accession 2232/4029..
Acquiring data for accession 2233/4029..
Acquiring data for accession 2234/4029..
Acquiring data for accession 2235/4029..
Acquiring data for accession 2236/4029..
Acquiring data for accession 2237/4029..
Acquiring data for accession 2238/4029..
Acquiring data for accession 2239/4029..
Acquiring data for accession 2240/4029..
Acquiring data f

Acquiring data for accession 2416/4029..
Acquiring data for accession 2417/4029..
Acquiring data for accession 2418/4029..
Acquiring data for accession 2419/4029..
Acquiring data for accession 2420/4029..
Acquiring data for accession 2421/4029..
Acquiring data for accession 2422/4029..
Acquiring data for accession 2423/4029..
Acquiring data for accession 2424/4029..
Acquiring data for accession 2425/4029..
Acquiring data for accession 2426/4029..
Acquiring data for accession 2427/4029..
Acquiring data for accession 2428/4029..
Acquiring data for accession 2429/4029..
Acquiring data for accession 2430/4029..
Acquiring data for accession 2431/4029..
Acquiring data for accession 2432/4029..
Acquiring data for accession 2433/4029..
Acquiring data for accession 2434/4029..
Acquiring data for accession 2435/4029..
Acquiring data for accession 2436/4029..
Acquiring data for accession 2437/4029..
Acquiring data for accession 2438/4029..
Acquiring data for accession 2439/4029..
Acquiring data f

Acquiring data for accession 2615/4029..
Acquiring data for accession 2616/4029..
Acquiring data for accession 2617/4029..
Acquiring data for accession 2618/4029..
Acquiring data for accession 2619/4029..
Acquiring data for accession 2620/4029..
Acquiring data for accession 2621/4029..
Acquiring data for accession 2622/4029..
Acquiring data for accession 2623/4029..
Acquiring data for accession 2624/4029..
Acquiring data for accession 2625/4029..
Acquiring data for accession 2626/4029..
Acquiring data for accession 2627/4029..
Acquiring data for accession 2628/4029..
Acquiring data for accession 2629/4029..
Acquiring data for accession 2630/4029..
Acquiring data for accession 2631/4029..
Acquiring data for accession 2632/4029..
Acquiring data for accession 2633/4029..
Acquiring data for accession 2634/4029..
Acquiring data for accession 2635/4029..
Acquiring data for accession 2636/4029..
Acquiring data for accession 2637/4029..
Acquiring data for accession 2638/4029..
Acquiring data f

Acquiring data for accession 2814/4029..
Acquiring data for accession 2815/4029..
Acquiring data for accession 2816/4029..
Acquiring data for accession 2817/4029..
Acquiring data for accession 2818/4029..
Acquiring data for accession 2819/4029..
Acquiring data for accession 2820/4029..
Acquiring data for accession 2821/4029..
Acquiring data for accession 2822/4029..
Acquiring data for accession 2823/4029..
Acquiring data for accession 2824/4029..
Acquiring data for accession 2825/4029..
Acquiring data for accession 2826/4029..
Acquiring data for accession 2827/4029..
Acquiring data for accession 2828/4029..
Acquiring data for accession 2829/4029..
Acquiring data for accession 2830/4029..
Acquiring data for accession 2831/4029..
Acquiring data for accession 2832/4029..
Acquiring data for accession 2833/4029..
Acquiring data for accession 2834/4029..
Acquiring data for accession 2835/4029..
Acquiring data for accession 2836/4029..
Acquiring data for accession 2837/4029..
Acquiring data f

Acquiring data for accession 3013/4029..
Acquiring data for accession 3014/4029..
Acquiring data for accession 3015/4029..
Acquiring data for accession 3016/4029..
Acquiring data for accession 3017/4029..
Acquiring data for accession 3018/4029..
Acquiring data for accession 3019/4029..
Acquiring data for accession 3020/4029..
Acquiring data for accession 3021/4029..
Acquiring data for accession 3022/4029..
Acquiring data for accession 3023/4029..
Acquiring data for accession 3024/4029..
Acquiring data for accession 3025/4029..
Acquiring data for accession 3026/4029..
Acquiring data for accession 3027/4029..
Acquiring data for accession 3028/4029..
Acquiring data for accession 3029/4029..
Acquiring data for accession 3030/4029..
Acquiring data for accession 3031/4029..
Acquiring data for accession 3032/4029..
Acquiring data for accession 3033/4029..
Acquiring data for accession 3034/4029..
Acquiring data for accession 3035/4029..
Acquiring data for accession 3036/4029..
Acquiring data f

Acquiring data for accession 3212/4029..
Acquiring data for accession 3213/4029..
Acquiring data for accession 3214/4029..
Acquiring data for accession 3215/4029..
Acquiring data for accession 3216/4029..
Acquiring data for accession 3217/4029..
Acquiring data for accession 3218/4029..
Acquiring data for accession 3219/4029..
Acquiring data for accession 3220/4029..
Acquiring data for accession 3221/4029..
Acquiring data for accession 3222/4029..
Acquiring data for accession 3223/4029..
Acquiring data for accession 3224/4029..
Acquiring data for accession 3225/4029..
Acquiring data for accession 3226/4029..
Acquiring data for accession 3227/4029..
Acquiring data for accession 3228/4029..
Acquiring data for accession 3229/4029..
Acquiring data for accession 3230/4029..
Acquiring data for accession 3231/4029..
Acquiring data for accession 3232/4029..
Acquiring data for accession 3233/4029..
Acquiring data for accession 3234/4029..
Acquiring data for accession 3235/4029..
Acquiring data f

Acquiring data for accession 3411/4029..
Acquiring data for accession 3412/4029..
Acquiring data for accession 3413/4029..
Acquiring data for accession 3414/4029..
Acquiring data for accession 3415/4029..
Acquiring data for accession 3416/4029..
Acquiring data for accession 3417/4029..
Acquiring data for accession 3418/4029..
Acquiring data for accession 3419/4029..
Acquiring data for accession 3420/4029..
Acquiring data for accession 3421/4029..
Acquiring data for accession 3422/4029..
Acquiring data for accession 3423/4029..
Acquiring data for accession 3424/4029..
Acquiring data for accession 3425/4029..
Acquiring data for accession 3426/4029..
Acquiring data for accession 3427/4029..
Acquiring data for accession 3428/4029..
Acquiring data for accession 3429/4029..
Acquiring data for accession 3430/4029..
Acquiring data for accession 3431/4029..
Acquiring data for accession 3432/4029..
Acquiring data for accession 3433/4029..
Acquiring data for accession 3434/4029..
Acquiring data f

Acquiring data for accession 3610/4029..
Acquiring data for accession 3611/4029..
Acquiring data for accession 3612/4029..
Acquiring data for accession 3613/4029..
Acquiring data for accession 3614/4029..
Acquiring data for accession 3615/4029..
Acquiring data for accession 3616/4029..
Acquiring data for accession 3617/4029..
Acquiring data for accession 3618/4029..
Acquiring data for accession 3619/4029..
Acquiring data for accession 3620/4029..
Acquiring data for accession 3621/4029..
Acquiring data for accession 3622/4029..
Acquiring data for accession 3623/4029..
Acquiring data for accession 3624/4029..
Acquiring data for accession 3625/4029..
Acquiring data for accession 3626/4029..
Acquiring data for accession 3627/4029..
Acquiring data for accession 3628/4029..
Acquiring data for accession 3629/4029..
Acquiring data for accession 3630/4029..
Acquiring data for accession 3631/4029..
Acquiring data for accession 3632/4029..
Acquiring data for accession 3633/4029..
Acquiring data f

Acquiring data for accession 3809/4029..
Acquiring data for accession 3810/4029..
Acquiring data for accession 3811/4029..
Acquiring data for accession 3812/4029..
Acquiring data for accession 3813/4029..
Acquiring data for accession 3814/4029..
Acquiring data for accession 3815/4029..
Acquiring data for accession 3816/4029..
Acquiring data for accession 3817/4029..
Acquiring data for accession 3818/4029..
Acquiring data for accession 3819/4029..
Acquiring data for accession 3820/4029..
Acquiring data for accession 3821/4029..
Acquiring data for accession 3822/4029..
Acquiring data for accession 3823/4029..
Acquiring data for accession 3824/4029..
Acquiring data for accession 3825/4029..
Acquiring data for accession 3826/4029..
Acquiring data for accession 3827/4029..
Acquiring data for accession 3828/4029..
Acquiring data for accession 3829/4029..
Acquiring data for accession 3830/4029..
Acquiring data for accession 3831/4029..
Acquiring data for accession 3832/4029..
Acquiring data f

Acquiring data for accession 4008/4029..
Acquiring data for accession 4009/4029..
Acquiring data for accession 4010/4029..
Acquiring data for accession 4011/4029..
Acquiring data for accession 4012/4029..
Acquiring data for accession 4013/4029..
Acquiring data for accession 4014/4029..
Acquiring data for accession 4015/4029..
Acquiring data for accession 4016/4029..
Acquiring data for accession 4017/4029..
Acquiring data for accession 4018/4029..
Acquiring data for accession 4019/4029..
Acquiring data for accession 4020/4029..
Acquiring data for accession 4021/4029..
Acquiring data for accession 4022/4029..
Acquiring data for accession 4023/4029..
Acquiring data for accession 4024/4029..
Acquiring data for accession 4025/4029..
Acquiring data for accession 4026/4029..
Acquiring data for accession 4027/4029..
Acquiring data for accession 4028/4029..
Acquiring data for accession 4029/4029..


[Parallel(n_jobs=1)]: Done 4029 out of 4029 | elapsed:   16.7s finished


Computing selected features for PPIs...


[Parallel(n_jobs=1)]: Done   0 out of   0 | elapsed:    0.0s finished


Updating instance attributes...


[Parallel(n_jobs=1)]: Done   0 out of   0 | elapsed:    0.0s finished


KeyError: 'accession'

In [21]:
# Debug inspection: display the extractor's private `_ppi_df` attribute as the
# cell output. NOTE(review): leftover debugging that reaches into a private
# attribute; presumably added to investigate the KeyError raised by the
# feature-preparation cell -- consider removing once that is resolved.
annotation_ex._ppi_df

Index([], dtype='object')

In [None]:
# Make the estimators and BR classifier
# -------------------------------------------------------------------- #
# Training happens when explicitly requested, or when no previously saved
# classifier exists at `classifier_path`.
if retrain or not os.path.isfile(classifier_path):
    logger.info("Making classifier...")
    params = get_parameter_distribution_for_model(model)

    # One randomized hyper-parameter search is run per label: the search
    # object is the base estimator wrapped by the one-vs-rest classifier.
    random_cv = RandomizedSearchCV(
        cv=n_splits,
        n_iter=rcv_iter,
        n_jobs=n_jobs,
        refit=True,
        random_state=0,
        scoring='f1',
        error_score=0,
        param_distributions=params,
        estimator=make_classifier(model)
    )
    # Parallelism is given to the inner search (n_jobs above), so the outer
    # wrapper stays single-process.
    clf = OneVsRestClassifier(estimator=random_cv, n_jobs=1)

    # Fit the complete training data and persist the result for the
    # prediction step. Uses the module logger, consistent with earlier cells.
    logger.info("Fitting data...")
    clf.fit(X_train, y_train)
    joblib.dump(clf, classifier_path)

In [None]:
# Loads a previously (or recently trained) classifier from disk
# and then performs the predictions on the new dataset.
# -------------------------------------------------------------------- #
logger.info("Making predictions...")
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load classifier files produced by this project.
clf = joblib.load(classifier_path)
# One row of per-class probabilities for each entry of X_test.
predictions = clf.predict_proba(X_test)

In [None]:
# Write the predictions to a tsv file
# -------------------------------------------------------------------- #
def _annotations_with_tag(annotation_rows, tag):
    """Return the set of stripped annotation terms that contain `tag`.

    Each row is a comma-separated string of annotation terms. Matching is a
    case-insensitive substring test, as in the training-set computation.
    """
    return {
        term.strip()
        for row in annotation_rows
        for term in row.split(',')
        if tag in term.strip().lower()
    }


def _overlap_ratio(observed, reference):
    """Return |observed & reference| / |observed | reference|.

    Returns 0.0 when both sets are empty, to avoid dividing by zero.
    """
    union = observed | reference
    if not union:
        return 0.0
    return len(observed & reference) / float(len(union))


logger.info("Writing results to file...")

# The header names every column written per row, including the three
# usability columns that were previously missing from it.
header = "{p1}\t{p2}\t{g1}\t{g2}\t{classes}\tsum" \
         "\tusability_go\tusability_pf\tusability_ipr\n".format(
             p1=P1, p2=P2, g1=G1, g2=G2,
             classes='\t'.join(sorted(mlb.classes_)))

acc = annotation_ex.accession_vocabulary[UniProt.accession_column()]
genes = annotation_ex.accession_vocabulary[UniProt.data_types().GENE.value]
accession_gene_map = {a: g for (a, g) in zip(acc, genes)}

# Annotation strings for all test PPIs, computed once with the same call form
# used when the features were built, instead of one call per row.
# NOTE(review): assumes transform yields one comma-separated string per PPI,
# in input order -- confirm against AnnotationExtractor.
test_annotations = annotation_ex.transform(X_test_ppis)

try:
    out_file.write(header)
    for (s, t), p_vec, ppi_annotation in zip(
            X_test_ppis, predictions, test_annotations):
        # Order the probabilities by class name to line up with the header.
        p_vec = [p for _, p in sorted(zip(mlb.classes_, p_vec))]
        g1 = accession_gene_map.get(s, ['-'])[0] or '-'
        g2 = accession_gene_map.get(t, ['-'])[0] or '-'

        # Compute the usability of each of the annotation sets: the overlap
        # between this PPI's annotations and those seen during training.
        annots = [ppi_annotation]
        go = _annotations_with_tag(annots, 'go')
        pf = _annotations_with_tag(annots, 'pf')
        ipr = _annotations_with_tag(annots, 'ipr')
        usability_go = _overlap_ratio(go, training_go)
        usability_pf = _overlap_ratio(pf, training_pfam)
        usability_ipr = _overlap_ratio(ipr, training_ipr)

        sum_pr = sum(p_vec)
        line = "{s}\t{t}\t{g1}\t{g2}\t{classes}\t{sum_pr}\t{usability_go}" \
               "\t{usability_pf}\t{usability_ipr}\n".format(
                   s=s, t=t, g1=g1, g2=g2, sum_pr=sum_pr,
                   classes='\t'.join(['%.4f' % p for p in p_vec]),
                   usability_go=usability_go,
                   usability_pf=usability_pf,
                   usability_ipr=usability_ipr)
        out_file.write(line)
finally:
    # Always release the handle opened during run-folder setup, even when a
    # row fails to format or write.
    out_file.close()