In [4]:
%reload_ext autoreload

"""
This script runs classifier training over the entire training data and then
outputs predictions over the interactome.

Usage:
  predict_ppis.py [--interpro] [--pfam] [--mf] [--cc] [--bp]
                  [--use_cache] [--retrain] [--induce] [--verbose]
                  [--model=M] [--n_jobs=J] [--n_splits=S] [--n_iterations=I]
                  [--input=FILE] [--output=FILE] [--directory=DIR]
  predict_ppis.py -h | --help

Options:
  -h --help     Show this screen.
  --interpro    Use interpro domains in features.
  --pfam        Use Pfam domains in features.
  --mf          Use Molecular Function Gene Ontology in features.
  --cc          Use Cellular Compartment Gene Ontology in features.
  --bp          Use Biological Process Gene Ontology in features.
  --induce      Use ULCA inducer over Gene Ontology.
  --verbose     Print intermediate output for debugging.
  --binary      Use binary feature encoding instead of ternary.
  --use_cache   Use cached features if available.
  --retrain     Re-train classifier instead of loading previous version. If
                using a previous version, you must use the same selection of
                features along with the same induce setting.
  --model=M         A binary classifier from Scikit-Learn implementing fit,
                    predict and predict_proba [default: LogisticRegression].
                    Ignored if using 'retrain'.
  --n_jobs=J        Number of processes to run in parallel [default: 1]
  --n_splits=S      Number of cross-validation splits used during randomized
                    grid search [default: 5]
  --n_iterations=I  Number of randomized grid search iterations [default: 60]
  --input=FILE      Uniprot edge-list, with a path that is absolute or
                    relative to this script. Entries must be tab separated with
                    header columns 'source' and 'target'. [default: None]
  --output=FILE     Output file name [default: predictions.tsv]
  --directory=DIR   Absolute or relative output directory [default: ./results/]
"""
import os
import json
import logging
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from pyppi.base import parse_args, su_make_dir, chunk_list
from pyppi.base import P1, P2, G1, G2, SOURCE, TARGET
from pyppi.data import load_network_from_path, load_ptm_labels
from pyppi.data import full_training_network_path, generic_io
from pyppi.data import interactome_network_path, classifier_path

from pyppi.models import make_classifier, get_parameter_distribution_for_model

from pyppi.database import begin_transaction
from pyppi.database.models import Interaction
from pyppi.database.managers import InteractionManager, ProteinManager
from pyppi.database.managers import format_interactions_for_sklearn

from pyppi.data_mining.tools import xy_from_interaction_frame
from pyppi.data_mining.generic import edgelist_func, generic_to_dataframe
from pyppi.data_mining.tools import map_network_accessions
from pyppi.data_mining.uniprot import batch_map
from pyppi.data_mining.features import compute_interaction_features

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multiclass import OneVsRestClassifier

In [13]:
# Configure the "scripts" logger used throughout this notebook: INFO-level
# records, timestamped, written through a stream handler and kept away from
# the root logger (propagate=False).
logger = logging.getLogger("scripts")
formatter = logging.Formatter(
    '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
# logging.getLogger returns the same logger object on every call, so
# re-running this cell would otherwise stack one more handler per run and
# every record would be printed once per handler. Attach only when the
# logger has no handler yet.
if not logger.handlers:
    logger.addHandler(handler)
logger.setLevel(logging.INFO)
logger.propagate = False

# Emulate the command-line invocation of the script: parse_args is given the
# same '--option' keys that the usage string at the top documents.
args = parse_args({
    # Parallelism and randomized grid-search budget.
    '--n_jobs': 14,
    '--n_splits': 5,
    '--n_iterations': 120,
    '--verbose': True,

    # Feature sources and encoding.
    '--mf': True,
    '--bp': True,
    '--cc': True,
    '--interpro': True,
    '--pfam': True,
    '--induce': True,
    '--binary': False,

    # Model, caching and input/output locations.
    '--model': 'LogisticRegression',
    '--use_cache': True,
    '--output': 'predictions.tsv',
    '--input': './data/test.tsv',
    '--directory': './results/',
    '--retrain': True
})

# Unpack the parsed settings into notebook-level names used by later cells.
# Search / parallelism settings.
n_jobs, n_splits, rcv_iter = (
    args['n_jobs'], args['n_splits'], args['n_iterations']
)
# Feature settings.
induce, selection, use_binary = (
    args['induce'], args['selection'], args['binary']
)
use_feature_cache = args['use_cache']
# Model settings.
model, retrain = args['model'], args['retrain']
# Input / output settings.
out_file, input_file, direc = (
    args['output'], args['input'], args['directory']
)
verbose = args['verbose']


# Set up the folder for each experiment run named after the current time
# -------------------------------------------------------------------- #
folder = datetime.now().strftime("pred_%y-%m-%d_%H-%M-%S")
direc = "{}/{}/".format(direc, folder)
su_make_dir(direc)

# Record the settings of this run next to its results. The with-block
# guarantees the file is flushed and closed; passing a bare open() to
# json.dump leaves the handle open until garbage collection.
with open("{}/settings.json".format(direc), 'w') as settings_fp:
    json.dump(args, fp=settings_fp, indent=4, sort_keys=True)

# Database managers restricted to taxon 9606.
i_manager = InteractionManager(verbose=verbose, match_taxon_id=9606)
p_manager = ProteinManager(verbose=verbose, match_taxon_id=9606)

In [10]:
# Get the input edge-list ready
# -------------------------------------------------------------------- #
# Produces, for the later cells:
#   labels, training -- training labels and interactions from the database
#   testing          -- the interactions to predict over
#   protein_map      -- UniProt accession -> protein entry
#   X_test_ppis      -- (source, target) accession pairs aligned with testing
#                       (custom input only, see the note below)
if __name__ == "__main__":
    with begin_transaction() as session:
        labels = i_manager.training_labels(session, include_holdout=True)
        training = i_manager.training_interactions(session, filter_out_holdout=False)

        # Loaded for both input modes: the results cell builds its
        # accession -> gene map from protein_map, so it has to exist even
        # when no custom edge-list is processed.
        protein_map = p_manager.uniprotid_entry_map(session)

        if input_file == 'default':
            logger.info("Loading interactome data...")
            testing = i_manager.interactome_interactions(
                session=session,
                filter_out_holdout=False,
                filter_out_training=False
            )
            # NOTE(review): X_test_ppis (the accession pairs written to the
            # results file) is only built for custom input below. How to
            # recover accessions from the interactome entries is not visible
            # from this notebook -- confirm and populate it here as well.
        else:
            logger.info("Loading custom ppi data...")
            testing = generic_to_dataframe(
                f_input=generic_io(input_file),
                parsing_func=edgelist_func,
                drop_nan=True,
                allow_self_edges=True,
                allow_duplicates=True
            )
            # Map every accession appearing in the edge-list in one batch.
            sources = set(p for p in testing.source.values)
            targets = set(p for p in testing.target.values)
            accessions = list(sources | targets)
            accession_mapping = batch_map(
                session=session,
                accessions=accessions,
                keep_unreviewed=True,
                match_taxon_id=9606,
                allow_download=True
            )
            testing_network = map_network_accessions(
                interactions=testing, accession_map=accession_mapping,
                drop_nan=True, allow_self_edges=True,
                allow_duplicates=False, min_counts=None, merge=False
            )

            # Compute features for new ppis
            testing = []
            X_test_ppis = []
            feature_map = {}
            # Only pairs without an existing database entry need features.
            ppis = [
                (protein_map[a], protein_map[b])
                for (a, b) in zip(testing_network[SOURCE], testing_network[TARGET])
                if i_manager.get_by_source_target(session, a, b) is None
            ]

            logger.info("Computing features.")
            features = Parallel(n_jobs=n_jobs, backend="multiprocessing", verbose=verbose)(
                delayed(compute_interaction_features)(source, target)
                for (source, target) in ppis
            )
            # A separate loop name keeps `features` bound to the full list
            # instead of rebinding it to a single element while iterating.
            for (source, target), ppi_features in zip(ppis, features):
                feature_map[(source.uniprot_id, target.uniprot_id)] = ppi_features

            for (a, b) in zip(testing_network[SOURCE], testing_network[TARGET]):
                entry = i_manager.get_by_source_target(session, a, b)
                if entry is None:
                    logger.info("Creating new Interaction ({},{}).".format(a, b))
                    entry = Interaction(
                        source=protein_map[a], target=protein_map[b],
                        is_interactome=False,
                        is_training=False,
                        is_holdout=False,
                        label=None,
                        **feature_map[(a, b)]
                    )
                    entry.save(session, commit=True)
                testing.append(entry)
                # Keep the accession pair in the same order as `testing` so
                # prediction rows can be labelled when results are written.
                X_test_ppis.append((a, b))

2017-12-16 12:33:32,772 scripts      INFO     Loading custom ppi data...
2017-12-16 12:33:52,150 scripts      INFO     Computing features.
[Parallel(n_jobs=14)]: Done   0 out of   0 | elapsed:    0.0s finished


In [11]:
# Get the features into X, and multilabel y indicator format
# -------------------------------------------------------------------- #
logger.info("Preparing training and testing data...")
X_train, y_train = format_interactions_for_sklearn(training, selection)
X_test, _ = format_interactions_for_sklearn(testing, selection)

# Encode the label sets as a binary indicator matrix with one column per
# label, columns ordered as sorted(labels).
mlb = MultiLabelBinarizer(classes=sorted(labels))
y_train = mlb.fit_transform(y_train)

# The vocabulary is learned from the training data only and then re-used
# to encode the test data.
vectorizer = CountVectorizer(binary=bool(use_binary))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

2017-12-16 12:34:03,501 scripts      INFO     Preparing training and testing data...


In [None]:
# Make the estimators and BR classifier
# -------------------------------------------------------------------- #
# Training only runs when a retrain is requested or no classifier has been
# saved at classifier_path yet; otherwise the saved one is re-used by the
# prediction cell.
if retrain or not os.path.isfile(classifier_path):
    logger.info("Making classifier...")
    params = get_parameter_distribution_for_model(model)
    # Randomized hyper-parameter search wrapped around the base model. It is
    # passed as the estimator of the one-vs-rest wrapper below.
    random_cv = RandomizedSearchCV(
        cv=n_splits,
        n_iter=rcv_iter,
        n_jobs=n_jobs,
        refit=True,
        random_state=42,
        scoring='f1',
        error_score=0,
        param_distributions=params,
        estimator=make_classifier(model)
    )
    # The outer wrapper stays single-process; parallelism is given to the
    # inner search through n_jobs above.
    clf = OneVsRestClassifier(estimator=random_cv, n_jobs=1)

    # Fit on the complete training data and save the fitted classifier.
    # Log through the notebook's configured `logger`: the module-level
    # logging.info() targets the root logger, which does not carry this
    # notebook's handler, format or INFO level.
    logger.info("Fitting data...")
    clf.fit(X_train, y_train)
    joblib.dump(clf, classifier_path)

2017-12-16 12:40:52,316 scripts      INFO     Making classifier...
2017-12-16 12:40:52,316 scripts      INFO     Making classifier...


In [None]:
# Loads a previously (or recently trained) classifier from disk
# and then performs the predictions on the new dataset.
# -------------------------------------------------------------------- #
# Log through the notebook's configured `logger` rather than the root
# logger, so the record gets the same handler, format and INFO level as
# the rest of the run.
logger.info("Making predictions...")
clf = joblib.load(classifier_path)
# One row per test interaction, one probability column per label.
predictions = clf.predict_proba(X_test)

In [None]:
# Write the predictions to a tsv file
# -------------------------------------------------------------------- #
# Log through the notebook's configured `logger` rather than the root logger.
logger.info("Writing results to file...")

# NOTE(review): X_test_ppis must hold one (source, target) UniProt accession
# pair per row of `predictions`, in the same order; make sure it has been
# defined before this cell runs.
data_dict = {
    P1: [s for (s, _) in X_test_ppis],
    P2: [t for (_, t) in X_test_ppis],
    "sum": np.sum(predictions, axis=1)
}

# Column idx of `predictions` corresponds to the idx-th class of the
# binarizer.
for idx, label in enumerate(mlb.classes):
    data_dict[label] = predictions[:, idx]

# The gene columns are not in data_dict yet; they are created empty here
# and filled in below.
columns = [P1, P2, G1, G2] + list(sorted(mlb.classes)) + ['sum']
df = pd.DataFrame(data=data_dict, columns=columns)

accession_gene_map = {p.uniprot_id: p.gene_id for p in protein_map.values()}


def _gene_for_accession(accession):
    """Return the first gene entry mapped to `accession`, or '-' if none."""
    # NOTE(review): `[0]` assumes gene_id is a sequence of gene names. If it
    # is a plain string this keeps only its first character, and a None
    # gene_id raises TypeError -- confirm against the Protein model.
    return accession_gene_map.get(accession, ['-'])[0] or '-'


# Map column-wise: a row-wise df.apply(axis=1) returns a DataFrame instead
# of a Series when the frame is empty, which breaks the assignment.
df[G1] = df[P1].map(_gene_for_accession)
df[G2] = df[P2].map(_gene_for_accession)

# The output is documented and named as a .tsv file, so separate with tabs.
df.to_csv("{}/{}".format(direc, out_file), sep='\t', index=False)

In [None]:
# Calculate the proportion of the interactome classified at a threshold value, t.
# A sample counts as classified at t when its highest label probability is
# at least t. The (threshold, proportion) pairs are written to thresholds.csv
# in the run directory.
# Log through the notebook's configured `logger` rather than the root logger.
logger.info("Writing results to file...")

# 0.00, 0.05, ..., 1.00. linspace fixes the number of points; np.arange with
# a float step can gain or lose the last element through rounding.
thresholds = np.linspace(0.0, 1.0, 21)

n_samples = predictions.shape[0]
if n_samples > 0:
    # Highest label probability per sample, computed once for all thresholds.
    max_proba = np.max(predictions, axis=1)
    proportions = np.array([np.mean(max_proba >= t) for t in thresholds])
else:
    # No predictions: report 0 everywhere instead of dividing by zero.
    proportions = np.zeros_like(thresholds)

with open("{}/thresholds.csv".format(direc), 'wt') as fp:
    for (t, p) in zip(thresholds, proportions):
        fp.write("{},{}\n".format(t, p))