In [63]:
%reload_ext autoreload

"""
This script runs the bootstrap kfold validation experiments as used in
the publication.

Usage:
  validation.py [--interpro] [--pfam] [--mf] [--cc] [--bp]
             [--use_cache] [--induce] [--verbose]
             [--model=M] [--n_jobs=J] [--n_splits=S] [--n_iterations=I]
             [--directory=DIR]
  validation.py -h | --help

Options:
  -h --help     Show this screen.
  --interpro    Use interpro domains in features.
  --pfam        Use Pfam domains in features.
  --mf          Use Molecular Function Gene Ontology in features.
  --cc          Use Cellular Component Gene Ontology in features.
  --bp          Use Biological Process Gene Ontology in features.
  --induce      Use ULCA inducer over Gene Ontology.
  --verbose     Print intermediate output for debugging.
  --use_cache   Use cached features if available.
  --model=M         A binary classifier from Scikit-Learn implementing fit,
                    predict and predict_proba [default: LogisticRegression]
  --n_jobs=J        Number of processes to run in parallel [default: 1]
  --n_splits=S      Number of cross-validation splits [default: 5]
  --n_iterations=I  Number of bootstrap iterations [default: 5]
  --directory=DIR   Output directory [default: ./results/]
"""

import json
import logging
import pandas as pd
import numpy as np
from operator import itemgetter
from collections import Counter
from datetime import datetime

from pyppi.base import parse_args, su_make_dir
from pyppi.data import load_network_from_path, load_ptm_labels
from pyppi.data import testing_network_path, training_network_path

from pyppi.models.binary_relevance import BinaryRelevance, get_coefs
from pyppi.models import make_classifier
from pyppi.model_selection.scoring import MultilabelScorer, Statistics
from pyppi.model_selection.experiment import KFoldExperiment, Bootstrap
from pyppi.model_selection.sampling import IterativeStratifiedKFold

from pyppi.data_mining.features import AnnotationExtractor
from pyppi.data_mining.uniprot import UniProt, get_active_instance
from pyppi.data_mining.tools import xy_from_interaction_frame

from sklearn.base import clone
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import (
    recall_score, make_scorer, 
    label_ranking_average_precision_score,
    label_ranking_loss
)

from sklearn.datasets import make_multilabel_classification

# Configure root logging once for the whole notebook: timestamped records,
# DEBUG level and above.
logging.basicConfig(
    format='[%(asctime)s] %(levelname)s: %(message)s', 
    datefmt='%m-%d-%Y %I:%M:%S',
    level=logging.DEBUG,
)
logger = logging.getLogger(__name__)

# Experiment settings. These mirror the command-line options described in the
# usage text at the top of the notebook, but are hard-coded here so the
# notebook can be run without a command line.
args = {
    'n_jobs': 1,
    'n_splits': 5,
    'n_iterations': 5,
    'induce': True,
    'verbose': True,
    'selection': [
        UniProt.data_types().GO_MF.value,
        UniProt.data_types().GO_BP.value,
        UniProt.data_types().GO_CC.value,
        UniProt.data_types().INTERPRO.value,
        UniProt.data_types().PFAM.value
    ],
    'model': 'LogisticRegression',
    'use_cache': True,
    'directory': './results/'
}

# Unpack the settings into the module-level names used by the later cells.
n_jobs = args['n_jobs']
n_splits = args['n_splits']
n_iter = args['n_iterations']
induce = args['induce']
verbose = args['verbose']
selection = args['selection']
model = args['model']
use_feature_cache = args['use_cache']
direc = args['directory']
backend = 'multiprocessing'

# Set up the folder for each experiment run named after the current time
folder = datetime.now().strftime("val_%y-%m-%d_%H-%M")
direc = "{}/{}/".format(direc, folder)
su_make_dir(direc)

# Persist the exact settings next to the results. The context manager
# guarantees the file is flushed and closed even if serialisation fails
# (the previous bare `open(...)` leaked the handle).
with open("{}/settings.json".format(direc), 'w') as settings_fp:
    json.dump(args, fp=settings_fp, indent=4, sort_keys=True)

In [None]:
# Load the label set, the feature extractor and the two interaction networks.
logging.info("Loading training and testing data.")

# NOTE(review): `uniprot` is not referenced again in the visible cells; the
# call is kept because it may initialise shared state — confirm before removing.
uniprot = get_active_instance(
    verbose=verbose,
    sprot_cache=None,
    trembl_cache=None
)
data_types = UniProt.data_types()
labels = load_ptm_labels()

# Use the `backend` chosen in the configuration cell instead of repeating the
# literal, so the parallel backend is changed in exactly one place.
annotation_ex = AnnotationExtractor(
    induce=induce,
    selection=selection,
    n_jobs=n_jobs,
    verbose=verbose,
    cache=use_feature_cache,
    backend=backend
)
training = load_network_from_path(training_network_path)
testing = load_network_from_path(testing_network_path)

In [20]:
# Turn both interaction frames into a feature matrix X and a multi-label
# indicator matrix y.
logging.info("Preparing training and testing data.")

# Each frame is unpacked into the interaction inputs and their label sets.
X_train_ppis, y_train = xy_from_interaction_frame(training)
X_test_ppis, y_test = xy_from_interaction_frame(testing)

# Featurise the interactions with the annotation extractor.
X_train = annotation_ex.transform(X_train_ppis)
X_test = annotation_ex.transform(X_test_ppis)

# Binarise the label sets against the fixed `labels` ordering; the binariser
# is fitted on the training labels only and then reused for the test labels.
mlb = MultiLabelBinarizer(classes=labels)
y_train = mlb.fit(y_train).transform(y_train)
y_test = mlb.transform(y_test)

[08-13-2017 04:47:55] INFO Preparing training and testing data.


NameError: name 'training' is not defined

In [62]:
def specificity(y_true, y_pred):
    """
    Compute the specificity (true-negative rate) of binary predictions.

    Specificity is TN / (TN + FP), where a negative is encoded as 0 and a
    positive as 1.

    :param y_true: iterable of ground-truth binary labels (0 or 1).
    :param y_pred: iterable of predicted binary labels (0 or 1).
    :return: float in [0, 1], or NaN when `y_true` contains no negatives
        (the rate is undefined in that case).
    """
    y_true = np.asarray(list(y_true))
    y_pred = np.asarray(list(y_pred))

    negatives = (y_true == 0)
    # A true negative must be negative in BOTH arrays. Counting every
    # negative in `y_true` (as before) folds the false positives into the
    # numerator and over-estimates the score.
    tn = np.sum(negatives & (y_pred == 0))
    fp = np.sum(negatives & (y_pred == 1))

    if (tn + fp) == 0:
        # `np.nan` rather than `np.NaN`: the latter alias was removed in NumPy 2.0.
        return np.nan
    return tn / (tn + fp)

def top_n_features(n, clf, absolute=False, vectorizer=None):
    """
    Return the top N features. If clf is a pipeline, then it assumes
    the first step is the vectoriser holding the feature names.

    :param n: int, number of highest-weighted features to keep.
    :param clf: fitted estimator (or pipeline) understood by `get_coefs`.
    :param absolute: bool, rank by absolute weight instead of signed weight.
    :param vectorizer: optional fitted vectoriser used to translate feature
        indices into feature names. When omitted and `clf` has `steps`, the
        first pipeline step is used.
    :return: array like, shape (n_estimators, n).
        Each element in a list is a tuple (feature, weight), where feature
        is a name when a vectoriser is available and an index otherwise.
    """
    # Imported locally because the notebook's import cell does not provide it.
    from sklearn.utils.validation import check_is_fitted

    # NOTE(review): 'fitted_' is presumably a marker attribute set by the
    # project's own estimators; plain scikit-learn estimators do not expose
    # it — confirm against the objects actually passed in.
    check_is_fitted(clf, 'fitted_')

    # Assumes `get_coefs` yields a numeric weight vector, or one such vector
    # per estimator — TODO confirm. A 1-D result is treated as one estimator.
    coef_matrix = np.atleast_2d(np.asarray(get_coefs(clf)))
    if absolute:
        coef_matrix = np.abs(coef_matrix)

    if vectorizer is None and hasattr(clf, 'steps'):
        vectorizer = clf.steps[0][-1]

    feature_names = None
    if vectorizer is not None:
        # `get_feature_names` was removed from recent scikit-learn releases;
        # prefer its replacement and fall back for older installations.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = np.asarray(vectorizer.get_feature_names_out())
        else:
            feature_names = np.asarray(vectorizer.get_feature_names())

    top_features = []
    for coef in coef_matrix:
        # Highest weight first, truncated to the n best entries.
        idx_coefs = sorted(
            enumerate(coef), key=itemgetter(1), reverse=True
        )[:n]
        if feature_names is not None:
            idx = [i for (i, _) in idx_coefs]
            ws = [w for (_, w) in idx_coefs]
            top_features.append(list(zip(feature_names[idx], ws)))
        else:
            top_features.append(idx_coefs)

    return top_features

In [40]:
logging.info("Setting up preliminaries and the statistics arrays")

# NOTE(review): hard-coded label count; it sizes the statistics array below,
# so it should equal the number of labels iterated over in the experiment
# loop (the commented expression is the data-driven equivalent) — confirm.
n_classes = 18 #len(mlb.classes_)
seeds = range(n_iter)
top_features = {}

# Search space sampled by the randomised hyper-parameter search.
param_distribution = {
    'C': np.arange(0.01, 20.01, step=0.01),
    'penalty': ['l1', 'l2']
}


def _specificity_score(y_true, y_pred, **kwargs):
    """
    Specificity expressed as the recall of the negative class (label 0).

    Extra keyword arguments (e.g. `average='binary'`) are forwarded so the
    function is call-compatible with the other binary scorers.
    """
    return recall_score(y_true, y_pred, pos_label=0, **kwargs)


# (display name, callable) pairs; every callable is invoked as
# func(y_true, y_pred, average='binary') by the experiment loop.
binary_scoring_funcs = [
    ('Binary F1', f1_score) , 
    ('Precision', precision_score), 
    ('Recall', recall_score),
    # Previously mapped to plain `recall_score`, which silently duplicated
    # the 'Recall' column instead of measuring the true-negative rate.
    ('Specificity', _specificity_score)
]
multilabel_scores_funcs = [
    ('Label Ranking Loss', label_ranking_loss), 
    ('Label Ranking Average Precision', label_ranking_average_precision_score), 
    ('Macro (weighted) F1', f1_score), 
    ('Macro (un-weighted) F1', f1_score)
]
n_scorers = len(binary_scoring_funcs)
# Fixed: the list is named `multilabel_scores_funcs`; `mlb_scores_funcs`
# does not exist and raised a NameError.
n_ml_scorers = len(multilabel_scores_funcs)

# 2: position 0 is for validation, position 1 is for testing
binary_statistics = np.zeros((n_iter, n_splits, n_classes, 2, n_scorers))
multilabel_statistics = np.zeros((n_iter, n_splits, 2, n_ml_scorers))

[08-13-2017 05:06:16] INFO Setting up preliminaries and the statistics arrays


In [56]:
# Synthetic stand-in data for smoke-testing the experiment loop.
#
# Both splits are cut from ONE seeded call. Each call to
# `make_multilabel_classification` draws its own class and feature
# distributions, so generating train and test with separate calls produced a
# hold-out set unrelated to the training data, and without a seed the run
# was not reproducible.
n_synth_train, n_synth_test = 1000, 200
X_synth, y_synth = make_multilabel_classification(
    n_classes=5, n_features=1000, n_labels=5,
    n_samples=n_synth_train + n_synth_test,
    allow_unlabeled=False, random_state=0)
X_train, y_train = X_synth[:n_synth_train], y_synth[:n_synth_train]
X_test, y_test = X_synth[n_synth_train:], y_synth[n_synth_train:]
labels = range(5)

# Bootstrap x k-fold x per-label experiment loop.
#
# For every bootstrap iteration a freshly seeded stratified k-fold split is
# drawn; within each fold one binary classifier per label is tuned on the
# training part and scored on both the validation part and the held-out test
# set. Per-label predictions are then stacked column-wise to score the fold
# as a multi-label problem.
for bs_iter in range(n_iter):
    logging.info("Fitting bootstrap iteration {}.".format(bs_iter + 1))
    cv = IterativeStratifiedKFold(n_splits=n_splits, random_state=seeds[bs_iter])
    
    for fold_iter, (train_idx, validation_idx) in enumerate(cv.split(X_train, y_train)):
        logging.info("Fitting fold iteration {}.".format(fold_iter + 1))
        # One (n_samples, 1) column per label, stacked after the label loop.
        y_valid_f_pred = []
        y_test_f_pred = []
        y_valid_f_proba = []
        y_test_f_proba = []

        for label_idx, label in enumerate(labels):
            logging.info("Fitting label {}.".format(label))
            
            # Prepare all training and testing data
            logging.info("Preparing data.")
            # Always bound, so the feature-ranking step below works whether
            # or not the text-vectorising branch is enabled.
            vectorizer = None
            if False:
                # Disabled: path for textual features that need vectorising.
                vectorizer = CountVectorizer(binary=False)
                vectorizer.fit(X_train, y_train)
                
                X_train_l = vectorizer.transform(X_train[train_idx])
                y_train_l = y_train[train_idx, label_idx]
                
                X_valid_l = vectorizer.transform(X_train[validation_idx])
                y_valid_l = y_train[validation_idx, label_idx]

                X_test_l = vectorizer.transform(X_test)
                y_test_l = y_test[:, label_idx]
            
            else:
                X_train_l = X_train[train_idx]
                y_train_l = y_train[train_idx, label_idx]
                
                X_valid_l = X_train[validation_idx]
                y_valid_l = y_train[validation_idx, label_idx]

                X_test_l = X_test
                y_test_l = y_test[:, label_idx]
            
            # Build and fit classifier
            logging.info("Fitting classifier.")
            clf = RandomizedSearchCV(
                estimator=make_classifier(algorithm=model, random_state=0),
                scoring='f1', cv=3, n_iter=60, n_jobs=n_jobs, 
                random_state=0, param_distributions=param_distribution,
            )
            clf.fit(X_train_l, y_train_l)
            
            # Validation scores in binary and probability format
            y_valid_l_pred = clf.predict(X_valid_l)
            y_valid_l_proba = clf.predict_proba(X_valid_l)
            
            # Held-out testing scores in binary and probability format
            y_test_l_pred = clf.predict(X_test_l)
            y_test_l_proba = clf.predict_proba(X_test_l)
            
            # Store these per label results as single columns which we will
            # later stack into a multi-label array. Column 1 of predict_proba
            # is taken as the positive-class probability, which assumes both
            # classes were present in the training fold.
            y_valid_f_pred.append(np.asarray(y_valid_l_pred).reshape(-1, 1))
            y_valid_f_proba.append(np.asarray(y_valid_l_proba)[:, 1].reshape(-1, 1))
            
            y_test_f_pred.append(np.asarray(y_test_l_pred).reshape(-1, 1))
            y_test_f_proba.append(np.asarray(y_test_l_proba)[:, 1].reshape(-1, 1))
            
            # Perform scoring on the validation set and the external testing set.
            logging.info("Computing fold label binary performance.")
            for func_idx, (_, func) in enumerate(binary_scoring_funcs):
                scores_v = func(y_valid_l, y_valid_l_pred, average='binary')
                scores_t = func(y_test_l, y_test_l_pred, average='binary')
                binary_statistics[bs_iter, fold_iter, label_idx, 0, func_idx] = scores_v
                binary_statistics[bs_iter, fold_iter, label_idx, 1, func_idx] = scores_t
                
            logging.info("Computing top label features for fold.")
            # Get the top 20 features for this labels's run.
            # `top_n_features` documents its result as one list of
            # (feature, weight) tuples PER ESTIMATOR, so flatten one level
            # before dropping the weights.
            # NOTE(review): the refitted best estimator is passed rather than
            # the search wrapper, on the assumption that the coefficients
            # live on the underlying model — confirm against `get_coefs`.
            per_estimator_top = top_n_features(
                clf=clf.best_estimator_, n=20, absolute=True, vectorizer=vectorizer
            )
            top_20 = [
                f for estimator_top in per_estimator_top for f, _ in estimator_top
            ]
            # `top_features` starts out empty, so create the per-label list
            # on first use instead of raising KeyError.
            top_features.setdefault(label, []).append(top_20)
        
        logging.info("Computing fold mult-label performance.")
        # True scores in multi-label indicator format
        y_valid_f = y_train[validation_idx]
        y_test_f = y_test
        
        # Validation scores in multi-label indicator format
        y_valid_f_pred = np.hstack(y_valid_f_pred)
        y_valid_f_proba = np.hstack(y_valid_f_proba)
        
        # Testing scores in multi-label probability format
        y_test_f_pred = np.hstack(y_test_f_pred)
        y_test_f_proba = np.hstack(y_test_f_proba)
        
        for func_idx, (func_name, func) in enumerate(multilabel_scores_funcs):
            if func_name == 'Macro (weighted) F1':
                scores_v = func(y_valid_f, y_valid_f_pred, average='weighted')
                scores_t = func(y_test_f, y_test_f_pred, average='weighted')
            elif func_name == 'Macro (un-weighted) F1':
                scores_v = func(y_valid_f, y_valid_f_pred, average='macro')
                scores_t = func(y_test_f, y_test_f_pred, average='macro')
            elif func_name in ('Label Ranking Loss',
                               'Label Ranking Average Precision'):
                # Both ranking metrics take continuous scores; feeding hard
                # 0/1 predictions to the ranking loss collapses the ranking
                # into ties.
                scores_v = func(y_valid_f, y_valid_f_proba)
                scores_t = func(y_test_f, y_test_f_proba)
            else:
                scores_v = func(y_valid_f, y_valid_f_pred)
                scores_t = func(y_test_f, y_test_f_pred)
                
            multilabel_statistics[bs_iter, fold_iter, 0, func_idx] = scores_v
            multilabel_statistics[bs_iter, fold_iter, 1, func_idx] = scores_t

            
logging.info("Writing statistics to file.")
# make a multi-index dataframe for binary stats
# The level order mirrors the axis order of `binary_statistics`, so a
# C-order ravel lines up with the product index row for row.
func_names = [n for n, _ in binary_scoring_funcs]
iterables = [range(n_iter), range(n_splits), range(n_classes), ["validation", "holdout"], func_names]
names=['bootstrap iteration', 'fold iteration', 'labels', 'condition', 'score function']
index = pd.MultiIndex.from_product(iterables, names=names)
binary_df = pd.DataFrame(binary_statistics.ravel(), index=index)

# Same treatment for the multi-label stats, which have no per-label axis.
ml_func_names = [n for n, _ in multilabel_scores_funcs]
ml_index = pd.MultiIndex.from_product(
    [range(n_iter), range(n_splits), ["validation", "holdout"], ml_func_names],
    names=['bootstrap iteration', 'fold iteration', 'condition', 'score function']
)
multilabel_df = pd.DataFrame(multilabel_statistics.ravel(), index=ml_index)

# Actually persist the tables into this run's output directory; previously
# the frame was built and then discarded despite the log message above.
binary_df.to_csv(
    '{}/{}'.format(direc, 'binary_statistics.csv'), sep=',', header=['score'])
multilabel_df.to_csv(
    '{}/{}'.format(direc, 'multilabel_statistics.csv'), sep=',', header=['score'])



            
logging.info("Computing top label features overall.")
# For each label, count how often every feature appeared among the per-fit
# top lists and keep the 20 most frequent ones.
top_feature_columns = {}
for label, feature_ls in top_features.items():
    # Each entry of `feature_ls` is itself a list (one per fit); lists are
    # unhashable, so flatten one level before counting.
    flat_features = []
    for entry in feature_ls:
        if isinstance(entry, list):
            flat_features.extend(entry)
        else:
            flat_features.append(entry)
    counts = Counter(flat_features)
    counts = sorted(counts.items(), key=itemgetter(1), reverse=True)
    top_20 = [f for f, _ in counts][:20]
    # Wrapped in a Series so labels with fewer than 20 distinct features
    # are padded with NaN instead of raising a length-mismatch error.
    top_feature_columns[label] = pd.Series(top_20)

# Columns come from the labels actually scored, rather than from
# `mlb.classes` (the fitted attribute is `classes_`, and the binariser may
# not describe the labels used in this run).
top_features_df = pd.DataFrame(top_feature_columns)

logging.info("Writing top features to file.")
# Write into the run's output directory `direc`; `folder` is only the bare
# time-stamped name and was never created as a directory on its own.
top_features_df.to_csv('{}/{}'.format(direc, 'top_features'), sep=',', index=False)


[08-13-2017 05:17:24] INFO Fitting bootstrap iteration 1.
[08-13-2017 05:17:24] INFO Fitting fold iteration 1.
[08-13-2017 05:17:24] INFO Fitting label 0.
[08-13-2017 05:17:24] INFO Preparing data.
[08-13-2017 05:17:24] INFO Fitting classifier.
[08-13-2017 05:17:29] INFO Computing fold performance.
[08-13-2017 05:17:29] INFO Fitting label 1.
[08-13-2017 05:17:29] INFO Preparing data.
[08-13-2017 05:17:30] INFO Fitting classifier.
[08-13-2017 05:17:34] INFO Computing fold performance.
[08-13-2017 05:17:34] INFO Fitting label 2.
[08-13-2017 05:17:34] INFO Preparing data.
[08-13-2017 05:17:34] INFO Fitting classifier.
[08-13-2017 05:17:39] INFO Computing fold performance.
[08-13-2017 05:17:39] INFO Fitting label 3.
[08-13-2017 05:17:39] INFO Preparing data.
[08-13-2017 05:17:39] INFO Fitting classifier.
[08-13-2017 05:17:44] INFO Computing fold performance.
[08-13-2017 05:17:44] INFO Fitting label 4.
[08-13-2017 05:17:44] INFO Preparing data.
[08-13-2017 05:17:44] INFO Fitting classifier.

[08-13-2017 05:20:48] INFO Fitting label 0.
[08-13-2017 05:20:48] INFO Preparing data.
[08-13-2017 05:20:48] INFO Fitting classifier.
[08-13-2017 05:20:53] INFO Computing fold performance.
[08-13-2017 05:20:53] INFO Fitting label 1.
[08-13-2017 05:20:53] INFO Preparing data.
[08-13-2017 05:20:53] INFO Fitting classifier.
[08-13-2017 05:20:58] INFO Computing fold performance.
[08-13-2017 05:20:58] INFO Fitting label 2.
[08-13-2017 05:20:58] INFO Preparing data.
[08-13-2017 05:20:58] INFO Fitting classifier.
[08-13-2017 05:21:03] INFO Computing fold performance.
[08-13-2017 05:21:03] INFO Fitting label 3.
[08-13-2017 05:21:03] INFO Preparing data.
[08-13-2017 05:21:03] INFO Fitting classifier.
[08-13-2017 05:21:08] INFO Computing fold performance.
[08-13-2017 05:21:08] INFO Fitting label 4.
[08-13-2017 05:21:08] INFO Preparing data.
[08-13-2017 05:21:08] INFO Fitting classifier.
[08-13-2017 05:21:13] INFO Computing fold performance.
[08-13-2017 05:21:13] INFO Fitting fold iteration 5.
[0

[08-13-2017 05:23:59] INFO Fitting classifier.
[08-13-2017 05:24:04] INFO Computing fold performance.
[08-13-2017 05:24:04] INFO Fitting label 1.
[08-13-2017 05:24:04] INFO Preparing data.
[08-13-2017 05:24:04] INFO Fitting classifier.
[08-13-2017 05:24:07] INFO Computing fold performance.
[08-13-2017 05:24:07] INFO Fitting label 2.
[08-13-2017 05:24:07] INFO Preparing data.
[08-13-2017 05:24:07] INFO Fitting classifier.
[08-13-2017 05:24:13] INFO Computing fold performance.
[08-13-2017 05:24:13] INFO Fitting label 3.
[08-13-2017 05:24:13] INFO Preparing data.
[08-13-2017 05:24:13] INFO Fitting classifier.
[08-13-2017 05:24:17] INFO Computing fold performance.
[08-13-2017 05:24:17] INFO Fitting label 4.
[08-13-2017 05:24:17] INFO Preparing data.
[08-13-2017 05:24:18] INFO Fitting classifier.
[08-13-2017 05:24:22] INFO Computing fold performance.
[08-13-2017 05:24:22] INFO Fitting fold iteration 3.
[08-13-2017 05:24:22] INFO Fitting label 0.
[08-13-2017 05:24:22] INFO Preparing data.
[0

[08-13-2017 05:27:10] INFO Preparing data.
[08-13-2017 05:27:10] INFO Fitting classifier.
[08-13-2017 05:27:14] INFO Computing fold performance.
[08-13-2017 05:27:14] INFO Fitting label 2.
[08-13-2017 05:27:14] INFO Preparing data.
[08-13-2017 05:27:14] INFO Fitting classifier.
[08-13-2017 05:27:20] INFO Computing fold performance.
[08-13-2017 05:27:20] INFO Fitting label 3.
[08-13-2017 05:27:20] INFO Preparing data.
[08-13-2017 05:27:20] INFO Fitting classifier.
[08-13-2017 05:27:25] INFO Computing fold performance.
[08-13-2017 05:27:25] INFO Fitting label 4.
[08-13-2017 05:27:25] INFO Preparing data.
[08-13-2017 05:27:25] INFO Fitting classifier.
[08-13-2017 05:27:29] INFO Computing fold performance.


In [81]:
# Sanity check of the score array's dimensions, which were allocated as
# (n_iter, n_splits, n_classes, 2, n_scorers).
binary_statistics.shape

(5, 5, 18, 2, 4)

In [82]:
# Attach a five-level index to the flattened binary score array. The level
# order follows the array's axis order so each row gets the right label.
func_names = [name for name, _func in binary_scoring_funcs]
names = [
    'bootstrap iteration',
    'fold iteration',
    'labels',
    'condition',
    'score function',
]
iterables = [
    range(n_iter),
    range(n_splits),
    range(n_classes),
    ["validation", "holdout"],
    func_names,
]
index = pd.MultiIndex.from_product(iterables, names=names)
stats = pd.DataFrame(data=binary_statistics.ravel(), index=index)

MultiIndex(levels=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], ['holdout', 'validation'], ['Binary F1', 'Precision', 'Recall', 'Specificity']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,