In [241]:
"""
This script runs classifier training over the entire training data and then
outputs predictions over the interactome.
"""

import json
import numpy as np
import scipy as sp

from pyPPI.base import parse_args, su_make_dir
from pyPPI.data import load_network_from_path, load_ptm_labels
from pyPPI.data import testing_network_path, training_network_path

from pyPPI.models import make_classifier
from pyPPI.model_selection.scoring import MultilabelScorer, Statistics
from pyPPI.model_selection.experiment import KFoldExperiment, Bootstrap
from pyPPI.model_selection.sampling import IterativeStratifiedKFold

from pyPPI.data_mining.features import AnnotationExtractor
from pyPPI.data_mining.uniprot import UniProt, get_active_instance
from pyPPI.data_mining.tools import xy_from_interaction_frame

from sklearn.base import clone
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, precision_score, label_ranking_average_precision_score
from sklearn.metrics import recall_score, make_scorer, label_ranking_loss, log_loss
from sklearn.linear_model import LogisticRegression

from skmultilearn.problem_transform.br import BinaryRelevance
from skmultilearn.problem_transform.cc import ClassifierChain

# Run configuration; the cells below read these module-level names.
n_jobs = 3                    # parallelism for AnnotationExtractor and RandomizedSearchCV
n_splits = 5                  # folds per cross-validation iteration
n_iterations = 1              # number of repeated cross-validation runs
induce = True                 # forwarded to AnnotationExtractor(induce=...)
verbose = True                # forwarded to the UniProt loader and AnnotationExtractor
model = 'LogisticRegression'  # estimator name handed to make_classifier

In [2]:
# Label vocabulary used later to configure the MultiLabelBinarizer.
print("Loading data labels...")
labels = load_ptm_labels()

# Obtain the shared UniProt instance and the enum of available data types.
print("Loading feature data...")
uniprot = get_active_instance(verbose=verbose)
data_types = UniProt.data_types()

# Annotation sources that feed the feature extractor, in a fixed order:
# the three GO branches followed by InterPro and Pfam.
selection = [
    data_type.value
    for data_type in (
        data_types.GO_MF,
        data_types.GO_BP,
        data_types.GO_CC,
        data_types.INTERPRO,
        data_types.PFAM,
    )
]

Loading data labels...
Loading feature data...
First time loading on UniProt instance. Make take a few moments


In [3]:
# Configure the annotation-based feature extractor over the chosen annotation
# sources; it is applied to the PPIs when the train/test matrices are built.
print('Building the features from protein annotation selection...')
annotation_ex = AnnotationExtractor(
    selection=selection,
    induce=induce,
    cache=True,
    verbose=verbose,
    n_jobs=n_jobs,
)

Building the features from protein annotation selection...


In [57]:
print("Preparing training and testing data...")

# Development (training) network -> raw PPIs and their label sets.
training = load_network_from_path(training_network_path)
X_dev_ppis, y_dev = xy_from_interaction_frame(training)

# Hold-out (testing) network -> raw PPIs and their label sets.
testing = load_network_from_path(testing_network_path)
X_test_ppis, y_test = xy_from_interaction_frame(testing)

# Binarise the label sets into sparse indicator matrices. The binarizer is
# fitted on the development labels only and then applied to both sets.
mlb = MultiLabelBinarizer(classes=labels, sparse_output=True)
mlb.fit(y_dev)
y_dev = mlb.transform(y_dev)
y_test = mlb.transform(y_test)

# Turn each PPI into its annotation features (development set first).
X_dev = annotation_ex.transform(X_dev_ppis)
X_test = annotation_ex.transform(X_test_ppis)

Preparing training and testing data...
Finding new PPIs...
Stringing selected features for each PPI...
Finding new PPIs...
Stringing selected features for each PPI...


In [162]:
print("Setting up scorers...")
# Wrap each sklearn metric in the project's MultilabelScorer; the wrappers are
# later called with (y_true, y_pred, average='binary') on indicator matrices.
f1_scorer, recall_scorer, precision_scorer = (
    MultilabelScorer(metric)
    for metric in (f1_score, recall_score, precision_score)
)

Setting up scorers...


In [242]:
print("Setting up experiments...")
# Seeds: one CV seed per iteration (the outer loop indexes `cv_seeds` by
# iteration), and one classifier seed per (iteration, split) pair.
cv_seeds = range(1, n_iterations + 1)
clf_seeds = np.arange(1, n_iterations * n_splits + 1).reshape(n_iterations, n_splits)

# Per-label metrics: each scorer is called with average='binary' and its
# result is stored along the label axis of the score arrays.
binary_scoring_funcs = [
    ('Binary F1', f1_scorer),
    ('Precision', precision_scorer),
    ('Recall', recall_scorer)
]
# Whole-problem metrics: each yields a single scalar per split.
mlb_scores_funcs = [
    ('Label Ranking Loss', label_ranking_loss),
    ('Label Ranking Average Precision', label_ranking_average_precision_score),
    ('Macro (weighted) F1', f1_score),
    ('Macro (un-weighted) F1', f1_score)
]
# Hyper-parameter search space sampled by RandomizedSearchCV.
# NOTE(review): 'l1' needs a solver that supports it; assumes make_classifier
# configures one - TODO confirm.
param_distribution = {
    'C': np.arange(0.1, 20.1, step=0.1),
    'penalty': ['l1', 'l2']
}

# Score arrays are indexed as [iteration, split, metric, label]; the
# multi-label metrics are scalars, hence their trailing axis of length 1.
n_labels = len(mlb.classes)
n_binary_funcs = len(binary_scoring_funcs)
n_mlb_funcs = len(mlb_scores_funcs)

binary_score_data = np.zeros((n_iterations, n_splits, n_binary_funcs, n_labels))
mlb_score_data = np.zeros((n_iterations, n_splits, n_mlb_funcs, 1))

binary_score_data_hold_out = np.zeros((n_iterations, n_splits, n_binary_funcs, n_labels))
mlb_score_data_hold_out = np.zeros((n_iterations, n_splits, n_mlb_funcs, 1))

# The hold-out targets do not depend on the split, so densify them once.
y_true_hold_out = y_test.toarray()

for iter_j in range(n_iterations):
    print("Fitting iteration {}".format(iter_j + 1))
    cv = IterativeStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=cv_seeds[iter_j])
    for split_i, (train_idx, test_idx) in enumerate(cv.split(X_dev, y_dev.toarray())):
        print("\tFitting split {}".format(split_i + 1))
        # Plain int so the seed is accepted regardless of how the estimator
        # factory validates `random_state`.
        seed = int(clf_seeds[iter_j, split_i])
        random_cv = RandomizedSearchCV(
            cv=3,
            n_jobs=n_jobs,
            n_iter=60,
            random_state=seed,
            param_distributions=param_distribution,
            estimator=make_classifier(model, random_state=seed),
            scoring=make_scorer(f1_score, greater_is_better=True)
        )
        # The randomized search is the base estimator handed to BinaryRelevance.
        clf = BinaryRelevance(random_cv)

        # Fit the vocabulary on the training fold only, then reuse it for the
        # validation fold and the hold-out set to avoid leaking test features.
        vectorizer = CountVectorizer(binary=False)
        X_train_j = vectorizer.fit_transform(X_dev[train_idx, ])
        y_train_j = y_dev[train_idx, ]
        clf.fit(X_train_j, y_train_j)

        # Validation fold predictions.
        X_test_j = vectorizer.transform(X_dev[test_idx, ])
        y_true_j = y_dev[test_idx, ].toarray()
        y_pred_j = clf.predict(X_test_j).toarray()
        y_proba_j = clf.predict_proba(X_test_j).toarray()

        # Hold-out predictions from the same fitted fold model.
        X_hold_out = vectorizer.transform(X_test)
        y_pred_hold_out = clf.predict(X_hold_out).toarray()
        y_proba_hold_out = clf.predict_proba(X_hold_out).toarray()

        for func_idx, (_, func) in enumerate(binary_scoring_funcs):
            scores_v = func(y_true_j, y_pred_j, average='binary')
            binary_score_data[iter_j, split_i, func_idx, :] = scores_v

            scores_h = func(y_true_hold_out, y_pred_hold_out, average='binary')
            binary_score_data_hold_out[iter_j, split_i, func_idx, :] = scores_h

        for func_idx, (func_name, func) in enumerate(mlb_scores_funcs):
            if func_name == 'Macro (weighted) F1':
                scores_v = func(y_true_j, y_pred_j, average='weighted')
                scores_h = func(y_true_hold_out, y_pred_hold_out, average='weighted')
            elif func_name == 'Macro (un-weighted) F1':
                scores_v = func(y_true_j, y_pred_j, average='macro')
                scores_h = func(y_true_hold_out, y_pred_hold_out, average='macro')
            else:
                # Both ranking metrics are defined over scores, so they are
                # given the predicted probabilities rather than the
                # thresholded 0/1 predictions.
                scores_v = func(y_true_j, y_proba_j)
                scores_h = func(y_true_hold_out, y_proba_hold_out)

            mlb_score_data[iter_j, split_i, func_idx, 0] = scores_v
            mlb_score_data_hold_out[iter_j, split_i, func_idx, 0] = scores_h

Setting up experiments...
Fitting iteration 1
	Fitting split 1




KeyboardInterrupt: 