In [6]:
import warnings
import numpy as np

from pyppi.model_selection.sampling import IterativeStratifiedKFold
from pyppi.predict.utilities import load_validation_dataset
from pyppi.models.utilities import (
    make_gridsearch_clf, make_classifier, make_gridsearch_clf,
    get_parameter_distribution_for_model   
)
from pyppi.models.classifier_chain import KRandomClassifierChains

from sklearn.base import clone
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multioutput import ClassifierChain
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.exceptions import UndefinedMetricWarning

from skmultilearn.adapt import MLkNN
from skmultilearn.neurofuzzy import MLARAM
from skmultilearn.problem_transform.cc import ClassifierChain

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [7]:
selection=['interpro', 'pfam', 'go_mf', 'go_bp', 'go_cc']
data = load_validation_dataset(selection=selection, taxon_id=9606)

X, y = data["training"]
X_hold, y_hold = data["testing"]
mlb = data['binarizer']

cv = list(
    IterativeStratifiedKFold(n_splits=5, shuffle=True, random_state=0
).split(X, y))

# Multi-learn library

In [8]:
model = make_classifier("LogisticRegression", random_state=0)
clf_cc = ClassifierChain(classifier=model, require_dense=False)
clf_mlknn = MLkNN(k=10)
clf_mlaram = MLARAM()

# KRandomChains

In [33]:
base_estimator = make_classifier(
    "LogisticRegression", random_state=0, n_jobs=1
)
params = get_parameter_distribution_for_model('LogisticRegression')
clf = RandomizedSearchCV(
    estimator=base_estimator,
    cv=StratifiedKFold(
        n_splits=3,
        shuffle=True,
        random_state=1
    ),
    n_iter=10,
    n_jobs=16,
    refit=True,
    random_state=2,
    scoring='f1',
    error_score=0.0,
    param_distributions=params
)
clf = KRandomClassifierChains(clf, k=8, n_jobs=8, random_state=0)

In [36]:
for train_idx, test_idx in cv:
    X_train = X[train_idx]
    y_train = y[train_idx]
    
    X_test = X[test_idx]
    y_test = y[test_idx]
    
    vec = CountVectorizer(binary=True)
    X_train = vec.fit_transform(X_train)
    X_test = vec.transform(X_test)
    X_holdout = vec.transform(X_hold)
       
    clf_cc.fit(np.asarray(X_train.todense()), y_train)
    
    y_pred_v = clf_cc.predict(np.asarray(X_test))
    y_pred_t = clf_cc.predict(np.asarray(X_holdout))
    
    for i, label in enumerate(mlb.classes):
        score_v = f1_score(
            y_true=y_test[:, i], y_pred=y_pred_v[:, i], average="binary"
        )
        score_t = f1_score(
            y_true=y_hold[:, i], y_pred=y_pred_t[:, i], average="binary"
        )
        print(f'{label}: {score_v}, {score_t}')

ValueError: Found input variables with inconsistent numbers of samples: [21372, 1]