In [1]:
from pyppi.database import make_session
from pyppi.database.managers import InteractionManager
from pyppi.database.managers import format_interactions_for_sklearn
from pyppi.model_selection.sampling import IterativeStratifiedKFold
from pyppi.models import get_parameter_distribution_for_model, make_classifier

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score

In [2]:
# Feature columns handed to format_interactions_for_sklearn via `selection`.
FEATURE_SELECTION = [
    'interpro',
    'pfam',
    'ulca_go_mf',
    'ulca_go_bp',
    'ulca_go_cc',
]

# Open a database session and pull the training interactions and label set.
# NOTE(review): the two manager calls spell the holdout flag differently
# (`keep_holdout` vs `include_holdout`) -- confirm both keyword names against
# InteractionManager.
session = make_session()
i_manager = InteractionManager()
training = i_manager.training_interactions(session, keep_holdout=True)
labels = i_manager.training_labels(session, include_holdout=True)

# X and y are whatever format_interactions_for_sklearn returns for the
# selected features; later cells index X with integer arrays and feed y to a
# MultiLabelBinarizer.
X, y = format_interactions_for_sklearn(
    training,
    selection=FEATURE_SELECTION,
)

In [3]:
# Encode the label sets in `y` as a dense indicator matrix. Passing
# `classes=labels` fixes the column order to the order of `labels`.
mlb = MultiLabelBinarizer(
    classes=labels,
    sparse_output=False,
)
y_true = mlb.fit_transform(y)

In [None]:
# Outer evaluation loop: a 5-fold split from IterativeStratifiedKFold. Inside
# each fold one random forest is tuned per label column with a randomized
# hyper-parameter search and then scored on the held-out fold with binary F1.
cv = IterativeStratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# One record per (fold, label). Previously the scores were only printed, so
# nothing could be aggregated once the run finished; keeping them here allows
# e.g. pandas.DataFrame(cv_scores).groupby('label')['f1'].mean() afterwards.
cv_scores = []

for fold, (train_idx, test_idx) in enumerate(cv.split(X, y_true)):
    X_train = X[train_idx]
    y_train = y_true[train_idx, :]
    X_test = X[test_idx]
    y_test = y_true[test_idx, :]

    # Binary bag-of-terms features. The vocabulary is fitted on the training
    # fold only and then applied unchanged to the test fold.
    vectorizer = CountVectorizer(
        binary=True,
        lowercase=False, stop_words=[':', 'GO']
    )
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    # `classes_` is the fitted attribute whose order matches the columns of
    # y_true, so column i always belongs to `label`.
    for i, label in enumerate(mlb.classes_):
        params = get_parameter_distribution_for_model("RandomForestClassifier")
        # NOTE(review): both the forest and the search below request 16
        # workers -- confirm this does not oversubscribe the machine.
        model = make_classifier("RandomForestClassifier", random_state=56, n_jobs=16)
        clf = RandomizedSearchCV(
            estimator=model,
            scoring='f1',
            error_score=0.0,  # a failed fit scores 0 instead of raising
            cv=StratifiedKFold(
                n_splits=2,
                shuffle=True,
                random_state=1
            ),
            n_iter=20,
            n_jobs=16,
            refit=True,  # best configuration is refitted on the whole training fold
            random_state=2,
            param_distributions=params,
        )
        print(f'Fitting {label}')
        clf.fit(X_train, y_train[:, i])
        y_pred = clf.predict(X_test)
        score = f1_score(y_true=y_test[:, i], y_pred=y_pred, average="binary")
        cv_scores.append({'fold': fold, 'label': label, 'f1': score})
        print(f'{label}, {score}')
        

Fitting Acetylation
Acetylation, 0.8148148148148148
Fitting Activation
