In [2]:
import os
import json
import logging
import numpy as np
import pandas as pd
from collections import Counter
from numpy.random import RandomState
from joblib import Parallel, delayed
from datetime import datetime
from docopt import docopt

from pyppi.base import parse_args, su_make_dir, chunk_list
from pyppi.base import P1, P2, G1, G2, SOURCE, TARGET, PUBMED, EXPERIMENT_TYPE
from pyppi.base.logging import create_logger
from pyppi.data import load_network_from_path, load_ptm_labels
from pyppi.data import full_training_network_path, generic_io
from pyppi.data import interactome_network_path, classifier_path
from pyppi.data import default_db_path

from pyppi.models import make_classifier, get_parameter_distribution_for_model

from pyppi.database import make_session
from pyppi.database.models import Interaction
from pyppi.database.managers import InteractionManager, ProteinManager
from pyppi.database.managers import format_interactions_for_sklearn
from pyppi.database.utilities import update_interaction

from pyppi.data_mining.tools import xy_from_interaction_frame
from pyppi.data_mining.generic import edgelist_func, generic_to_dataframe
from pyppi.data_mining.tools import map_network_accessions
from pyppi.data_mining.uniprot import batch_map
from pyppi.data_mining.features import compute_interaction_features

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline


# Upper bound for generated seeds. Not referenced in the visible part of the
# script -- presumably used further down (TODO confirm).
MAX_SEED = 1000000
# Seed handed to numpy's RandomState when the estimators are set up.
RANDOM_STATE = 42
# Logger named "scripts", created at INFO level.
logger = create_logger("scripts", logging.INFO)


if __name__ == "__main__":
    # Command-line parsing is disabled; the settings are hard-coded instead.
    # They are collected in ``args`` (under the keys the parser would have
    # produced) so that they can still be written to settings.json below.
    # args = parse_args(docopt(__doc__))
    args = {
        'n_jobs': 4,
        'n_splits': 3,
        'n_iterations': 10,
        'induce': False,
        'verbose': False,
        'selection': ['pfam'],
        'model': 'LogisticRegression',
        'output': 'predictions.tsv',
        'input': None,
        'directory': './results/',
    }
    n_jobs = args['n_jobs']
    n_splits = args['n_splits']
    rcv_iter = args['n_iterations']
    induce = args['induce']
    verbose = args['verbose']
    selection = args['selection']
    model = args['model']
    out_file = args['output']
    input_file = args['input']
    direc = args['directory']

    # Set up the folder for each experiment run named after the current time
    # -------------------------------------------------------------------- #
    folder = datetime.now().strftime("pred_%y-%m-%d_%H-%M-%S")
    direc = "{}/{}/".format(direc, folder)
    su_make_dir(direc)

    # Record the settings of this run next to its results. The context
    # manager guarantees the file is flushed and closed.
    with open("{}/settings.json".format(direc), 'w') as fp:
        json.dump(args, fp=fp, indent=4, sort_keys=True)

    # Open a session on the default database and build the two managers.
    # Both managers are restricted to taxon id 9606.
    logger.info("Starting new database session.")
    session = make_session(db_path=default_db_path)
    i_manager = InteractionManager(verbose=verbose, match_taxon_id=9606)
    p_manager = ProteinManager(verbose=verbose, match_taxon_id=9606)
    # Presumably maps UniProt accession -> protein entry (inferred from the
    # method name and the `protein_map[a]` lookups later on) -- TODO confirm.
    protein_map = p_manager.uniprotid_entry_map(session)

    # Get the input edge-list ready
    # -------------------------------------------------------------------- #
    # Labels and training interactions are both requested with the holdout
    # samples included.
    labels = i_manager.training_labels(session, include_holdout=True)
    training = i_manager.training_interactions(
        session, keep_holdout=True
    )

    # Build `testing`, the interactions to predict on. Without an input file
    # the stored interactome is used; otherwise a custom edge list is parsed,
    # mapped, featurised and saved to the database first.
    if input_file is None:
        logger.info("Loading interactome data.")
        testing = i_manager.interactome_interactions(
            session=session,
            keep_holdout=True,
            keep_training=True
        )
    else:
        logger.info("Loading custom ppi data.")
        edgelist = generic_to_dataframe(
            f_input=generic_io(input_file),
            parsing_func=edgelist_func,
            drop_nan=True,
            allow_self_edges=True,
            allow_duplicates=True
        )
        sources = set(edgelist.source.values)
        targets = set(edgelist.target.values)
        accessions = list(sources | targets)
        accession_mapping = batch_map(
            session=session,
            accessions=accessions,
            keep_unreviewed=True,
            match_taxon_id=9606,
            allow_download=True
        )
        testing_network = map_network_accessions(
            interactions=edgelist, accession_map=accession_mapping,
            drop_nan=True, allow_self_edges=True,
            allow_duplicates=False, min_counts=None, merge=False
        )
        pairs = list(zip(testing_network[SOURCE], testing_network[TARGET]))

        # Split the requested pairs into those already stored in the
        # database and those that still need features computed. The stored
        # rows are remembered so that they can be reused below.
        stored = {}
        new_pairs = []
        for (a, b) in pairs:
            found = i_manager.get_by_source_target(session, a, b)
            if found is None:
                new_pairs.append((a, b))
            else:
                stored[(a, b)] = found

        # Compute features for new ppis
        ppis = [(protein_map[a], protein_map[b]) for (a, b) in new_pairs]

        logger.info("Computing features.")
        features = Parallel(n_jobs=n_jobs, backend="multiprocessing")(
            delayed(compute_interaction_features)(source, target)
            for (source, target) in ppis
        )
        feature_map = {
            (source.uniprot_id, target.uniprot_id): computed
            for (source, target), computed in zip(ppis, features)
        }

        # Index every stored interaction by its (source, target) accession
        # pair; this index is handed to `update_interaction` below.
        existing_interactions = {}
        for interaction in session.query(Interaction).all():
            a = p_manager.get_by_id(session, id=interaction.source)
            b = p_manager.get_by_id(session, id=interaction.target)
            existing_interactions[(a.uniprot_id, b.uniprot_id)] = interaction

        testing = []
        for (a, b) in pairs:
            if (a, b) in stored:
                # No features were computed for pairs that already exist, so
                # looking them up in `feature_map` would raise KeyError.
                # NOTE(review): the stored row is reused unchanged; confirm
                # that no flag update is expected for existing interactions.
                testing.append(stored[(a, b)])
                continue

            class_kwargs = feature_map[(a, b)]
            class_kwargs["source"] = protein_map[a]
            class_kwargs["target"] = protein_map[b]
            class_kwargs["label"] = None
            class_kwargs["is_training"] = False
            class_kwargs["is_holdout"] = False
            class_kwargs["is_interactome"] = False
            entry = update_interaction(
                session=session,
                commit=False,
                psimi_ls=[],
                pmid_ls=[],
                replace_fields=False,
                override_boolean=False,
                create_if_not_found=True,
                match_taxon_id=9606,
                verbose=False,
                update_features=False,
                existing_interactions=existing_interactions,
                **class_kwargs
            )
            existing_interactions[(a, b)] = entry
            testing.append(entry)
        # Entries were added with commit=False; persist them in one go.
        session.commit()

    # Get the features into X, and multilabel y indicator format
    # -------------------------------------------------------------------- #
    logger.info("Preparing training and testing data.")
    # Only the feature types named in `selection` are requested.
    X_train, y_train = format_interactions_for_sklearn(training, selection)
    # The second return value is discarded for the testing samples.
    X_test, _ = format_interactions_for_sklearn(testing, selection)

    logger.info("Computing usable feature proportions in testing samples.")

    def separate_features(row):
        """Split one sample's comma-separated feature string by annotation type.

        `row` is a one-element sequence holding the feature string. The string
        is upper-cased and split on commas; each term is then sorted into a
        bucket by substring match ('GO:', 'IPR', 'PF'). A term matching more
        than one marker lands in every matching bucket.

        Returns a tuple of sets ordered as (go, interpro, pfam).
        """
        go_terms, interpro_terms, pfam_terms = set(), set(), set()
        for token in row[0].upper().split(','):
            # Independent checks on purpose: the buckets are not exclusive.
            if 'GO:' in token:
                go_terms.add(token)
            if 'IPR' in token:
                interpro_terms.add(token)
            if 'PF' in token:
                pfam_terms.add(token)
        return (go_terms, interpro_terms, pfam_terms)

    def compute_proportions_shared(row):
        """Return the fraction of a sample's terms that occur in training.

        `row` unpacks into three sets of terms: (go, interpro, pfam). Each set
        is compared with the matching module-level set (`go_training`,
        `ipr_training`, `pf_training`), which must be defined before this
        function is called.

        Returns a tuple of three floats (go, interpro, pfam), each equal to
        len(terms & training_terms) / len(terms), or 0.0 for an empty set.
        """
        def _shared_fraction(terms, seen_in_training):
            # An empty set has no usable terms. The fallback is a float so
            # that every return value has the same type: np.apply_along_axis
            # takes its output dtype from the first result, and an all-int
            # first row would truncate later fractions to 0.
            if not terms:
                return 0.0
            return len(terms & seen_in_training) / float(len(terms))

        go, ipr, pf = row
        return (
            _shared_fraction(go, go_training),
            _shared_fraction(ipr, ipr_training),
            _shared_fraction(pf, pf_training),
        )

    # Reshape to one column so that each row handed to `separate_features`
    # is a one-element array holding that sample's feature string.
    # NOTE(review): np.apply_along_axis derives the output shape and dtype
    # from the result of the first row, and rejects empty inputs.
    X_train_split_features = np.apply_along_axis(
        separate_features, axis=1, arr=X_train.reshape((X_train.shape[0], 1))
    )
    # Union of every GO / InterPro / Pfam term seen in the training samples.
    # `compute_proportions_shared` reads these three sets.
    go_training = set()
    ipr_training = set()
    pf_training = set()
    for (go, ipr, pf) in X_train_split_features:
        go_training |= go
        ipr_training |= ipr
        pf_training |= pf

    # Same split for the testing samples, then per sample the proportion of
    # its terms that also occur in the training term sets.
    X_test_split_features = np.apply_along_axis(
        separate_features, axis=1, arr=X_test.reshape((X_test.shape[0], 1))
    )
    X_test_useable_props = np.apply_along_axis(
        compute_proportions_shared, axis=1, arr=X_test_split_features
    )

    # Binarise the training labels, with the classes fixed to the sorted
    # label list.
    mlb = MultiLabelBinarizer(classes=sorted(labels))
    mlb.fit(y_train)
    y_train = mlb.transform(y_train)

    # X_train = vectorizer.fit_transform(X_train)
    # X_test = vectorizer.transform(X_test)

    # Make the estimators and BR classifier
    # -------------------------------------------------------------------- #
    rng = RandomState(seed=RANDOM_STATE)
    # Prefix every hyper-parameter name with 'estimator__'. A new dict is
    # built instead of popping from and inserting into the dict being
    # iterated, which would revisit the renamed keys or raise RuntimeError.
    params = {
        'estimator__{}'.format(key): value
        for key, value in get_parameter_distribution_for_model(model).items()
    }
    params['vectorizer__binary'] = [False, True]

ImportError: No module named 'joblib'