# DPI benchmark: entity-pair embeddings and nested cross-validation of classifiers

In [30]:
import pandas as pd
import numpy as np
from pathlib import Path

from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET
from bioblp.logging import get_logger


logger = get_logger(__name__)


In [4]:
# Data directory, given relative to the notebook's working directory.
DATA_DIR = Path("../data/")
# Absolute location of the shared project folder (benchmarks and trained models
# are read from below this path later in the notebook).
DATA_SHARED = Path("/home/jovyan/workbench-shared-folder/bioblp")

In [5]:
# Location of the `dpi_fda.tsv` benchmark file inside the shared folder.
dpi_benchmark_path = DATA_SHARED.joinpath('data/benchmarks/benchmarks/dpi_fda.tsv')

In [6]:
# Read the benchmark as tab-separated (source, edge, target) triples. Because
# the column names are passed explicitly via `names`, pandas treats the file as
# having no header row.
dpi_bm = pd.read_csv(dpi_benchmark_path, sep='\t', names=[COL_SOURCE, COL_EDGE, COL_TARGET])
# Preview the first three rows.
dpi_bm.head(3)

Unnamed: 0,src,edg,tgt
0,DB01079,DPI,Q13639
1,DB00114,DPI,P20711
2,DB01158,DPI,P13637


In [7]:
# Number of rows (triples) in the benchmark.
len(dpi_bm)

19161

In [72]:
from typing import List

# Load the entity/id table written alongside the training triples of model
# run `1baon0eg` (gzip-compressed TSV).
ent2id_map = pd.read_csv(
    DATA_SHARED / "models/1baon0eg/training_triples/entity_to_id.tsv.gz",
    sep="\t",
    compression="gzip",
)
# Turn the table into a plain lookup: second column -> first column.
# NOTE(review): presumably the columns are (id, label), which makes this a
# label -> id mapping - confirm against the file header.
ent2id_map = dict((row[1], row[0]) for row in ent2id_map.values)

def get_ent_ids_for_entity_list(entity_list: List[str], ent2id_map):
    """Look up the id of every entity in ``entity_list``.

    Parameters
    ----------
    entity_list : List[str]
        entity names to resolve, in the order the ids should be returned
    ent2id_map : mapping
        object exposing ``.get(name)``, e.g. a dict from entity name to id

    Returns
    -------
    list
        one entry per input entity, in input order; entities that are absent
        from ``ent2id_map`` yield whatever ``.get`` returns for a missing key
        (``None`` for a dict)
    """
    return list(map(ent2id_map.get, entity_list))

In [66]:
import torch

# Directory holding the artefacts of training run `1baon0eg`.
model_dir = DATA_SHARED.joinpath('models/1baon0eg')
print(f'Loading trained model from {model_dir}')
# Prefer the GPU when one is available. `map_location` makes `torch.load` place
# the stored tensors on the chosen device, so no manual device override is
# needed on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(security): `torch.load` unpickles the file, which can execute arbitrary
# code - only load checkpoints from a trusted source.
model = torch.load(model_dir.joinpath("trained_model.pkl"), map_location=device)

Loading trained model from /home/jovyan/workbench-shared-folder/bioblp/models/1baon0eg


In [67]:
# Grab the embedding modules behind the model's first entity representation and
# first relation representation.
# NOTE(review): `_embeddings` is a private attribute of the representation
# object - confirm it is stable across versions of the modelling library.
entity_representation = model.entity_representations[0]._embeddings
relation_representation = model.relation_representations[0]._embeddings
# Display the entity embedding module.
entity_representation

Embedding(106339, 512)

In [73]:
# Source entities (drugs) of the benchmark triples, in row order. The column is
# addressed through COL_SOURCE, the same constant used when the frame was read.
dpi_bm_drugs = dpi_bm[COL_SOURCE].tolist()
drug_ids = get_ent_ids_for_entity_list(dpi_bm_drugs, ent2id_map)

# Entities unknown to the model come back as None, which would otherwise only
# surface as an opaque tensor-construction error. Fail early and name them.
missing_drugs = list(dict.fromkeys(
    ent for ent, ent_id in zip(dpi_bm_drugs, drug_ids) if ent_id is None))
if missing_drugs:
    raise KeyError(
        f"{len(missing_drugs)} benchmark drugs have no entity id, e.g. {missing_drugs[:5]}")

# Compact sanity check instead of dumping the whole entity map and id list.
print(f"{len(ent2id_map)} entities in the id map; first drug ids: {drug_ids[:10]}")

# The index tensor must live on the same device as the embedding weights,
# otherwise the lookup fails when the model was loaded onto the GPU.
drug_ids = torch.LongTensor(drug_ids).to(device)
# NOTE(review): this goes through the private `_embeddings` module; calling the
# representation itself (`model.entity_representations[0](drug_ids)`) was left
# disabled in the notebook - confirm which one is intended.
drug_embs = model.entity_representations[0]._embeddings(drug_ids)

In [None]:
# Target entities (proteins) of the benchmark triples, in row order. The column
# is addressed through COL_TARGET, the same constant used when the frame was read.
dpi_bm_prots = dpi_bm[COL_TARGET].tolist()
prot_ids = get_ent_ids_for_entity_list(dpi_bm_prots, ent2id_map)

# Entities unknown to the model come back as None, which would otherwise only
# surface as an opaque tensor-construction error. Fail early and name them.
missing_prots = list(dict.fromkeys(
    ent for ent, ent_id in zip(dpi_bm_prots, prot_ids) if ent_id is None))
if missing_prots:
    raise KeyError(
        f"{len(missing_prots)} benchmark proteins have no entity id, e.g. {missing_prots[:5]}")

# The index tensor must live on the same device as the embedding weights,
# otherwise the lookup fails when the model was loaded onto the GPU.
prot_ids = torch.LongTensor(prot_ids).to(device)
prot_embs = model.entity_representations[0]._embeddings(prot_ids)

In [None]:
# Display the shapes of both embedding tensors side by side.
drug_embs.shape, prot_embs.shape


In [None]:
from collections.abc import Callable

def concatenate(emb1, emb2):
    """Join two embeddings along dim 0 and flatten the result into one row.

    Returns a tensor of shape ``(1, N)``, where ``N`` is the total number of
    elements of both inputs.
    """
    joined = torch.cat([emb1, emb2])  # dim defaults to 0
    return joined.view(1, -1)

def average(emb1, emb2):
    """Return the element-wise mean of two equally shaped embeddings as one row.

    The inputs are stacked along a new leading dimension and averaged over it;
    the result is flattened to shape ``(1, N)``, where ``N`` is the number of
    elements of a single input.
    """
    return torch.stack((emb1, emb2)).mean(dim=0).view(1, -1)

def encode_entity_pair(emb1, emb2, transform: Callable):
    """Combine the embeddings of an entity pair with ``transform``.

    ``transform`` is called as ``transform(emb1, emb2)`` and its result is
    returned unchanged.
    """
    encoded_pair = transform(emb1, emb2)
    return encoded_pair


In [None]:
# Encode the first benchmark pair (row 0 of each embedding tensor) by averaging.
out = encode_entity_pair(emb1=drug_embs[0, :], emb2=prot_embs[0, :], transform=average)


In [None]:
# Shape of the encoded pair.
out.shape

In [None]:
import os
import random as rn
from argparse import ArgumentParser
from collections import defaultdict
from pathlib import Path
from time import time

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate


In [None]:
# Seed shared by every estimator and search space below.
SEED = 2022

#
# DEFAULT PARAMETERS
#
# Default random-forest hyperparameters.
rf_default_params = {
    'n_estimators': 300,
    'criterion': 'gini',
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    # 'sqrt' is what the former 'auto' value meant for classifiers; 'auto' is
    # deprecated and later removed in scikit-learn.
    'max_features': 'sqrt',
    'random_state': SEED,
    'n_jobs': -1,
}

# Default logistic-regression hyperparameters.
lr_default_params = {
    'C': 1.0,
    'random_state': SEED,
    'max_iter': 1000,
    'solver': 'lbfgs',
    'n_jobs': -1,
}


#
# HYPERPARAMETER SEARCH SPACES
#
# Each key maps to the list/array of candidate values for that parameter.

rf_search_space = {
    "criterion": ["gini", "entropy"],
    # 100, 300, ..., 1100
    'n_estimators': np.arange(100, 1201, 200, dtype=int),
    'min_samples_leaf': [1, 2, 5, 10],
    'min_samples_split': [2, 5, 10, 15, 100],
    'max_depth': [5, 8, 15, 25, 30, None],
    'random_state': [SEED]
}

lr_search_space = {
    'penalty': ['l2'],
    # 1e-4, 1e-3, ..., 1e3
    'C': np.logspace(-4, 3, 8),
    'random_state': [SEED],
    'max_iter': [1000],
    'solver': ['lbfgs'],
    'n_jobs': [-1],
}


In [None]:

def run_nested_cv(candidates: list, X, y, scoring: dict,
                  outer_n_folds: int = 5, inner_n_folds: int = 2, inner_n_iter: int = 10, shuffle: bool = False,
                  random_state: int = SEED, n_jobs: int = 12, refit_param: str = 'fbeta', verbose: int = 0) -> dict:
    """Nested cross validation routine.

    For every candidate, the inner loop runs a randomized hyperparameter
    search (refit on ``refit_param``); the outer loop cross-validates that
    search object, so hyperparameters are tuned separately on the training
    part of each outer fold.

    Parameters
    ----------
    candidates : list
        list of (label, estimator, param_dist)
    X : np.array
        predictor
    y : np.ndarray
        labels
    scoring : dict
        dict containing sklearn scorers; must contain the key ``refit_param``
    outer_n_folds : int, optional
        splits for outer cv, by default 5
    inner_n_folds : int, optional
        splits for inner cv, by default 2
    inner_n_iter : int, optional
        number of trials within inner fold, by default 10
    shuffle : bool, optional
        shuffles data before cv, by default False
    random_state : int, optional
        seed for rng, by default SEED; only forwarded to the fold splitters
        when ``shuffle`` is True
    n_jobs : int, optional
        number of parallel jobs for the outer cross validation, by default 12
    refit_param : str, optional
        which metric to optimize for and return refit model, by default 'fbeta'
    verbose : int, optional
        level of console feedback of the inner search, by default 0

    Returns
    -------
    dict
        outer cv scores e.g. {name: scores}
    """
    # StratifiedKFold raises a ValueError when a random_state is given while
    # shuffle is False, so the seed is only passed on when it has an effect.
    cv_seed = random_state if shuffle else None
    inner_cv = StratifiedKFold(n_splits=inner_n_folds, shuffle=shuffle, random_state=cv_seed)
    outer_cv = StratifiedKFold(n_splits=outer_n_folds, shuffle=shuffle, random_state=cv_seed)

    # One randomized search per candidate. The search itself stays
    # single-process; parallelism is applied across the outer folds below.
    searches = {
        name: RandomizedSearchCV(
            estimator=estimator,
            param_distributions=param_grid,
            n_iter=inner_n_iter,
            scoring=scoring,
            n_jobs=1,
            cv=inner_cv,
            verbose=verbose,
            refit=refit_param,
            random_state=random_state)
        for name, estimator, param_grid in candidates
    }

    outer_scores = {}
    for name, search in sorted(searches.items()):
        nested_score = cross_validate(search,
                                      X=X,
                                      y=y,
                                      scoring=scoring,
                                      cv=outer_cv,
                                      n_jobs=n_jobs,
                                      return_estimator=False,
                                      return_train_score=True)

        # Indexing (rather than .get) gives a clear KeyError if the refit
        # metric is not among the scorers.
        score_to_optimize = nested_score['test_{}'.format(refit_param)]
        logger.info(
            f'{name}: outer {refit_param} {100*score_to_optimize.mean():.2f} +/- {100*score_to_optimize.std():.2f}')
        outer_scores[name] = nested_score
    return outer_scores

In [None]:


# reproducibility
SEED = 2022

# Settings of this nested cross-validation experiment.
experiment_config = {
    "n_proc": 1,
    "n_iter": 3,
    "inner_n_folds": 3,
    "outer_n_folds": 5,
    "param": "fbeta"
}

# NOTE(review): assumes the training arrays (X_train.npy / y_train.npy) live in
# the notebook-level DATA_DIR - confirm, or point this at the right folder.
data_dir = DATA_DIR

n_proc = experiment_config["n_proc"]
n_iter = experiment_config["n_iter"]
inner_n_folds = experiment_config["inner_n_folds"]
outer_n_folds = experiment_config["outer_n_folds"]
optimize_param = experiment_config["param"]

shuffle = False

# Everything worth keeping about this run is collected in `exp_output`.
exp_output = defaultdict(dict)
exp_output['settings'] = {
    'data_dir': data_dir,
    'n_iter': n_iter,
    'inner_n_folds': inner_n_folds,
    'outer_n_folds': outer_n_folds,
    'optimize_param': optimize_param,
    'shuffle': shuffle,
    'seed': SEED
}

start = time()

logger.info("Starting model building script at {}.".format(start))

############
# Load data
############
logger.info("Loading training data...")

X_train = np.load(data_dir.joinpath('X_train.npy'))
y_train = np.load(data_dir.joinpath('y_train.npy'))

logger.info("Resulting shapes X_train: {}, y_train: {}".format(X_train.shape, y_train.shape))
logger.info("Counts in y_train: {}".format(np.unique(y_train, return_counts=True)))

############
# Setup classifiers & pipelines
############

lr_label = 'LR'
clf_lr = LogisticRegression(**lr_default_params)

rf_label = 'RF'
clf_rf = RandomForestClassifier(**rf_default_params)


# record default params (a real assignment: `x[key]: value` is only an
# annotation and stores nothing)
exp_output['default_params'] = {
    lr_label: lr_default_params,
    rf_label: rf_default_params
}

############
# Setup grids
############
exp_output['grids'] = {
    lr_label: lr_search_space,
    rf_label: rf_search_space
}

############
# Compare models
############
candidates = [
    (lr_label, clf_lr, lr_search_space),
    (rf_label, clf_rf, rf_search_space)
]

scorer = {
    # The built-in 'roc_auc' scorer ranks by continuous scores
    # (decision_function / predict_proba); wrapping roc_auc_score in a plain
    # make_scorer would feed it hard class predictions instead.
    'AUC': 'roc_auc',
    # NOTE(review): for single-label problems the micro-averaged F-score equals
    # accuracy - confirm that average='micro' is the intended refit metric.
    'fbeta': make_scorer(fbeta_score, beta=1, average='micro'),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'accuracy': make_scorer(accuracy_score)
}

nested_cv_scores = run_nested_cv(
    candidates=candidates,
    X=X_train,
    y=y_train,
    scoring=scorer,
    inner_n_folds=inner_n_folds,
    inner_n_iter=n_iter,
    outer_n_folds=outer_n_folds,
    shuffle=shuffle,
    n_jobs=n_proc,
    refit_param=optimize_param,
    random_state=SEED
)

for algo, scores in nested_cv_scores.items():
    logger.info("Scores {}: {}".format(algo, scores))

exp_output['results'] = nested_cv_scores

logger.info(exp_output)

run_timestamp = int(time())
# TODO: persist `exp_output` (e.g. as 'nested_cv_scores_{run_timestamp}.npy');
# saving is currently disabled because no output directory is configured.

end = time()

logger.info("Ran script in {} seconds".format(str(end - start)))