# Run nested CV routine

In [None]:
import random

import pandas as pd
import numpy as np

from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET
from bioblp.logging import get_logger
import torch


# Notebook-level logger, created through the project's bioblp.logging.get_logger
# helper; used by the training cell below for all progress messages.
logger = get_logger(__name__)


In [None]:
# Path is imported in this cell because the cell runs before the one that
# imports it further down; without this, executing cells top to bottom raises
# NameError.
from pathlib import Path

# Relative path to the local data folder.
DATA_DIR = Path("../data/")
# Absolute path to the bioblp folder under the workbench shared folder.
DATA_SHARED = Path("/home/jovyan/workbench-shared-folder/bioblp")

In [None]:
from time import time
from pathlib import Path
from collections import defaultdict

from bioblp.benchmarking.train import run_nested_cv
from bioblp.benchmarking.train import get_scorers



In [None]:
"""Perform train run"""

# reproducibility
# SEED is expected to be set as a notebook global. If it has not been defined
# yet, fall back to a fixed default so this cell does not raise NameError when
# the notebook is run top to bottom.
SEED = globals().get("SEED", 42)
shuffle = True
# Metric names handed to run_nested_cv as `refit_params`.
refit_params = ["AUCPR", "AUCROC"]

# Input features/labels are read from data_dir; results are written to out_dir.
data_dir = Path("../data/features/kge-1baon0eg/")
out_dir = Path("../data/runs/")

# Cross-validation settings forwarded to run_nested_cv below.
n_proc = 1
n_iter = 2
inner_n_folds = 3
outer_n_folds = 5

# Container for everything persisted at the end of the run:
#   "config"  -> the settings used for this run
#   "results" -> the value returned by run_nested_cv
exp_output = defaultdict(dict)
exp_output["config"] = {
    "n_proc": n_proc,
    "n_iter": n_iter,
    "inner_n_folds": inner_n_folds,
    "outer_n_folds": outer_n_folds,
    "data_dir": data_dir,
    "seed": SEED,
    "shuffle": shuffle
}

start = time()
# Integer epoch seconds; used to tag the output file name and passed to
# run_nested_cv as `timestamp`.
run_timestamp = int(start)

logger.info("Starting model building script at {}.".format(start))

############
# Load data
############
logger.info("Loading training data...")

X_train = np.load(data_dir.joinpath("X.npy"))
y_train = np.load(data_dir.joinpath("y.npy"))

logger.info(
    "Resulting shapes X_train: {}, y_train: {}".format(
        X_train.shape, y_train.shape)
)
logger.info("Counts in y_train: {}".format(
    np.unique(y_train, return_counts=True)))

############
# Setup classifiers & pipelines
############

# Labels identifying the candidate algorithms; only those listed in
# `candidates` below are actually evaluated.
lr_label = "LR"
rf_label = "RF"
MLP_label = "MLP"

############
# Compare models
############

candidates = [
    lr_label,
    # rf_label,
    # MLP_label

]

scorer = get_scorers()

# Make sure the output directory exists before anything tries to write to it;
# np.save below fails with FileNotFoundError on a missing parent directory.
out_dir.mkdir(parents=True, exist_ok=True)

nested_cv_scores = run_nested_cv(
    candidates=candidates,
    X=X_train,
    y=y_train,
    scoring=scorer,
    inner_n_folds=inner_n_folds,
    inner_n_iter=n_iter,
    outer_n_folds=outer_n_folds,
    shuffle=shuffle,
    n_jobs=n_proc,
    refit_params=refit_params,
    random_state=SEED,
    outdir=out_dir,
    timestamp=run_timestamp
)

for algo, scores in nested_cv_scores.items():
    logger.info("Scores {}: {}".format(algo, scores))

exp_output["results"] = nested_cv_scores

logger.info(exp_output)

file_out = out_dir.joinpath(
    "nested_cv_scores_{}.npy".format(run_timestamp))
logger.info("Saving to {}".format(file_out))
# exp_output is a dict, so np.save stores it as a pickled object array;
# reading it back requires np.load(..., allow_pickle=True).
np.save(file_out, exp_output)

end = time()

logger.info("Ran script in {} seconds".format(str(end - start)))

_____