# Re-training of the best model found by hyperparameter search

## 1. Setup

In [None]:
from typing import Tuple
import os
from os import path
import pickle
import numpy as np
import hyperopt
import tensorflow as tf
import time
import subprocess
import pandas as pd
from deepgrp import training, preprocessing
from deepgrp import model as deepgrp_model

In [None]:
# Project root as a relative path; the data directory and the default TOML
# file are resolved against it.
PROJECT_ROOT_DIR = ".."
# Genome build; names the data subdirectory and the "<build>.bed" file.
GENOMEBUILD = "hg19"
# Chromosome whose "<chr>.fa.gz.npz" file supplies the training sequence.
TRAIN_CHR = "chr11"
# Chromosome whose "<chr>.fa.gz.npz" file supplies the validation sequence.
VAL_CHR = "chr20"

In [None]:
# Directory holding the per-build sequence archives and the ".bed" file.
datadir = path.join(PROJECT_ROOT_DIR, "data")

In [None]:
# Model options; stays None until one of the loading steps fills it in.
hyperparameter = None

## 2a) Loading pickled hyperopt results if present

In [None]:
# Path to a pickled hyperopt result; leave as None to fall back to the TOML file.
RESULTS_FILE = None

In [None]:
# Only runs when a results file was configured (None or "" skips this step).
if RESULTS_FILE:
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # point RESULTS_FILE at pickles from a trusted source.
    with open(RESULTS_FILE, 'rb') as file:
        # presumably a hyperopt Trials object; only its best_trial is used
        best = pickle.load(file).best_trial
    # The stored option mapping is expanded into keyword arguments for Options.
    hyperparameter = deepgrp_model.Options(**best['result']['options'])

## 2b) Load from TOML if the hyperparameters are not loaded yet

In [None]:
# TOML file with default options, read only when no options were loaded before.
TOML_FILE = path.join(PROJECT_ROOT_DIR, "defaults.toml")

In [None]:
# Fall back to the TOML defaults only if no options were loaded before.
if hyperparameter is None:
    # TOML documents are UTF-8 by specification, so do not depend on the
    # platform's locale encoding when reading the file.
    with open(TOML_FILE, 'r', encoding='utf-8') as file:
        hyperparameter = deepgrp_model.Options.from_toml(file)

## 3. Train DeepGRP

In [None]:
def train_model(
    hyperparameter: deepgrp_model.Options, data: Tuple[preprocessing.Data,
                                                       preprocessing.Data]
) -> Tuple[str, float]:
    """Train one model and measure how long it takes.

    The measured span covers creating the log directory, building the model
    and running the training.

    Args:
        hyperparameter: Options used to create the log directory and the
            model, and passed on to the training routine.
        data: Pair of data sets handed to ``training.training`` unchanged;
            this notebook passes ``(train_data, val_data)``.

    Returns:
        The log directory of the run and the elapsed time in seconds.
    """
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments during a long training run.
    starttime = time.perf_counter()
    logdir = deepgrp_model.create_logdir(hyperparameter)
    model = deepgrp_model.create_model(hyperparameter)
    training.training(data, hyperparameter, model, logdir)
    endtime = time.perf_counter()
    return logdir, endtime - starttime

### 3.1 Load training and validation data

In [None]:
# np.load on an .npz file returns an archive object that keeps the file open;
# the context manager closes it as soon as the 'fwd' array has been read.
with np.load(path.join(datadir, GENOMEBUILD,
                       TRAIN_CHR + ".fa.gz.npz")) as train_npz:
    Xfwd = train_npz['fwd']
with np.load(path.join(datadir, GENOMEBUILD,
                       VAL_CHR + ".fa.gz.npz")) as val_npz:
    Xfwd_val = val_npz['fwd']

In [None]:
# Labels for both chromosomes are built from the same "<build>.bed" file and
# the same list of repeats to search for.
bed_file = path.join(datadir, GENOMEBUILD + ".bed")
repeats = hyperparameter.repeats_to_search
Y = preprocessing.preprocess_y(bed_file, TRAIN_CHR, Xfwd.shape[1], repeats)
Y_val = preprocessing.preprocess_y(bed_file, VAL_CHR, Xfwd_val.shape[1],
                                   repeats)

Remove leading and trailing N's for training, because they do not contain repetitive elements

In [None]:
# Sequence and labels are passed together; both are replaced by the returned pair.
Xfwd, Y = preprocessing.drop_start_end_n(Xfwd, Y)
Xfwd_val, Y_val = preprocessing.drop_start_end_n(Xfwd_val, Y_val)

In [None]:
# Bundle each sequence with its labels into the container handed to training.
train_data = preprocessing.Data(Xfwd, Y)
val_data = preprocessing.Data(Xfwd_val, Y_val)

### 3.2 Run the training for DeepGRP

In [None]:
# Number of training runs performed with the same hyperparameter.
MODELS_TO_TRAIN = 6

In [None]:
# Maps each run's log directory to its training time in seconds.
results = {}

In [None]:
# Train the same configuration repeatedly, recording the duration of each run.
for _ in range(MODELS_TO_TRAIN):
    modelname, runtime = train_model(hyperparameter, (train_data, val_data))
    # Keyed by the returned log directory. NOTE(review): a repeated log
    # directory name would overwrite the earlier entry - confirm names are unique.
    results[modelname] = runtime

## 4. Save model to HDF5 format for the Python tool

In [None]:
def weights_to_model(hyperparameter: deepgrp_model.Options, logdir: str,
                     output: str) -> None:
    """Export the newest checkpoint found in ``logdir`` as an HDF5 model file.

    The file is written to ``<output>_<checkpoint name>.h5``. A
    ``FileNotFoundError`` carrying ``logdir`` is raised when the directory
    has no checkpoint.
    """
    checkpoint_manager = tf.train.CheckpointManager(tf.train.Checkpoint(),
                                                    logdir,
                                                    max_to_keep=None)
    latest = checkpoint_manager.latest_checkpoint
    if latest is None:
        raise FileNotFoundError(logdir)
    restored = deepgrp_model.create_model(hyperparameter)
    # expect_partial() silences warnings about checkpoint values left unused.
    restored.load_weights(latest).expect_partial()
    restored.save('_'.join((output, path.basename(latest))) + '.h5')

In [None]:
# The 'tf_logs/run-' part of each log directory is swapped for './model_' to
# form the prefix of the exported model file.
for logdir in results:
    export_prefix = logdir.replace('tf_logs/run-', './model_')
    weights_to_model(hyperparameter, logdir, export_prefix)

## 5. Save training times to CSV file

In [None]:
# Turn the {log directory: runtime} mapping into a two-column frame; the
# columns are named 'index' and 0 at this point.
results = pd.Series(results).to_frame().reset_index()

In [None]:
# Constant column labelling every row with the model name.
results['model'] = 'DeepGRP'

In [None]:
# Give the generated columns descriptive names.
results = results.rename(columns={'index': 'modelname', 0: 'training time'})

In [None]:
# Writes the table to 'training_times.csv' in the current working directory.
results.to_csv('training_times.csv')