# Hyperparameter Optimization for DeepGRP

## 1. Setup

In [None]:
# Base directory of the project: the "data" folder is resolved relative to it
# and it is also handed to run_a_trial in the optimization loop.
PROJECT_ROOT_DIR = "."

In [None]:
import tensorflow as tf
from deepgrp.model import Options
from deepgrp.preprocessing import preprocess_y, drop_start_end_n, Data
from deepgrp.optimization import run_a_trial, build_and_optimize
from hyperopt import hp
from functools import partial
import numpy as np
from os import path

## 2. Setting basic configuration

In [None]:
# Chromosome whose sequence and annotations are used for training.
TRAIN_CHR = "chr11"
# Chromosome held out for validation during the hyperparameter search.
VAL_CHR = "chr20"
# Genome build; names both the per-build data sub-directory and the .bed file.
GENOMEBUILD = "hg19"

In [None]:
# All inputs live beneath <PROJECT_ROOT_DIR>/data.
datadir = path.join(PROJECT_ROOT_DIR, "data")
# Per-chromosome arrays: <datadir>/<genome build>/<chromosome>.fa.gz.npz
train_data_file = path.join(datadir, GENOMEBUILD, f"{TRAIN_CHR}.fa.gz.npz")
val_data_file = path.join(datadir, GENOMEBUILD, f"{VAL_CHR}.fa.gz.npz")
# Annotation file named after the genome build: <datadir>/<genome build>.bed
true_annotations = path.join(datadir, f"{GENOMEBUILD}.bed")

In [None]:
# Start from the default deepgrp Options and override two fields.
options = Options()
# Values forwarded to preprocess_y when the labels are built
# (presumably repeat-class identifiers -- TODO confirm against deepgrp docs).
options.repeats_to_search = [1, 2, 3, 4]
# Switch the model's attention option on.
options.attention = True

## 3. Loading training and validation data

In [None]:
# Load the 'fwd' array of the training and validation chromosomes from their
# .npz archives. np.load requires the file to read; the archives are opened in
# a `with` block so the underlying file handles are closed once the arrays have
# been read into memory.
with np.load(train_data_file) as train_npz:
    Xfwd = train_npz['fwd']
with np.load(val_data_file) as val_npz:
    Xfwd_val = val_npz['fwd']

In [None]:
# Build the label arrays for both chromosomes from the annotation file.
# The third argument is the size of the second axis of the loaded input
# (presumably the sequence length -- TODO confirm against preprocess_y).
Y = preprocess_y(
    true_annotations,
    TRAIN_CHR,
    Xfwd.shape[1],
    options.repeats_to_search,
)
Y_val = preprocess_y(
    true_annotations,
    VAL_CHR,
    Xfwd_val.shape[1],
    options.repeats_to_search,
)

Remove the leading and trailing N's from the training and validation sequences, because these regions do not contain repetitive elements.

In [None]:
# Trim inputs and labels together; each call returns the (input, label) pair
# that replaces the untrimmed one.
Xfwd, Y = drop_start_end_n(Xfwd, Y)
Xfwd_val, Y_val = drop_start_end_n(Xfwd_val, Y_val)

In [None]:
# Bundle each input array with its labels into a deepgrp Data object; these
# two objects are what build_and_optimize receives.
train_data = Data(Xfwd, Y)
val_data = Data(Xfwd_val, Y_val)

## 4. Defining a hyperopt search space

In [None]:
# Hyperopt search space. The dict key is the name under which a sampled value
# reaches the objective; the first argument of each hp.* call is hyperopt's own
# label for that parameter in the recorded trials.
SEARCH_SPACE = {
    # hp.qnormal(label, mu, sigma, q): normal draw rounded to a multiple of q.
    'vecsize': hp.qnormal('vecsize', 200, 20, 2),
    'gru_units': hp.qnormal('gru_units', 34, 5, 2),
    # hp.uniform(label, low, high): uniform draw between low and high.
    'gru_dropout': hp.uniform('gru_dropout', 0, 0.4),
    'momentum': hp.uniform('momentum', 0, 1),
    # hp.lognormal(label, mu, sigma): exp(normal(mu, sigma)); the median is
    # exp(-7), roughly 9.1e-4.
    'learning_rate': hp.lognormal('learning_rate', -7, 0.5),
    # NOTE(review): the label 'decay' differs from the key 'rho', so recorded
    # trials list this parameter as 'decay' -- confirm this is intended.
    'rho': hp.uniform('decay', 0, 1),
    'repeat_probability': hp.uniform('repeat_probability', 0, 0.49),
}

## 6. Build an optimizable function

The function takes the varying hyperparameters (a dict) as its single argument.

In [None]:
# Pre-bind the fixed arguments of build_and_optimize so that only the sampled
# hyperparameter dict remains to be supplied by the optimizer.
# NOTE(review): 50 is presumably the number of training epochs per trial --
# confirm against the build_and_optimize signature.
objective = partial(build_and_optimize, train_data, val_data, 50, options)

## 7. Run the optimization

By default, the negative Matthews correlation coefficient is minimized, which is equivalent to maximizing the Matthews correlation coefficient.

In [None]:
# Upper bound of the batch loop's range(); with a stride of save_step the loop
# body executes ceil(runs / save_step) = 34 times.
runs = 100
# Loop stride, also passed to run_a_trial as its last argument (presumably the
# number of evaluations between saves -- TODO confirm against run_a_trial).
save_step = 3

In [None]:
# Run the search in batches: one run_a_trial call per step of `save_step`
# until `runs` is reached.
import logging

for i in range(0, runs, save_step):
    try:
        run_a_trial(SEARCH_SPACE, objective, PROJECT_ROOT_DIR, save_step)
    except Exception:
        # Best-effort: a failing batch must not abort the whole search, but the
        # failure is logged with its traceback instead of vanishing silently.
        logging.exception(
            "Optimization batch starting at step %d failed; continuing", i)