# Training of dna-brnn with hyperparameters from DeepGRP

## 1) Setup

In [None]:
import os
from os import path
import time
import subprocess
import pandas as pd
from deepgrp import model as deepgrp_model

In [None]:
# Directory that contains the `data/` folder and `best_model.toml`.
PROJECT_ROOT_DIR = "."
# Genome build; names the BED file and the FASTA directory below `data/`.
GENOMEBUILD = "hg19"
# Chromosome whose annotations and sequence are used to build the training set.
TRAIN_CHR = "chr11"
# Presumably the validation chromosome; not referenced by the cells shown here.
VAL_CHR = "chr20"
# Directory the training-time CSV is written to.
RESULTS_PATH = './results'
# Thread count handed to dna-brnn through `-t`. The training loop reads
# THREADS, but the notebook never defined it, so running all cells ended in a
# NameError. Defaults to every available core; adjust to match the benchmark
# setup you want to measure.
THREADS = os.cpu_count() or 1

In [None]:
# Input files (genome-wide BED and per-build FASTA directory) are read from
# <project root>/data.
datadir = os.path.join(PROJECT_ROOT_DIR, "data")

## 2) Load hyperparameters from TOML

In [None]:
# TOML file in the project root from which the hyperparameters are read.
TOML_FILE = os.path.join(PROJECT_ROOT_DIR, "best_model.toml")

In [None]:
# Build the hyperparameter set from the TOML file via deepgrp's
# Options.from_toml, which is handed the open text stream.
with open(TOML_FILE) as toml_handle:
    hyperparameter = deepgrp_model.Options.from_toml(toml_handle)

In [None]:
# Number of dna-brnn models trained with the same hyperparameters; one
# training time is recorded per model.
MODELS_TO_TRAIN = 6

In [None]:
# Maps model file name -> elapsed training time in seconds.
results = {}

## 3) Train dna-brnn

In [None]:
def train_dnabrnn(hyperparameter, threads, training_file, outputfilename):
    """Train one dna-brnn model and return the elapsed wall-clock time.

    Runs the ``dna-nn/dna-brnn`` executable (path relative to the current
    working directory) and blocks until it exits. Everything the tool writes
    to stderr is appended to ``dnabrnnerror.log``.

    Args:
        hyperparameter: Object providing ``units`` and ``vecsize`` (formatted
            as integers for ``-n`` and ``-u``) and ``dropout`` (formatted with
            four decimals for ``-d``).
        threads: Value passed to ``-t``.
        training_file: Path of the training input (last positional argument).
        outputfilename: Path passed to ``-o``.

    Returns:
        Seconds elapsed between launching the process and its exit.

    Raises:
        subprocess.CalledProcessError: If dna-brnn exits with a non-zero
            status, so a failed run is never recorded as a training time.
    """
    command = [
        'dna-nn/dna-brnn', '-t{}'.format(threads),
        '-n{:d}'.format(hyperparameter.units),
        '-u{:d}'.format(hyperparameter.vecsize),
        '-d{:.4f}'.format(hyperparameter.dropout), '-b5m', '-m50', '-s14', '-o',
        outputfilename, training_file
    ]
    with open('dnabrnnerror.log', 'ab') as error_log:
        # perf_counter is monotonic, so system clock adjustments during a
        # long training run cannot distort the measured duration.
        start_time = time.perf_counter()
        completed = subprocess.run(command, stderr=error_log)
        elapsed = time.perf_counter() - start_time
    # Details of a failure are in dnabrnnerror.log.
    completed.check_returncode()
    return elapsed

### 3.1) Prepare training data

In [None]:
import re
# Matches lines that start with TRAIN_CHR followed by whitespace; the trailing
# \s keeps e.g. "chr1" from matching "chr11". The name is escaped so regex
# metacharacters in a sequence name (such as the "." in "GL000192.1") are
# matched literally instead of as wildcards.
_REGEX = re.compile(r'^{}\s'.format(re.escape(TRAIN_CHR)))

In [None]:
# Files generated inside the dna-nn directory for training.
training_fastq_path = path.join("dna-nn", TRAIN_CHR + '.lb4.fq')
training_bed = path.join('dna-nn', TRAIN_CHR + '.bed')
# Copy only the lines matching _REGEX from the genome-wide BED file. The
# generator is streamed by writelines, so no throwaway list of write() return
# values is built.
with open(path.join(datadir, GENOMEBUILD + ".bed")) as infile, \
        open(training_bed, 'w') as outfile:
    outfile.writelines(line for line in infile if _REGEX.search(line))

# Run dna-nn/gen-fq on the training chromosome's FASTA file and the filtered
# BED file; its stdout becomes the training FASTQ.
command = [
    "dna-nn/gen-fq", "-m4",
    path.join(datadir, GENOMEBUILD, TRAIN_CHR + '.fa'), training_bed
]
with open(training_fastq_path, 'wb') as file:
    process = subprocess.Popen(command, stdout=file)
    # Block until gen-fq has finished. Without the wait, the training cell
    # could start reading a FASTQ file that is still being written.
    process.wait()
if process.returncode != 0:
    # A failed run leaves a truncated or empty FASTQ behind; stop here instead
    # of training on it.
    raise subprocess.CalledProcessError(process.returncode, command)

### 3.2) Run the training

In [None]:
# Train the models one after another and store how many seconds each run
# took, keyed by the model file name.
for run_index in range(MODELS_TO_TRAIN):
    modelname = 'dnabrnn_model{:d}.knm'.format(run_index)
    elapsed_seconds = train_dnabrnn(
        hyperparameter, THREADS, training_fastq_path, modelname)
    results[modelname] = elapsed_seconds

## 4) Save training times to CSV file

In [None]:
# Convert the {model file name: seconds} dict into a DataFrame: the keys end
# up in an "index" column and the values in a column named 0.
training_times = pd.Series(results)
results = training_times.to_frame().reset_index()

In [None]:
# Add a constant column so every row carries the name of the tool.
results['model'] = 'dnabrnn'

In [None]:
# Replace the auto-generated column labels ("index" and 0) with descriptive
# names, modifying the frame in place.
results.rename(columns={'index': 'modelname', 0: 'training time'},
               inplace=True)

In [None]:
# Create the output directory first: to_csv does not create missing
# directories, and failing here would discard every measured training time.
os.makedirs(RESULTS_PATH, exist_ok=True)
results.to_csv(os.path.join(RESULTS_PATH, 'training_times_dnabrnn.csv'))