# Training pipelines from scratch

In [1]:
import torch
import pandas as pd
import numpy as np

import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader

from sklearn.metrics import r2_score, roc_auc_score, f1_score, matthews_corrcoef, precision_score, recall_score

from random import shuffle
from time import time
from utils import (
    load_yeast_promoters,
    load_human_promoters,
    one_hot_encode, 
    seq2kmer,
    shifting_batch_generator,
    to_tensor,
    PromoterSequences,
    PromoterTokenizer,
    ConvNet,
    DNABERT,
    train_loop,
    test_loop,
    load_dnabert,
    plot_predictions
)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Report the choice so the reader knows where the model will run.
print('Using {} device'.format(device))

Using cpu device


## Load in desired data
First choose the desired pipeline to be trained:

Select either human or yeast promoters by setting `promoter` to `'human'` or `'yeast'`.

Select either one-hot encoding or DNABERT embeddings by setting `embedding` to `'onehot'` or `'dnabert'`.

If using DNABERT, select either a CNN or a dense layer with the boolean flag `cnn`.

In [2]:
promoter = 'human' # Change me
embedding = 'onehot' # Change me
cnn = False # Change me (if using DNABERT embedding)

# Fail fast on a mistyped option. Without these checks an unknown value skips
# every branch below and only surfaces later as a NameError on `train`/`trainX`.
if promoter not in ('yeast', 'human'):
    raise ValueError(f"promoter must be 'yeast' or 'human', got {promoter!r}")
if embedding not in ('onehot', 'dnabert'):
    raise ValueError(f"embedding must be 'onehot' or 'dnabert', got {embedding!r}")

# One model output for yeast, two for human; the training cell pairs these
# with SmoothL1Loss and CrossEntropyLoss respectively.
n_outputs = 1 if promoter == 'yeast' else 2

# The yeast loader returns three splits (train, val, test); the human loader
# returns only train and val, so `test` is not defined for human promoters.
if promoter == 'yeast':
    train, val, test = load_yeast_promoters('example/data/')
else:
    train, val = load_human_promoters('example/data/')

# Labels are taken from the same column regardless of the embedding.
trainy, valy = train['label'].values, val['label'].values

if embedding == 'onehot':
    print('One-hot encoding sequences')
    trainX = one_hot_encode(train['Seq'])
    valX = one_hot_encode(val['Seq'])

    batch_size = 32

    # Only the validation loader is built here; the one-hot training loader is
    # created in the model cell because it depends on SHIFT.
    val_dataset = PromoterSequences((valX, valy))
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
else:
    print('Converting sequences to kmers')
    # Sequences are converted with seq2kmer(x, 6); tokenisation and the
    # DNABERT batch size are handled in the model cell.
    trainX = train['Seq'].apply(lambda x: seq2kmer(x, 6)).values
    valX = val['Seq'].apply(lambda x: seq2kmer(x, 6)).values

Loaded human promoters dataset 
 53277 Training examples 
 5919 Validation examples 

One-hot encoding sequences
one-hot encoding sequences: 100%|█████████████████████████████████████████████████████████| 53277/53277 [00:00<00:00, 60017.91it/s]
one-hot encoding sequences: 100%|███████████████████████████████████████████████████████████| 5919/5919 [00:00<00:00, 65491.74it/s]


## Create model
This cell instantiates a PyTorch model and the training data loader (a `DataLoader` for DNABERT, a shifting batch generator for one-hot encoding) according to the `promoter` and `embedding` selected.

In [3]:
# Shift passed to shifting_batch_generator for the one-hot pipeline.
# It is defined outside the branch because the training cell passes SHIFT to
# test_loop for every pipeline; when it lived inside the one-hot branch, a
# fresh kernel with embedding == 'dnabert' failed there with a NameError.
# NOTE(review): confirm test_loop ignores this value for the DNABERT pipeline.
SHIFT = 4

if embedding == 'onehot':
    in_c = 4
    # NOTE(review): the 0.5**6 factor presumably mirrors six halvings of the
    # sequence length inside ConvNet -- confirm against its definition.
    hidden_size = int(np.ceil(trainX.shape[1]*(0.5**6)))

    model = ConvNet(n_outputs, in_c, hidden_size)
    model.to(device)
    print(f'{embedding} Model loaded.')

    # batch_size comes from the data-loading cell for the one-hot pipeline.
    train_loader = shifting_batch_generator(trainX, trainy, batch_size, SHIFT)
    input_shape = (batch_size, 4, trainX.shape[1] - SHIFT + 1)

elif embedding == 'dnabert':
    config, tokenizer, dnabert_base = load_dnabert('example/dnabert_model_base', n_outputs)

    model = DNABERT(dnabert_base, config, n_outputs, cnn)
    model.to(device)
    print(f'{embedding} Model loaded.')

    # The DNABERT pipeline uses its own, much smaller batch size.
    batch_size = 2

    train_dataset = PromoterTokenizer((trainX, trainy), tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Validation for DNABERT only uses the first 100 validation examples.
    val_dataset = PromoterTokenizer((valX[:100], valy[:100]), tokenizer)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

onehot Model loaded.


## Training

In [4]:
# --- Optimisation settings -------------------------------------------------
learning_rate = 1e-5

# DNABERT is trained for 3 epochs, every other pipeline for 100.
if embedding == 'dnabert':
    epochs = 3
else:
    epochs = 100

# The one-hot pipeline scales the example count by SHIFT before dividing by
# the batch size (rounded up); the other pipeline uses the floor of
# dataset size / batch size.
if embedding == 'onehot':
    steps_per_epoch = np.ceil(SHIFT*trainX.shape[0]/batch_size).astype(int)
else:
    steps_per_epoch = len(train_dataset)//batch_size

# Yeast promoters use SmoothL1Loss, everything else CrossEntropyLoss.
if promoter == 'yeast':
    loss_fn = nn.SmoothL1Loss()
else:
    loss_fn = nn.CrossEntropyLoss()

optimizer = Adam(model.parameters(), lr=learning_rate)

# --- Train / validate ------------------------------------------------------
print(f'Using {embedding} pipeline on {promoter} promoters.\nTraining for {epochs} epochs')

for t in range(1, epochs+1):
    print(f"Epoch {t}\n-------------------------------")

    # One pass over the training loader, then one pass over the validation
    # loader; the value returned by test_loop is discarded.
    train_loop(model, train_loader, loss_fn, optimizer, embedding, steps_per_epoch, promoter)
    _ = test_loop(model, val_loader, loss_fn, embedding, promoter, SHIFT)

print("Done!")

Using onehot pipeline on human promoters.
Training for 100 epochs
Epoch 1
-------------------------------
  6%|█████▍                                                                                     | 401/6660 [00:23<05:59, 17.40it/s]


KeyboardInterrupt: 