In [1]:
# Padding length passed as max_len when each sequence of a pair is encoded.
MAX_LEN = 26
# Learning rate handed to the Adam optimizer in RNNRegression.configure_optimizers.
LR = 0.0005974060251967456
# Mini-batch size used by the training and validation dataloaders.
BATCH_SIZE = 4096

In [2]:
import os
import numpy as np
import pandas as pd
import deepdish as dd

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset

from torchnlp.encoders.text import CharacterEncoder

In [3]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, roc_auc_score, roc_curve, matthews_corrcoef, plot_confusion_matrix
from collections import defaultdict
import seaborn as sns

In [4]:
class RNNRegression(pl.LightningModule):
    """Bidirectional LSTM regressor over integer-encoded token sequences.

    Tokens are embedded, fed through a stacked bidirectional LSTM, and the
    LSTM output at sequence position 0 is passed through dropout and a linear
    layer that produces one scalar per sample.

    The dataloader hooks read the module-level names ``X_train``/``y_train``,
    ``X_val``/``y_val``, ``X_test``/``y_test`` and ``BATCH_SIZE``, and the
    optimizer reads ``LR``; all of them must exist before fitting or testing.
    """

    def __init__(self, vocab_size, emb_dim, hidden_size, num_layers, dropout,
                 head_dropout=0.21659149581080556):
        """Build the embedding, LSTM, dropout and output layers.

        Args:
            vocab_size: Number of distinct token indices the embedding accepts.
            emb_dim: Size of each token embedding.
            hidden_size: Hidden size of the LSTM, per direction.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout rate applied between LSTM layers.
            head_dropout: Dropout rate applied to the pooled LSTM output
                before the linear layer. Defaults to the value that was
                previously hard-coded.
        """
        super().__init__()
        self.hidden_size = hidden_size
        # Kept as a public attribute; nothing inside this class appends to it.
        self.out_predictions = []

        self.embeddings = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=True,
            batch_first=True,
        )
        self.dropout = nn.Dropout(head_dropout)

        # A bidirectional LSTM concatenates both directions, hence 2 * hidden_size.
        self.linear = nn.Linear(2 * hidden_size, 1)

    def forward(self, x):
        """Return one prediction per sample, shaped (batch, 1).

        Args:
            x: Batch of token indices laid out batch-first; it is cast to an
                integer tensor before the embedding lookup.
        """
        x = self.embeddings(x.long())
        x, _ = self.rnn(x)
        # Pool by taking the LSTM output at the first sequence position.
        cls_token_emb = x[:, 0, :]
        x = self.dropout(cls_token_emb)
        x = self.linear(x)
        return x

    def _mse_loss(self, batch):
        """Compute the mean-squared error between predictions and targets of a batch."""
        x, y = batch
        # Squeeze only the trailing feature axis: a bare squeeze() would also
        # drop the batch axis when the batch holds a single sample.
        return F.mse_loss(self(x).squeeze(-1), y)

    def training_step(self, batch, batch_idx):
        """Log and return the training MSE for one batch."""
        loss = self._mse_loss(batch)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using the global ``LR``."""
        return torch.optim.Adam(self.parameters(), lr=LR)

    def validation_step(self, batch, batch_idx):
        """Log and return the validation MSE for one batch."""
        val_loss = self._mse_loss(batch)
        self.log('val_loss', val_loss)
        return val_loss

    def test_step(self, batch, batch_idx):
        """Run a forward pass on the batch and discard the predictions.

        No metric is computed here; the step always returns 0.
        """
        x, _ = batch
        self(x)
        return 0

    def train_dataloader(self):
        """Shuffled loader over the global ``X_train``/``y_train`` pairs."""
        return torch.utils.data.DataLoader(
            list(zip(X_train, y_train)),
            batch_size=BATCH_SIZE,
            shuffle=True,
        )

    def val_dataloader(self):
        """Unshuffled loader over the global ``X_val``/``y_val`` pairs."""
        return torch.utils.data.DataLoader(
            list(zip(X_val, y_val)),
            batch_size=BATCH_SIZE,
            shuffle=False,
        )

    def test_dataloader(self):
        """Unshuffled loader over the global ``X_test``/``y_test`` pairs.

        Uses a fixed batch size of 8192 rather than ``BATCH_SIZE``.
        """
        return torch.utils.data.DataLoader(
            list(zip(X_test, y_test)),
            batch_size=8192,
            shuffle=False,
        )

In [6]:
# Load the three data splits (deepdish HDF5 files), in train/val/test order.
train, val, test = (
    dd.io.load(path)
    for path in ('../splits/train.h5', '../splits/val.h5', '../splits/test.h5')
)
# Load the matching regression targets (NumPy .npy files), in the same order.
y_train, y_val, y_test = (
    np.load(path)
    for path in ('../splits/y_train.npy', '../splits/y_val.npy', '../splits/y_test.npy')
)

In [7]:
# Rescale all targets by a factor of 100.
# NOTE: this cell rebinds the same names, so running it again multiplies by 100 a second time.
y_train, y_val, y_test = (
    targets * 100 for targets in (y_train, y_val, y_test)
)

In [8]:
# Every distinct sequence appearing as either member (index 0 or 1) of an item
# in any split; insertion order is train, val, test, first member before second.
all_seqs = {
    item[member]
    for dataset in (train, val, test)
    for member in (0, 1)
    for item in dataset
}

In [9]:
# Fit a character-level encoder on every sequence collected from the splits.
# NOTE(review): all_seqs is a set, so its iteration order is not guaranteed to be
# the same across interpreter runs. If CharacterEncoder assigns indices in order of
# first appearance, the vocabulary indices could differ from the ones the loaded
# checkpoint was trained with — TODO confirm (e.g. compare against sorted(all_seqs)).
encoder = CharacterEncoder(all_seqs)

In [10]:
# Map each vocabulary token to its position in encoder.vocab.
enc_dict = {token: index for index, token in enumerate(encoder.vocab)}

In [11]:
from torchnlp.encoders.text.default_reserved_tokens import DEFAULT_SOS_INDEX
from torchnlp.encoders.text.default_reserved_tokens import DEFAULT_EOS_INDEX
from torchnlp.encoders.text.default_reserved_tokens import DEFAULT_PADDING_INDEX

In [12]:
def split(X, y, split):
    """Randomly split (X, y) in two while keeping three target ranges balanced.

    Samples are grouped by target value into three bins: ``y < 0.1``,
    ``0.1 <= y <= 0.90`` and everything else. Each bin is shuffled in place
    with ``np.random.shuffle`` and its first ``int(len(bin) * split)`` samples
    go to the first partition; the remainder goes to the second.

    Args:
        X: Indexable collection of samples, aligned with ``y``.
        y: Iterable of target values.
        split: Fraction of each bin assigned to the first partition.

    Returns:
        Four lists: samples and targets of the first partition, then samples
        and targets of the second partition.
    """
    buckets = ([], [], [])
    for idx, target in enumerate(y):
        if target < 0.1:
            bucket = buckets[0]
        elif target <= 0.90:
            bucket = buckets[1]
        else:
            bucket = buckets[2]
        bucket.append((X[idx], target))

    first_X, first_y, second_X, second_y = [], [], [], []
    for bucket in buckets:
        # Shuffle within the bin so the cut below picks a random subset.
        np.random.shuffle(bucket)
        cut = int(len(bucket) * split)
        head, tail = bucket[:cut], bucket[cut:]
        first_X.extend(pair[0] for pair in head)
        first_y.extend(pair[1] for pair in head)
        second_X.extend(pair[0] for pair in tail)
        second_y.extend(pair[1] for pair in tail)

    return first_X, first_y, second_X, second_y

In [13]:
def encode_for_rnn(seq, max_len):
    """Encode a nucleotide sequence as a zero-padded integer vector.

    Each of the characters A, C, G and T is replaced by its index in the
    module-level ``enc_dict``. Positions beyond the end of ``seq`` stay 0,
    which is the default padding index of the character encoder.

    Args:
        seq: Iterable of nucleotide characters (A/C/G/T only).
        max_len: Length of the returned vector.

    Returns:
        Integer NumPy array of length ``max_len``.
    """
    base_to_index = {base: enc_dict[base] for base in 'ACGT'}
    encoded = np.zeros(max_len, dtype=int)

    for position, base in enumerate(seq):
        encoded[position] = base_to_index[base]
    return encoded

def encode_pair_for_rnn(seq1, seq2, max_len):
    """Encode two sequences into one vector with start/end markers.

    The layout is ``[SOS, seq1 (padded), EOS, seq2 (padded), EOS]``, so the
    result has ``2 * max_len + 3`` entries.

    Args:
        seq1: First sequence of the pair.
        seq2: Second sequence of the pair.
        max_len: Padded length used for each individual sequence.

    Returns:
        1-D NumPy array holding the concatenated encoding.
    """
    first = encode_for_rnn(seq1, max_len)
    second = encode_for_rnn(seq2, max_len)
    sos = np.array([DEFAULT_SOS_INDEX])
    eos = np.array([DEFAULT_EOS_INDEX])
    return np.concatenate((sos, first, eos, second, eos))

In [14]:
# Encode the two sequences of every item (item[0], item[1]) into one fixed-length
# integer vector per item, padding each sequence to MAX_LEN.
X_train = [encode_pair_for_rnn(item[0], item[1], MAX_LEN) for item in train]
X_val = [encode_pair_for_rnn(item[0], item[1], MAX_LEN) for item in val]
X_test = [encode_pair_for_rnn(item[0], item[1], MAX_LEN) for item in test]

In [15]:
# Convert each split's list of encoded vectors into one float64 array.
# The model casts its input back to integer indices inside forward().
X_train, X_val, X_test = (
    np.array(encoded, dtype=np.float64)
    for encoded in (X_train, X_val, X_test)
)

In [16]:
# Make double precision the default for newly created floating-point tensors,
# presumably so model parameters match float64 targets — TODO confirm target dtype.
# NOTE(review): both calls select double precision; the second looks redundant — confirm.
torch.set_default_tensor_type(torch.DoubleTensor)
torch.set_default_dtype(torch.double)

In [17]:
# Show the name of CUDA device 0, the device index the Trainer below is pinned to.
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3090'

In [17]:
# Restore the model from a checkpoint, passing the architecture hyper-parameters explicitly.
# NOTE(review): '...' looks like a redacted/placeholder checkpoint path — supply the real one.
model = RNNRegression.load_from_checkpoint('...', vocab_size=len(encoder.vocab), emb_dim=32, hidden_size=64, num_layers=3, dropout=0.2412375022122436)
# Switch to evaluation mode (dropout layers become inactive).
model.eval()

RNNRegression(
  (embeddings): Embedding(9, 32)
  (rnn): LSTM(32, 64, num_layers=3, batch_first=True, dropout=0.2412375022122436, bidirectional=True)
  (dropout): Dropout(p=0.21659149581080556, inplace=False)
  (linear): Linear(in_features=128, out_features=1, bias=True)
)

In [18]:
# Lightning trainer pinned to GPU index 0; used below to run the test loop.
trainer = pl.Trainer(gpus=[0])

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [19]:
# Create the pair of CUDA events used to time each test pass.
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
# Number of timed runs; timings holds one elapsed-time entry per run.
repetitions = 1
timings=np.zeros((repetitions,1))

# Time each complete trainer.test(model) call. The measured interval spans the
# whole call, not only the model's forward passes.
with torch.no_grad():
    for rep in range(repetitions):
        starter.record()
        trainer.test(model)
        ender.record()
        # Block until queued GPU work has finished so the end event is complete before it is read.
        torch.cuda.synchronize()
        # elapsed_time reports the time between the two events in milliseconds.
        curr_time = starter.elapsed_time(ender)
        timings[rep] = curr_time
# Mean and standard deviation over the runs; with a single repetition the std is 0.
mean_syn = np.sum(timings) / repetitions
std_syn = np.std(timings)
print(mean_syn)
print(std_syn)

Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------
17319.466796875
0.0


In [72]:
# Per-run elapsed times in milliseconds, one row per repetition.
# NOTE(review): the saved output has 10 rows although the timing cell sets
# repetitions = 1, so it came from a different execution of that cell — re-run to refresh.
timings

array([[16214.48828125],
       [15779.60449219],
       [15781.15527344],
       [15978.25976562],
       [15781.203125  ],
       [16011.27832031],
       [15892.85839844],
       [16094.82128906],
       [16198.18457031],
       [15897.16796875]])