In [8]:
import torch
from torch import nn
import torch.autograd as autograd

from itertools import product
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import utils as u

In [3]:
# create all possible 8-mers
seqs8 = [''.join(x) for x in product(['A','C','G','T'], repeat=8)]
print('Total 8mers:',len(seqs8))

Total 8mers: 65536


In [4]:
# methods for assigning scores to a particular DNA sequence
score_dict = {
    'A':20,
    'C':17,
    'G':14,
    'T':11
}
def score_seqs(seqs):
    '''Each seq is just the average of the letter scores wrt score_dict'''
    data = []
    for seq in seqs:
        score = np.mean([score_dict[base] for base in seq])
        data.append([seq,score])
        
    df = pd.DataFrame(data, columns=['seq','score'])
    return df
                  
def score_seqs_motif(seqs):
    '''
    Each seq is the average of the letter scores wrt score_dict but if
    it has a TAT it gets a +10 but if it has a GCG it gets a -10
    '''
    data = []
    for seq in seqs:
        score = np.mean([score_dict[base] for base in seq])
        if 'TAT' in seq:
            score += 10
        if 'GCG' in seq:
            score -= 10
        data.append([seq,score])
        
    df = pd.DataFrame(data, columns=['seq','score'])
    return df

In [5]:
mer8 = score_seqs(seqs8)
mer8.head()

Unnamed: 0,seq,score
0,AAAAAAAA,20.0
1,AAAAAAAC,19.625
2,AAAAAAAG,19.25
3,AAAAAAAT,18.875
4,AAAAAACA,19.625


In [6]:
mer8_motif = score_seqs_motif(seqs8)
mer8_motif

Unnamed: 0,seq,score
0,AAAAAAAA,20.000
1,AAAAAAAC,19.625
2,AAAAAAAG,19.250
3,AAAAAAAT,18.875
4,AAAAAACA,19.625
...,...,...
65531,TTTTTTGT,11.375
65532,TTTTTTTA,12.125
65533,TTTTTTTC,11.750
65534,TTTTTTTG,11.375


In [9]:
# load stuff into pytorch dataloaders
mer8motif_train_dl,\
mer8motif_test_dl, \
mer8motif_train_df, \
mer8motif_test_df = u.build_dataloaders_single(mer8_motif,batch_size=11)
# change to batch size 11 so I can figure out the dimension errors

In [10]:
mer8motif_train_dl

<torch.utils.data.dataloader.DataLoader at 0x7f7e9e2f70a0>

In [11]:
print(mer8motif_train_dl.batch_size)

11


In [14]:
def loss_batch(model, loss_func, xb, yb, opt=None):
    '''
    Apply loss function to a batch of inputs. If no optimizer
    is provided, skip the back prop step.
    '''
    print('loss batch ****')
    print("xb shape:",xb.shape)
    print("yb shape:",yb.shape)

    xb_out = model(xb.float())
    print("model out pre loss", xb_out.shape)
    loss = loss_func(xb_out, yb.float())

    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()

    #print("lb returning:",loss.item(), len(xb))
    return loss.item(), len(xb)

def fit(epochs, model, loss_func, opt, train_dl, test_dl):
    '''
    Fit the model params to the training data, eval on unseen data.
    Loop for a number of epochs and keep train of train and val losses 
    along the way
    '''
    # keep track of losses
    train_losses = []    
    test_losses = []
    
    # loops through epochs
    for epoch in range(epochs):
        #print("TRAIN")
        model.train()
        ts = []
        ns = []
        # collect train loss; provide opt so backpropo happens
        for xb, yb in train_dl:
            t, n = loss_batch(model, loss_func, xb, yb, opt=opt)
            ts.append(t)
            ns.append(n)
        train_loss = np.sum(np.multiply(ts, ns)) / np.sum(ns)
        train_losses.append(train_loss)
        
        #print("EVAL")
        model.eval()
        with torch.no_grad():
            losses, nums = zip(
                # loop through test batches
                # returns loss calc for test set batch size
                # unzips into two lists
                *[loss_batch(model, loss_func, xb, yb) for xb, yb in test_dl]
                # Note: no opt provided, backprop won't happen
            )
        # Gets average MSE loss across all batches (may be of diff sizes, hence the multiply)
        #print("losses", losses)
        #print("nums", nums)
        test_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)

        print(epoch, test_loss)
        test_losses.append(test_loss)

    return train_losses, test_losses

def run_model(train_dl,test_dl, model, lr=0.01, epochs=20):
    '''
    Given data and a model type, run dataloaders with MSE loss and SGD opt
    '''
    # define loss func and optimizer
    loss_func = torch.nn.MSELoss() 
    optimizer = torch.optim.SGD(model.parameters(), lr=lr) 
    
    # run the training loop
    train_losses, test_losses = fit(epochs, model, loss_func, optimizer, train_dl, test_dl)
    
    #return model, train_losses, test_losses
    return train_losses, test_losses

## Attempt to build LSTM

In [16]:
class DNA_LSTM(nn.Module):
    def __init__(self,seq_len,hidden_dim=10):
        super().__init__()
        self.seq_len = seq_len

        self.hidden_dim = hidden_dim
        self.hidden = None # when initialized, should be tuple of (hidden state, cell state)
        
        self.rnn = nn.LSTM(4, hidden_dim,batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
            

    
    def init_hidden(self,batch_size):
        # initialize hidden and cell states with 0s
        self.hidden =  (torch.zeros(1, batch_size, self.hidden_dim), 
                        torch.zeros(1, batch_size, self.hidden_dim))
        return self.hidden
        #hidden_state = torch.randn(n_layers, batch_size, hidden_dim)
    

    def forward(self, xb):
        print("original xb.shape:", xb.shape)
        print(xb) # 11 x 32
        
        # make the one-hot nucleotide vectors group together
        xb = xb.view(-1,self.seq_len,4) 
        print("re-viewed xb.shape:", xb.shape) # >> 11 x 8 x 4
        print(xb)

        # ** Init hidden/cell states?? **
        batch_size = xb.shape[0]
        print("batch_size:",batch_size)
        (h,c) = self.init_hidden(batch_size)
         
        # *******
        
        lstm_out, self.hidden = self.rnn(xb, (h,c)) # should this get H and C?
        #print("lstm_out",lstm_out)
        print("lstm_out shape:",lstm_out.shape) # >> 11, 8, 10
        print("lstm_out[-1] shape:",lstm_out[-1].shape) # >> 8 x 10
        print("lstm_out[-1][-1] shape:",lstm_out[-1][-1].shape) # 10
        
        print("hidden len:",len(self.hidden)) # 2
        print("hidden[0] shape:", self.hidden[0].shape) # >> 1 x 11 x 10
        print("hidden[0][-1] shape:", self.hidden[0][-1].shape) # >> 11 X 10
        print("hidden[0][-1][-1] shape:", self.hidden[0][-1][-1].shape) # >> 10
        
        print("*****")
        # These vectors should be the same, right?
        A = lstm_out[-1][-1]
        B = self.hidden[0][-1][-1]
        print("lstm_out[-1][-1]:",A)
        print("self.hidden[0][-1][-1]",B)
        print("==?", A==B)
        print("*****")
        
        print()
        print("what does this reshaping do?")
        print("lstm_out shape:",lstm_out.shape)  # out.shape = (batch_size, seq_len, hidden_size) >> 11, 8, 10
        out = lstm_out.contiguous().view(-1, self.hidden_dim) # out.shape = (seq_len, hidden_size) 
                                                              # THIS IS WRONG ^^ it's batch_size*seq_len?! 
                                                              #                       88 x 10
        print("re-viewed lstm_out shape:",out.shape) 
        
        out = self.fc(out)                                   # this is now 88 X 1 instead of 11 X 1
        print("LSTM->FC out shape:",out.shape)               # This makes it break at the loss function
        return out

In [17]:
seq_len = len(mer8motif_train_df['seq'].values[0])

mer8motif_model_lstm = DNA_LSTM(seq_len)
mer8motif_model_lstm

DNA_LSTM(
  (rnn): LSTM(4, 10, batch_first=True)
  (fc): Linear(in_features=10, out_features=1, bias=True)
)

In [18]:
train_losses,test_losses= run_model(
    mer8motif_train_dl,
    mer8motif_test_dl,
    mer8motif_model_lstm, 
    lr=0.01
)


loss batch ****
xb shape: torch.Size([11, 32])
yb shape: torch.Size([11, 1])
original xb.shape: torch.Size([11, 32])
tensor([[1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
         0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0.,
         0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1.,
         0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.,
         0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1.],
 

  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (88) must match the size of tensor b (11) at non-singleton dimension 0