In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from data_loader.data_loader import PhageLoader
from torch.utils.data.sampler import SubsetRandomSampler
import seaborn as sns

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
class GRU(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, output_labels, number_of_layers=1, bidirectional=True, weights_matrix=None):
        super(GRU, self).__init__()
        
        self.bidirectional = bidirectional
        self.hidden_dim_dense = hidden_size
        self.num_layers = number_of_layers 
        if bidirectional:
            self.hidden_dim_dense = hidden_size * 2
        if len(weights_matrix.size())!=0:
            self.emb_layer = self.create_emb_layer(weights_matrix)          
        else:
            self.emb_layer = nn.Embedding(vocab_size, embedding_size)
            
        self.gru = nn.GRU(input_size=embedding_size, hidden_size=hidden_size, bidirectional=bidirectional, num_layers=number_of_layers,batch_first=True)
        self.linear = nn.Linear(self.hidden_dim_dense, output_labels)
        
        
    def forward(self, h_t1, indexes):
        #indexes -> (batch,seq_length)
        embedding = self.emb_layer(indexes)
        #print("EMBEDDING SHAPE: ", embedding.size())
        #embedding -> (batch,seq_length,embedding_size)
        out,h_t = self.gru(embedding, h_t1)
        #print("OUT SHAPE: ", out.size(), self.hidden_dim_dense)
        #out -> ()
        #out = out.view(self.hidden_dim_dense, -1)
        
        out = F.relu(self.linear(out))
        out2 = F.softmax(out,dim=2)
        out = F.log_softmax(out,dim=2)
        
        return out, out2,h_t
        
    def initHidden(self, batch_size, hidden_size):
        if self.bidirectional:
            return torch.randn(self.num_layers*2, batch_size, hidden_size, device=device)
        else:
            return torch.randn(self.num_layers, batch_size, hidden_size, device=device)

    def create_emb_layer(self, weights_matrix, non_trainable=False):
        num_embeddings, embedding_dim = weights_matrix.size()
        emb_layer = nn.Embedding(num_embeddings, embedding_dim)
        emb_layer.load_state_dict({'weight': weights_matrix})
        if non_trainable:
            emb_layer.weight.requires_grad = False
        return emb_layer
    
    

In [3]:
loader = PhageLoader("data/")
read_length = 100

In [8]:
k_size=1
batch_size = 16
read_length = 100
dataset = loader.get_data_set(n_files='all',read_length=read_length, batch_size=batch_size, k=k_size, stride=1, embedding="dict", embed_size=None, drop_last=False)

In [9]:
def split_sets(dataset):
    n = len(dataset)  # how many total elements you have
    test_size = .1
    n_test = int( n * test_size )  # number of test/val elements
    n_train = n - 2 * n_test

    idx = list(range(n))  # indices to all elements
    np.random.shuffle(idx)  # in-place shuffle the indices to facilitate random splitting
    train_idx = idx[:n_train]
    val_idx = idx[n_train:(n_train + n_test)]
    test_idx = idx[(n_train + n_test):]

    print(n,len(train_idx),len(val_idx),len(test_idx))
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(val_idx)
    test_sampler = SubsetRandomSampler(test_idx)

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, 
                                               sampler=train_sampler,drop_last=True)
    validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                    sampler=valid_sampler,drop_last=True)
    test_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                    sampler=test_sampler,drop_last=True)
    return train_loader, validation_loader, test_loader


In [10]:
def get_initial_embeddings(k=3):
    if(k<=2):
        return torch.from_numpy(np.eye(4**k))
    dictionary = loader.get_dict(k,'dna2vec')
    indexes = loader.get_dict(k)
    size = len(indexes)
    matrix = np.zeros((size,100))
    for key, value in indexes.items():
        matrix[value] = dictionary[key]
    return torch.from_numpy(matrix)

In [11]:
def check_gradients(model):
    gru = model.gru
    for p,n in zip(gru.parameters(),gru._all_weights[0]):
        if n[:6] == 'weight':
            print('===========\ngradient:{}\n----------\n{}'.format(n,p.grad.abs().sum()))

In [12]:
vocab_size = 4**k_size
embedding_layer_size = 4
hidden_size = 16
output_labels=2
initial_embedding = get_initial_embeddings(k_size)
model = GRU(vocab_size, embedding_layer_size, hidden_size, output_labels,weights_matrix=initial_embedding)
model.to(device)
val_accuracies = []
train_accuracies = []

hidden = model.initHidden(batch_size,hidden_size)
print(hidden)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.02)

losses = []
running_loss = 0
epochs = 2

train_loader, validation_loader, test_loader = split_sets(dataset)

for epoch in range(epochs):
    for b, (x, y) in enumerate(train_loader):
        #gives batches of size "batch_size, read_length"
        model.zero_grad()
        x, y = x.to(device), y.to(device)
        x = x.type(torch.LongTensor)
        y = y.type(torch.LongTensor)
            
        out,_,hidden = model(hidden,x)
        y = y.view(batch_size*read_length)
        out = out.view(batch_size*read_length,output_labels)
        loss = loss_function(out,y)
        running_loss = running_loss + loss.item()
        
        
        loss.backward()
        optimizer.step()
        
        if b % 20 == 19:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, b + 1, running_loss / 100))
            losses.append(running_loss)
            running_loss = 0.0
            check_gradients(model)
            
        hidden.detach_()
    train_acc = accuracy_test(train_loader,model,hidden)
    val_acc = accuracy_test(validation_loader,model,hidden)
    val_accuracies.append(val_acc)
    train_accuracies.append(train_acc)
    losses.append(running_loss)
    running_loss = 0.0

tensor([[[-5.5212e-01, -9.0010e-01, -2.4534e-01, -4.5929e-01,  7.2408e-01,
          -2.7522e-01, -1.4250e+00, -5.1070e-01, -5.8889e-01, -5.9803e-01,
          -3.4178e-01, -1.1529e+00,  1.5320e+00, -9.5519e-01, -5.2789e-01,
           1.7838e+00],
         [-3.7691e-01, -6.6096e-01, -6.4870e-01, -5.7640e-01, -2.6654e-01,
          -5.7906e-01,  1.1511e+00, -8.2056e-01,  1.5204e+00,  1.6127e-01,
           1.0147e+00,  1.6614e+00, -1.1325e+00, -9.0000e-01,  5.6217e-01,
           4.1829e-01],
         [ 9.9227e-02, -6.3150e-01, -1.1611e+00,  6.3215e-01, -3.3317e-01,
           2.6595e-01,  5.0291e-01,  1.0916e+00, -8.6107e-01, -2.8782e-01,
           9.0368e-01, -2.4702e-01,  3.8838e-01,  1.9176e-01,  2.7612e-01,
           4.3105e-01],
         [-1.6332e+00, -1.4157e+00, -4.3189e-01,  1.1062e+00,  1.1501e+00,
          -9.4518e-01,  2.5337e-01,  1.9763e+00, -5.7808e-01,  2.4201e-01,
          -8.6053e-01, -7.6178e-01, -1.5077e-01,  4.6349e-01, -6.6602e-01,
           1.0706e-01],
    

[1,    60] loss: 0.079
gradient:weight_ih_l0
----------
0.3335218131542206
gradient:weight_hh_l0
----------
0.320604145526886
[1,    80] loss: 0.080
gradient:weight_ih_l0
----------
0.13990512490272522
gradient:weight_hh_l0
----------
0.13841937482357025
[1,   100] loss: 0.079
gradient:weight_ih_l0
----------
0.13370203971862793
gradient:weight_hh_l0
----------
0.13777995109558105
[1,   120] loss: 0.075
gradient:weight_ih_l0
----------
0.044608019292354584
gradient:weight_hh_l0
----------
0.02331388369202614
[1,   140] loss: 0.060
gradient:weight_ih_l0
----------
0.2683221697807312
gradient:weight_hh_l0
----------
0.29504281282424927
[1,   160] loss: 0.060
gradient:weight_ih_l0
----------
0.24075278639793396
gradient:weight_hh_l0
----------
0.2711490988731384
[1,   180] loss: 0.060
gradient:weight_ih_l0
----------
0.12200427800416946
gradient:weight_hh_l0
----------
0.14602281153202057
[1,   200] loss: 0.064
gradient:weight_ih_l0
----------
0.17346130311489105
gradient:weight_hh_l0
---

In [50]:
def accuracy_test(loader,model,hidden):
    total = 0
    correct = 0
    with torch.no_grad():
       
        for b, (x, y) in enumerate(loader):
            x = x.type(torch.LongTensor)
            y = y.type(torch.LongTensor)
            out,_,hidden = model(hidden,x)
            y = y.view(batch_size*read_length)
            out = out.view(batch_size*read_length,output_labels).exp()
            out_index = torch.max(out,dim=-1).indices
            correct += (out_index.eq(y)).sum()
            total += len(y)
    return correct.item()/total

In [52]:
accuracy_test(train_loader,model,hidden)

0.9124969951923076