In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from data_loader.data_loader import PhageLoader
from torch.utils.data.sampler import SubsetRandomSampler
import seaborn as sns
import json
sns.set(rc={'figure.figsize':(15,10)})

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Network class (this might be better off in a separate file)

In [6]:
class GRU(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, output_labels, number_of_layers=1, bidirectional=True, weights_matrix=None):
        super(GRU, self).__init__()
        
        self.bidirectional = bidirectional
        self.hidden_dim_dense = hidden_size
        self.num_layers = number_of_layers 
        if bidirectional:
            self.hidden_dim_dense = hidden_size * 2
        if len(weights_matrix.size())!=0:
            self.emb_layer = self.create_emb_layer(weights_matrix)          
        else:
            self.emb_layer = nn.Embedding(vocab_size, embedding_size)
            
        self.gru = nn.GRU(input_size=embedding_size, hidden_size=hidden_size, bidirectional=bidirectional, num_layers=number_of_layers,batch_first=True)
        self.linear = nn.Linear(self.hidden_dim_dense, int(self.hidden_dim_dense/2))
        self.linear2 = nn.Linear(int(self.hidden_dim_dense/2),output_labels)
        
    def forward(self, h_t1, indexes):
        #indexes -> (batch,seq_length)
        embedding = self.emb_layer(indexes)
        #print("EMBEDDING SHAPE: ", embedding.size())
        #embedding -> (batch,seq_length,embedding_size)
        out,h_t = self.gru(embedding, h_t1)
        #print("OUT SHAPE: ", out.size(), self.hidden_dim_dense)
        #out -> ()
        #out = out.view(self.hidden_dim_dense, -1)
        
        out = F.relu(self.linear(out))
        out = self.linear2(out)
        out = F.log_softmax(out,dim=2)
        
        return out,h_t
        
    def initHidden(self, batch_size, hidden_size):
        if self.bidirectional:
            return torch.randn(self.num_layers*2, batch_size, hidden_size, device=device)
        else:
            return torch.randn(self.num_layers, batch_size, hidden_size, device=device)

    def create_emb_layer(self, weights_matrix, non_trainable=False):
        num_embeddings, embedding_dim = weights_matrix.size()
        emb_layer = nn.Embedding(num_embeddings, embedding_dim)
        emb_layer.load_state_dict({'weight': weights_matrix})
        if non_trainable:
            emb_layer.weight.requires_grad = False
        return emb_layer

## Computing fordward pass (predicting)

In [None]:
# get data


In [8]:
# read model
model = torch.load('models_grid/model1_100', map_location='cpu')

hidden = 
torch.randn(self.num_layers*2, batch_size, hidden_size, device=device)
model.initHidden(batch_size,hidden_size)

out,hidden = model(hidden,x)
        y = y.view(batch_size*read_length)
        out = out.view(batch_size*read_length,output_labels)
        loss = loss_function(out,y)
        running_loss = running_loss + loss.item()



In [9]:
model

GRU(
  (emb_layer): Embedding(64, 100)
  (gru): GRU(100, 30, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=60, out_features=30, bias=True)
  (linear2): Linear(in_features=30, out_features=2, bias=True)
)

In [None]:
# do predictions
json1_file = 
json1_str = json1_file

In [78]:
weight_list = []
for name in model.named_parameters():
    if 'weight' in name[0]:
        print(name[0])
        weight_list.append(name[1])
print(weight_list[0].shape)
print(weight_list[1].shape)

emb_layer.weight
gru.weight_ih_l0
gru.weight_hh_l0
gru.weight_ih_l0_reverse
gru.weight_hh_l0_reverse
linear.weight
linear2.weight
torch.Size([64, 100])
torch.Size([90, 100])


## Output file (for computing metrics)

In [22]:
model_configs = json.loads(open('models_grid/model1.json').read())
model_configs

{'k_size': 3,
 'stride': 1,
 'batch_size': 20,
 'read_length': 100,
 'hidden_size': 30,
 'number_of_layers': 1,
 'lr': 0.07,
 'optimizer': 'SGD'}

In [28]:
model_configs

100

In [29]:
loader = PhageLoader("data/")
dataset = loader.get_data_set(n_files='all', \
                              read_length=model_configs['read_length'], \
                              batch_size=model_configs['batch_size'], \
                              k=model_configs['k_size'], stride=model_configs['stride'], \
                              embedding="dict", \
                              embed_size=None, \
                              drop_last=False)

In [44]:
# Original:
example_original = dataset[0][0]

# Prediction: 
# sample = np.unique(np.random.randint(1, example_original.shape[0], 50))
#example_prediction = np.array([1  if i in sample else 0 for i in np.arange(1, example_original.shape[0])])

In [142]:
def encode_predictions(read_ids, predicted, save=False):
    
    reading_gene = False
    start = 0
    end = 0
    gene_size = 0
    starts = []
    ends = []
    intervals = []
    sizes = []
    ids = [] 
    
    for i, p in enumerate(predicted):
        if reading_gene:
            if p == 0:
                reading_gene = False
                end = i
                starts.append(start)
                ends.append(end)
                intervals.append('['+str(start)+', '+str(end)+')')
                sizes.append(gene_size)
                ids.append(read_ids[i])
            else:
                gene_size += 1      
        else:
            if p == 1:
                reading_gene = True
                start = i

    if predicted[len(predicted) - 1] == 1:
        end = i + 1
        intervals.append('['+str(start)+', '+str(end)+')')
        starts.append(start)
        ends.append(end)
        sizes.append(gene_size)
        ids.append(read_ids[i])
        
    out = pd.DataFrame({'Read number': ids, \
              'Strand': 1, \
              'Location of the gene in the read': intervals, \
              'Read length': sizes, \
              'Start': starts, \
              'End': ends})
    if save:
        out.to_csv('evaluation_dataframes/' + read_ids[0].split('-')[0] + '.csv')
        
    return out

In [143]:
temp = encode_predictions(['QS-' + str(i) for i in range(example_prediction.shape[0])], example_prediction) 
temp

Unnamed: 0,Read number,Strand,Location of the gene in the read,Read length,Start,End
0,QS-6,1,"[4, 6)",1,4,6
1,QS-10,1,"[8, 10)",2,8,10
2,QS-13,1,"[11, 13)",3,11,13
3,QS-15,1,"[14, 15)",3,14,15
4,QS-20,1,"[17, 20)",5,17,20
5,QS-24,1,"[23, 24)",5,23,24
6,QS-28,1,"[26, 28)",6,26,28
7,QS-32,1,"[31, 32)",6,31,32
8,QS-36,1,"[34, 36)",7,34,36
9,QS-38,1,"[37, 38)",7,37,38
