In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from data_loader.data_loader import PhageLoader
from torch.utils.data.sampler import SequentialSampler
from torch.utils.data import Subset
import seaborn as sns
import math
import json
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

sns.set(rc={'figure.figsize':(15,10)})

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [4]:
class GRU(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, output_labels, number_of_layers=1, bidirectional=True, weights_matrix=None):
        super(GRU, self).__init__()
        
        self.bidirectional = bidirectional
        self.hidden_dim_dense = hidden_size
        self.num_layers = number_of_layers 
        if bidirectional:
            self.hidden_dim_dense = hidden_size * 2
        if len(weights_matrix.size())!=0:
            self.emb_layer = self.create_emb_layer(weights_matrix,True)          
        else:
            self.emb_layer = nn.Embedding(vocab_size, embedding_size)
            
        self.gru = nn.GRU(input_size=embedding_size, hidden_size=hidden_size, bidirectional=bidirectional, num_layers=number_of_layers,batch_first=True)
        self.linear = nn.Linear(self.hidden_dim_dense, int(self.hidden_dim_dense/2))
        self.linear2 = nn.Linear(int(self.hidden_dim_dense/2),output_labels)
        
    def forward(self, h_t1, indexes):
        #indexes -> (batch,seq_length)
        embedding = self.emb_layer(indexes)
        #print("EMBEDDING SHAPE: ", embedding.size())
        #embedding -> (batch,seq_length,embedding_size)
        out,h_t = self.gru(embedding, h_t1)
        #print("OUT SHAPE: ", out.size(), self.hidden_dim_dense)
        #out -> ()
        #out = out.view(self.hidden_dim_dense, -1)
        
        out = F.relu(self.linear(out))
        out = self.linear2(out)
        out = F.log_softmax(out,dim=2)
        
        return out,h_t
        
    def initHidden(self, batch_size, hidden_size):      
        if self.bidirectional:
            return torch.randn(self.num_layers*2, batch_size, hidden_size, device=device)
        else:
            return torch.randn(self.num_layers, batch_size, hidden_size, device=device)


    def create_emb_layer(self, weights_matrix, non_trainable=True):
        num_embeddings, embedding_dim = weights_matrix.size()
        emb_layer = nn.Embedding(num_embeddings, embedding_dim)
        emb_layer.load_state_dict({'weight': weights_matrix})
        if non_trainable:
            emb_layer.weight.requires_grad = False
        return emb_layer
    

In [5]:
def get_initial_embeddings(loader,k=3):
    if(k<=2):
        return torch.from_numpy(np.eye(4**k))
    dictionary = loader.get_dict(k,'dna2vec')
    indexes = loader.get_dict(k)
    size = len(indexes)
    matrix = np.zeros((size,100))
    for key, value in indexes.items():
        matrix[value] = dictionary[key]
    return torch.from_numpy(matrix)

In [6]:
def check_gradients(model):
    gru = model.gru
    for p,n in zip(gru.parameters(),gru._all_weights[0]):
        if n[:6] == 'weight':
            print('===========\ngradient:{}\n----------\n{}'.format(n,p.grad.abs().sum()))

In [7]:
def check_params(model):
    gru = model.gru
    for p,n in zip(gru.parameters(),gru._all_weights[0]):
        #if n[:6] == 'weight':
            print('===========\ngradient:{}\n----------\n{}'.format(n,p.data))
            print('===========\ngradient:{}\n----------\n{}'.format(n,p.size()))

In [8]:
def accuracy_test(loader,model,hidden):
    total = 0
    correct = 0
    with torch.no_grad():
       
        for b, (x, y) in enumerate(loader):
            x = x.type(torch.LongTensor)
            y = y.type(torch.LongTensor)
            x, y = x.to(device), y.to(device)
            out,hidden = model(hidden,x)
            y = y.view(batch_size*read_length)
            out = out.view(batch_size*read_length,output_labels).exp()
            _, out_index= torch.max(out,dim=-1)
            #print(out_index)
            correct += (out_index.eq(y)).sum()
            total += len(y)
    return correct.item()/total

In [9]:
def strip_plot(model,loader,hidden):
    total = 0
    correct = 0
    with torch.no_grad():
        bs = []
        outs = []
        ys = []
        for b, (x, y) in enumerate(loader):
            x = x.type(torch.LongTensor)
            y = y.type(torch.LongTensor)
            x, y = x.to(device), y.to(device)
            out, _  = model(hidden,x)
            y = y.view(batch_size*read_length)
            out = out.view(batch_size*read_length,output_labels).exp()
            _, out_index= torch.max(out,dim=-1)
            bs.append(np.ones(y.shape[0])*b)
            outs.append(out_index.cpu().numpy())
            ys.append(y.cpu().numpy())
            
        print(b)
    return pd.DataFrame({"batch": np.concatenate(bs), "predicted": np.concatenate(outs),"actual": np.concatenate(ys)})

In [10]:
def print_strip(dataframe):
    df1 = dataframe[dataframe['batch'] == 0]
    df = pd.melt(df1, id_vars=['batch'], value_vars=['predicted','actual'])
    df['x'] = np.tile(np.arange(len(df1))+1,2)
    colors = ["windows blue", "amber"]
    sns.stripplot(x="x", y="variable", data=df,hue='value', linewidth=1,jitter=True,palette=sns.xkcd_palette(colors))

In [11]:
def loss_validation_set(model,hidden, dataloader, batch_size, read_length, loss_function):
    running_loss = 0
    output_labels = 2
    for b, (x, y) in enumerate(dataloader):
        x = x.type(torch.LongTensor)
        y = y.type(torch.LongTensor)
        x, y = x.to(device), y.to(device)
        out,hidden = model(hidden,x)
        y = y.view(batch_size*read_length)
        out = out.view(batch_size*read_length,output_labels)
        valid_loss = loss_function(out,y)
        running_loss = running_loss + valid_loss.item()
        
    return running_loss

In [12]:
def train_net(k_size,stride,batch_size,read_length,hidden_size,number_of_layers,lr,optimizer,n_files='all',id_run=1):
    k_size=k_size
    batch_size = batch_size
    read_length = read_length
    loader = PhageLoader("data/")
    dataset = loader.get_data_set(n_files=n_files,read_length=read_length, batch_size=batch_size, k=k_size, stride=stride, embedding="dict", embed_size=None, drop_last=False)
    vocab_size = 4**k_size
    if k_size <= 2:   
        embedding_layer_size = vocab_size
    else:
        embedding_layer_size = 100
        
    hidden_size = hidden_size
    number_of_layers = number_of_layers
    output_labels = 2
    initial_embedding = get_initial_embeddings(loader,k_size)
    model = GRU(vocab_size, embedding_layer_size, hidden_size, output_labels,number_of_layers=number_of_layers,weights_matrix=initial_embedding)
    model.to(device)
    weights = torch.tensor([7,1],dtype=torch.float)
    weights = weights.to(device)
    val_accuracies = []
    train_accuracies = []
    learning_rate = lr
    optim_type = optimizer
    
    hidden = model.initHidden(batch_size, hidden_size)


    loss_function = nn.NLLLoss(weight = weights)
    
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    if optim_type == 'ADAM':
        optimizer = optim.Adam(model.parameters(), lr = learning_rate)     

    losses = []
    losses_val = []
    running_loss = 0
    
    epochs = 100

    train_loader, validation_loader, test_loader = split_sets(dataset,batch_size)

    for epoch in range(epochs):
        running_loss_valid = 0
        for b, (x, y) in enumerate(train_loader):
            #gives batches of size "batch_size, read_length"
            model.zero_grad()

            x = x.type(torch.LongTensor)
            y = y.type(torch.LongTensor)
            x, y = x.to(device), y.to(device)  
            out,hidden = model(hidden,x)
            y = y.view(batch_size*read_length)
            out = out.view(batch_size*read_length,output_labels)
            loss = loss_function(out,y)
            running_loss = running_loss + loss.item()


            loss.backward()
            optimizer.step()

            #if b % 40 == 39:
                #print('[%d, %5d] loss: %.3f' %
                #      (epoch + 1, b + 1, running_loss / 100))
                #losses.append(running_loss)
                #running_loss = 0.0
                #check_gradients(model)

            hidden.detach_()
        if (epoch+1) %20 == 0:
            print('saving model')
            name = 'models_grid/model' + str(id_run)+'_'+str(epoch+1)
            torch.save(model,name)
            
        valid_loss = loss_validation_set(model,hidden,validation_loader,batch_size,read_length,loss_function)  
        losses_val.append(valid_loss)
        losses.append(running_loss)
        running_loss = 0.0
        #print(losses,losses_val)
    
    
    filehandler = open('models_grid/losses_train'+str(id_run)+'.pkl',"wb")
    pickle.dump(losses,filehandler)
    filehandler.close()
    
    filehandler = open('models_grid/losses_validation'+str(id_run)+'.pkl',"wb")
    pickle.dump(losses_val,filehandler)
    filehandler.close()
    create_dict(k_size,stride,batch_size,read_length,hidden_size,number_of_layers,lr,optim_type,id_run=id_run)
    

In [13]:
def create_dict(k_size,stride,batch_size,read_length,hidden_size,number_of_layers,lr,optimizer,id_run=1):
    data = {}
    data['k_size'] = k_size
    data['stride'] = stride
    data['batch_size'] = batch_size
    data['read_length'] = read_length
    data['hidden_size'] = hidden_size
    data['number_of_layers'] = number_of_layers
    data['lr'] = lr
    data['optimizer'] = optimizer
    print(data)
    name_dict = 'models_grid/model'+ str(id_run)+'.json'
    with open(name_dict, 'w') as fp:
        json.dump(data, fp)

In [14]:
def grid_search(k_size,stride,batch_size,read_length,hidden_size,number_of_layers,lr,optimizer):
    g = np.meshgrid(k_size,stride,batch_size,read_length,hidden_size,number_of_layers,lr,optimizer)
    z = list(zip(*(x.flat for x in g)))
    for i,option in enumerate(z): 
        k_size_p = option[0].item()
        stride_p = option[1].item()
        batch_size_p =option[2].item()
        read_length_p = option[3].item()
        hidden_size_p = option[4].item()
        number_of_layers_p = option[5].item()
        lr_p = option[6].item()
        optimizer_p = option[7].item()
        print(k_size_p,stride_p,batch_size_p,read_length_p,hidden_size_p,number_of_layers_p,lr_p,optimizer_p)
        train_net(k_size_p,stride_p,batch_size_p,read_length_p,hidden_size_p,number_of_layers_p,lr_p,optimizer_p,n_files='all',id_run=i+1)
        

In [None]:
k_size= [3,5,7]
stride = [1,2]
batch_size = [20,50]
read_length = [100,200]
hidden_size = [30,60]
number_of_layers = [1,2]
lr = [0.07, 0.3]
optimizer = ['SGD','ADAM']
grid_search(k_size,stride,batch_size,read_length,hidden_size,number_of_layers,lr,optimizer)

3 1 20 100 30 1 0.07 SGD


In [15]:
def split_sets(dataset,batch_size,ids=False):
    n = len(dataset)  # how many total elements you have
    test_size = .1
    n_test = int( n * test_size )  # number of test/val elements
    n_train = n - 2 * n_test

    idx = list(range(n))  # indices to all elements
    train_idx = idx[:n_train]
    val_idx = idx[n_train:(n_train + n_test)]
    test_idx = idx[(n_train + n_test):]

    train_dataset = Subset(dataset,train_idx)
    valid_dataset = Subset(dataset,val_idx)
    test_dataset = Subset(dataset,test_idx)
    a = batch_size 

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=a, drop_last=True)
    validation_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=a,drop_last=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=a, drop_last=True)
    if ids:
        return (train_loader,train_idx), (validation_loader,val_idx), (test_loader,test_idx)
    else:
        return train_loader, validation_loader, test_loader


In [34]:
def get_results_formatted(model_name='model5',epoch=100):
    folder = 'models_grid/'
    
    model_selected = model_name+'_'+str(epoch)
    model = torch.load(folder + model_selected,map_location='cpu')
    loader = PhageLoader("data/")
    
    configuration = model_name+'.json'
    f = open(folder + configuration)
    config = json.load(f)
    f.close()

    k_size = config['k_size']
    read_length = config['read_length']
    batch_size = config['batch_size']
    stride = config['stride']
    hidden_size = config['hidden_size']
    batch_size = 1

    dataset = loader.get_data_set(n_files=2,read_length=read_length, batch_size=batch_size, k=k_size, stride=stride, embedding="dict", embed_size=None, drop_last=False)
    list_ids = loader.get_data_set_ids(n_files=2,read_length=read_length, batch_size=batch_size, k=k_size, stride=stride, embedding="dict", embed_size=None, drop_last=False)

    (train_loader,train_idx), (validation_loader,val_idx), (test_loader,test_idx) = split_sets(dataset,batch_size,ids=True)
    indices_validation_set = [list_ids[i] for i in val_idx]
    dataframe_result = predictions_validation(validation_loader, model, indices_validation_set, batch_size,hidden_size,k_size,read_length)
    return dataframe_result


In [35]:
def predictions_validation(validation_loader,model,indices,batch_size,hidden_size,k_size,read_length):
    output_labels = 2
    df = pd.DataFrame(columns=['id', 'predicted_values', 'true_values'])
    with torch.no_grad(): 
        model.eval()
        hidden = model.initHidden(batch_size,hidden_size)
        id_read = None
        prediction_batch = []
        ytrue_batch = []
        init = True
        for b, (x, y) in enumerate(validation_loader):
            x = x.type(torch.LongTensor)
            y = y.type(torch.LongTensor)
            x, y = x.to(device), y.to(device)
            out,hidden = model(hidden,x)
            
            y = y.view(-1, 1).repeat(1, k_size).view(batch_size,-1)
            y = get_original(y,k_size,1)
            out = out.view(read_length,batch_size,output_labels).exp()
            _, out_index= torch.max(out,dim=-1)
            out_index = out_index.view(-1, 1).repeat(1, k_size).view(batch_size,-1)
            
            out_index = get_original(out_index,k_size,1)
            
            if id_read == indices[b*batch_size: (b+1)*batch_size][0]:
                
                prediction_batch = prediction_batch + out_index
                ytrue_batch = ytrue_batch + y
                
            else:
                if not init:
                    df = df.append({'id' : id_read , 'predicted_values' : prediction_batch,'true_values':ytrue_batch} , ignore_index=True)
                prediction_batch = out_index
                ytrue_batch = y
                init = False
            id_read = indices[b*batch_size: (b+1)*batch_size][0]
        return df

In [36]:
def get_original(predictions,k_size=3,stride=1):
    i=0
    result = []
    while (i+k_size-1) < predictions.size()[1]:
        preds = predictions[0,i*stride:(i+k_size)*(stride)]
        preds = preds.tolist() 
        if i==0:
            result = result + preds
        else:
            result.append(preds[k_size-1])
        i = i + k_size 
    return result

In [37]:
dataframe_result = get_results_formatted()

In [38]:
dataframe_result

Unnamed: 0,id,predicted_values,true_values
0,NC_016073-69,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,NC_016073-70,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,NC_016073-71,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,NC_016073-72,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,NC_016073-73,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5,NC_016073-74,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,NC_016073-76,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
7,NC_016073-77,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
8,NC_016073-78,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ..."
9,NC_016073-79,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
