In [None]:
# has for loop to go through hyperparameters

import h5py
from datetime import datetime
import os
import pickle
import argparse
import itertools

import torch
from torch.autograd import Variable
from torch.nn.parameter import Parameter
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from transformer import Models
from transformer import Beam
from transformer import Translator
from transformer.Optim import ScheduledOptim

from tools import CharacterTable
from translator import SignalTranslator
# Wait for any pending CUDA work before continuing.
# NOTE(review): presumably this also makes the notebook fail immediately when no
# CUDA device is usable — confirm that running it in the import cell is intended.
torch.cuda.synchronize()

In [None]:
# Weighting the loss in the middle of the sequence
# Different lengths for protein in training data set

In [None]:
# Fixed vocabulary and sequence-length settings. Leave these untouched.

# Special token ids.
# NOTE(review): presumably padding / start-of-sequence / end-of-sequence ids of
# the character table — confirm against the tokenizer.
pad, start, stop = 0, 1, 2

# max_in is passed to the model as max_token_seq_len; max_out caps the
# translation length (trans_opt.max_trans_length).
max_in, max_out = 107, 72

# Vocabulary size, used for both the source and the target side.
n_chars = 27

In [None]:
hyper = [(256, 32, 1000, 6, 6, 0.4, 64, 100, 1e-4, -0.03), (256, 64, 1000, 6, 6, 0.3, 64, 100, 1e-4, -0.03)]

# change dim, change # heads

for h in hyper:
    # Hyperparameters. Do change these
    d_model = h[0]
    batch_size = h[1]
    batches = h[2] # batches / epoch
    n_warmup_steps = h[2] * 25
    n_layers = h[3]
    n_head = h[4]
    dropout = h[5]
    d_k = h[6]
    epochs = h[7]
    lr_max = h[8]
    decay_power = h[9]

    # Name for the model checkpoints
    # Change this, probably to reflect the hyperparameters chosen above
    hypers = [d_model, n_warmup_steps, batch_size, n_layers, n_head, dropout, d_k, epochs, lr_max, decay_power]
    chkpt_name = '_'.join([str(h) for h in hypers])

    model_opt = argparse.Namespace()
    model_opt.src_vocab_size = n_chars
    model_opt.tgt_vocab_size = n_chars
    model_opt.max_token_seq_len = max_in
    model_opt.proj_share_weight = True
    model_opt.embs_share_weight = True
    model_opt.d_k = d_k
    model_opt.d_v = d_k
    model_opt.d_model = d_model
    model_opt.d_word_vec = d_model
    model_opt.d_inner_hid = 2 * d_model
    model_opt.n_layers = n_layers
    model_opt.n_head = n_head
    model_opt.dropout = dropout

    ##############################
    # Change this to use the GPU #
    model_opt.cuda = True        #
    ##############################

    optim_opt = argparse.Namespace()
    optim_opt.n_warmup_steps = n_warmup_steps
    optim_opt.optim = optim.Adam
    optim_opt.lr_max = lr_max
    optim_opt.decay_power = decay_power
    optim_opt.d_model = None

    trans_opt = argparse.Namespace()
    trans_opt.beam_size = 1
    trans_opt.n_best = 1
    trans_opt.max_trans_length = max_out

    with open('../outputs/ctable_token.pkl', 'rb') as f:
        ctable = pickle.load(f)

    trans_opt.ctable = ctable

    clf = SignalTranslator(model_opt, optim_opt, trans_opt)

    steps = np.arange(epochs * batches)
    lrs = clf.optimizer.get_learning_rate(steps)
    _ = plt.plot(steps, lrs)

    train_file = h5py.File('../data/train_tokens.hdf5')
    val_file = h5py.File('../data/validate_tokens.hdf5')
    history = clf.train(train_file, val_file, epochs=epochs, batch_size=batch_size,
                               save_model='../outputs/models/' + chkpt_name, save_mode='best')
    train_file.close()
    val_file.close() #save model and load model

    # See how well it does on first batch_size test sequences
    file = h5py.File('../data/test_tokens.hdf5')
    training_data = SignalTranslator.generator_from_h5(file, batch_size, shuffle=False, use_cuda=True)
    src, tgt = next(training_data)
    file.close()
    %time decoded, all_hyp, all_scores = clf.translate_batch(src)
    for tg, dec in zip(tgt[0], decoded):
        print(dec)
        print(ctable.decode(tg.data.cpu().numpy())[:])
        print()

    fig, axs = plt.subplots(1, 2, figsize=(16, 7))
    x = np.arange(len(history['train_loss']))
    _ = axs[0].plot(x, history['train_loss'], label='training loss', alpha=0.8)
    _ = axs[0].plot(x, history['val_loss'], label='validation loss', alpha=0.8)
    _ = axs[0].legend()
    _ = axs[1].plot(x, history['train_acc'], label='training accuracy', alpha=0.8)
    _ = axs[1].plot(x, history['val_acc'], label='validation accuracy', alpha=0.8)
    _ = axs[1].legend()

In [None]:
#For loading the model
#chkpt = "../outputs/models/" + chkpt_name + ".chkpt"
#clf = SignalTranslator.load_model(chkpt)

In [None]:
# Display the value returned by clf.train for the last hyperparameter setting of
# the sweep (each iteration overwrites `history`).
history

In [None]:
# NOTE(review): in a Jupyter kernel exit() presumably shuts the kernel down, which
# would discard `clf`, `ctable` and `batch_size` that the cells below rely on and
# would break "Restart & Run All" — confirm whether this stop is still wanted.
exit()
# Next: see how similar the well-predicted SPs are to the ones in the training set.

In [None]:
import pickle
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio of sequences ``a`` and ``b``.

    The ratio is a float in [0, 1]; 1.0 means the two sequences are identical.
    No junk heuristic is supplied to the matcher.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

In [None]:
# Compare similarity of SPs predicted well by model to those in training set USING STRINGS
# See how well it does on first batch_size test sequences 
file = h5py.File('../data/test_tokens.hdf5')
test_data = SignalTranslator.generator_from_h5(file, batch_size, shuffle=False, use_cuda=True)
src, tgt = next(test_data) #src prot, tgt = signal peptides
file.close()
%time decoded, all_hyp, all_scores = clf.translate_batch(src)

good_pred = [] # stores signal peptides predicted well that are in training set

for tg, tg2, dec in zip(tgt[0], src[0], decoded):
    actual = ctable.decode(tg.data.cpu().numpy())[:]
    print()
    print(dec)
    print(actual)
    print()
    
    # sum returns length of amino acids that are the same at same position? --> so full length returned
    # means its the same SP
    # plot signal peptide to similarity
    # plot top three simmilar to how similar their SPs areq
    
    actual = ''.join(actual.split())
    actual = actual[1:-1]
    dec = dec[:-1]
    
    if abs(len(dec) - len(actual)) <= 5 and similar(actual, dec) >= 0.7:
        good_pred.append((tg, tg2, dec, actual))

In [None]:
# Number of test examples that passed the length/similarity filter (entries in good_pred).
len(good_pred)

In [None]:
# compare prots of SPs predicted well to prots in training dataset
# For every well-predicted test example, scan the whole training set and print one
# training protein for each of the three highest position-wise agreement scores.

min_matches = 51    # keep a training row only if more than this many positions agree
norm_length = 102   # divisor turning a match count into a fraction
                    # NOTE(review): max_in is 107 elsewhere in this notebook — confirm 102 is intended

# checking for similar protein sequences
index = 1

# Open read-only and close the file when the scan is finished.
with h5py.File('../data/train_tokens.hdf5', 'r') as file:
    for tg, tg2, dec, actual in good_pred:
        # A fresh generator per test protein, so each one is compared with every training batch.
        train_data = SignalTranslator.generator_from_h5(file, batch_size, shuffle=False, use_cuda=True)
        sim = []  # (training protein tokens, fraction of matching positions)

        print("PROTEIN {}".format(index))
        print(dec)
        print(actual)
        test_prot = tg2.data.cpu().numpy()

        for src, tgt in train_data:  # each batch in training data
            train_prot = src[0].data.cpu().numpy()  # training proteins
            for row in train_prot:
                # number of positions holding the same token
                sum_length = np.count_nonzero(test_prot == row)
                if sum_length > min_matches:
                    sim.append((row, sum_length / norm_length))

        # The three highest distinct scores; print the first protein found for each,
        # in the order the proteins were encountered.
        lst = sorted({length for _row, length in sim}, reverse=True)[:3]
        for prot, length in sim:
            if not lst:
                break
            if length in lst:
                print((prot, length))
                lst.remove(length)
        print("-------------------------------------------------------")
        index += 1

# Ideas for follow-up analysis:
# - summing position-wise matches gives the number of identical amino acids;
#   a full-length match means it is the same sequence
# - plot signal peptide against similarity
# - plot the top three most similar proteins against how similar their SPs are

In [None]:
# compare SPs predicted well to SPs in training dataset
# For every well-predicted test example, scan the whole training set and print one
# training signal peptide for each of the three highest position-wise agreement scores.

min_matches = 51    # keep a training row only if more than this many positions agree
norm_length = 102   # divisor turning a match count into a fraction
                    # NOTE(review): max_out is 72 elsewhere in this notebook — confirm that
                    # 51 / 102 are the intended values for signal peptides

# checking for similar signal peptides
index = 1

# Open read-only and close the file when the scan is finished.
with h5py.File('../data/train_tokens.hdf5', 'r') as file:
    for tg, tg2, dec, actual in good_pred:
        # A fresh generator per test SP, so each one is compared with every training batch.
        train_data = SignalTranslator.generator_from_h5(file, batch_size, shuffle=False, use_cuda=True)
        sim = []  # (training SP tokens, fraction of matching positions)

        print("PROTEIN {}".format(index))
        print(dec)
        print(actual)
        test_sp = tg.data.cpu().numpy()

        for src, tgt in train_data:  # each batch in training data
            train_sp = tgt[0].data.cpu().numpy()  # training signal peptides
            for row in train_sp:
                # number of positions holding the same token
                sum_length = np.count_nonzero(test_sp == row)
                if sum_length > min_matches:
                    sim.append((row, sum_length / norm_length))

        # The three highest distinct scores; print the first SP found for each,
        # in the order the SPs were encountered.
        lst = sorted({length for _row, length in sim}, reverse=True)[:3]
        for sp, length in sim:
            if not lst:
                break
            if length in lst:
                print((sp, length))
                lst.remove(length)
        print("-------------------------------------------------------")
        index += 1

# Ideas for follow-up analysis:
# - summing position-wise matches gives the number of identical amino acids;
#   a full-length match means it is the same SP
# - plot signal peptide against similarity
# - plot the top three most similar proteins against how similar their SPs are

In [None]:
# Compare similarity of SPs predicted well by model to those in training set USING STRINGS
# See how well it does on first batch_size test sequences 
file = h5py.File('../data/test_tokens.hdf5')
test_data = SignalTranslator.generator_from_h5(file, batch_size, shuffle=False, use_cuda=True)
src, tgt = next(test_data) #src prot, tgt = signal peptides
file.close()
%time decoded, all_hyp, all_scores = clf.translate_batch(src)

acc = [] # stores all signal peptides predicted well
sim = [] # stores signal peptides predicted well that are in training set
same_prot = [] # stores all protein sequences of the signal peptides predicted well that are in the training set

for tg, tg2, dec in zip(tgt[0], src[0], decoded):
    actual = ctable.decode(tg.data.cpu().numpy())[:]
    #prot_seq = ctable.decode(tg2.data.cpu().numpy())[:]
    print()
    #print(prot_seq)
    print(dec)
    print(actual)
    print()
    
    # sum returns length of amino acids that are the same at same position? --> so full length returned
    # means its the same SP
    # plot signal peptide to similarity
    # plot top three simmilar to how similar their SPs areq
    
    actual = ''.join(actual.split())
    actual = actual[1:-1]
    dec = dec[:-1]
    
    if abs(len(dec) - len(actual)) <= 5 and similar(actual, dec) >= 0.7:
        acc.append(actual)
        if actual in train_sp:
            sim.append(actual)
    
    #if prot_seq in train_prot:
    #    same_prot.append(src)

In [None]:
# Number of well-predicted signal peptides that were also found in the training set.
len(sim)

In [None]:
# Number of signal peptides that passed the length/similarity filter.
len(acc)