## QA over unstructured data

Using Match LSTM, Pointer Networks, as mentioned in paper https://arxiv.org/pdf/1608.07905.pdf

We start with the pre-processing provided by https://github.com/MurtyShikhar/Question-Answering to clean up the data and make neat para, ques files.


### @TODOs:

1. [done] _Figure out how to put in real, pre-trained embeddings in embeddings layer._
2. [done] _Explicitly provide batch size when instantiating model_
3. [done] is ./val.ids.* validation set or test set?: **validation**
4. [done:em] _Instead of test loss, calculate test acc metrics_
    1. todo: new metrics like P, R, F1
5. [done] Update unit test codes

In [1]:
from __future__ import unicode_literals, print_function, division
import matplotlib.pyplot as plt
from io import open
import numpy as np
import unicodedata
import traceback
import string
import random
import time
import re
import os


import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

from networks import Encoder, MatchLSTMEncoder, PointerDecoder

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so
# that every later `device=device` allocation still works on CUDA-less hosts.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed all three RNGs used in this notebook (torch, numpy, stdlib random).
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

#### Debug Legend

- 5: Print everything that goes in every tensor.
- 4: ??
- 3: Check every model individually
- 2: Print things in training loops
- 1: ??

In [2]:
# ---- Locations and verbosity ----
DATA_LOC = './data/domain/'             # folder holding the *.ids.* / *.span files
MODEL_LOC = './models/mlstms/domain/'   # folder the trained models are written to
DEBUG = 1                               # verbosity level, see "Debug Legend"

# ---- Network / training hyper-parameters ----
QUES_LEN = 30         # questions are padded / truncated to this many tokens
PARA_LEN = 200        # paragraphs are padded / truncated to this many tokens
VOCAB_SIZE = 120000
# VOCAB_SIZE = glove_file.shape[1]               # @TODO: get actual size
HIDDEN_DIM = 150
EMBEDDING_DIM = 300
BATCH_SIZE = 45       # Might have total 100 batches.
EPOCHS = 300
TEST_EVERY_ = 1
LR = 0.001
CROP = None

### Encoder 
Use a simple lstm class to have encoder for question and paragraph. 
The output of these will be used in the match lstm

$H^p = LSTM(P)$ 


$H^q = LSTM(Q)$

In [3]:
# class Encoder(nn.Module):
    
#     def __init__(self, inputlen, macros, glove_file, device):
#         super(Encoder, self).__init__()
        
#         # Catch dim
#         self.inputlen = inputlen
#         self.hiddendim = macros['hidden_dim']
#         self.embeddingdim =  macros['embedding_dim']
#         self.vocablen = macros['vocab_size']
# #         self.device = macros['device']
#         self.batch_size = macros['batch_size']
#         self.debug = macros['debug']
        
#         # Embedding Layer
# #         self.embedding = nn.Embedding(self.vocablen, self.embeddingdim)
#         self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(glove_file))
#         self.embedding.weight.requires_grad = True
       
#         # LSTM Layer
#         self.lstm = nn.LSTM(self.embeddingdim, self.hiddendim, bidirectional=True)
        
#     def init_hidden(self, batch_size, device):
        
#         # Returns a new hidden layer var for LSTM
#         return (torch.zeros((2, batch_size, self.hiddendim), device=device), 
#                 torch.zeros((2, batch_size, self.hiddendim), device=device))
    
#     def forward(self, x, h):
        
#         # Input: x (batch, len ) (current input)
#         # Hidden: h (1, batch, hiddendim) (last hidden state)
        
#         # Batchsize: b int (inferred)
#         b = x.shape[0]
        
#         if self.debug > 4: print("x:\t", x.shape)
#         if self.debug > 4: print("h:\t", h[0].shape, h[1].shape)
        
#         x_emb = self.embedding(x)
#         if self.debug > 4: print("x_emb:\t", x_emb.shape)
            
#         ycap, h = self.lstm(x_emb.view(-1, b, self.embeddingdim), h)
#         if self.debug > 4: print("ycap:\t", ycap.shape)
        
#         return ycap, h
    
    
# # with torch.no_grad():
# #     print ("Trying out question encoder LSTM")
# #     model = Encoder(QUES_LEN, HIDDEN_DIM, EMBEDDING_DIM, VOCAB_SIZE)
# #     dummy_x = torch.tensor([22,45,12], dtype=torch.long)
# #     hidden = model.init_hidden()
# #     ycap, h = model(dummy_x, hidden)
    
# #     print(ycap.shape)
# #     print(h[0].shape, h[1].shape)


# if DEBUG >= 4:
#     with torch.no_grad():
        
#         macros = {
#         "ques_len": QUES_LEN,
#         "hidden_dim": HIDDEN_DIM, 
#         "vocab_size": VOCAB_SIZE, 
#         "batch_size": BATCH_SIZE,
#         "para_len": PARA_LEN,
#         "embedding_dim": EMBEDDING_DIM,
#         "lr": LR,
#         "debug":4,
#         "device":device
#     }

#         dummy_para = torch.randint(0,VOCAB_SIZE-1,(PARA_LEN*BATCH_SIZE,), device=device).view(BATCH_SIZE,PARA_LEN).long()
#     #     print (dummy_para.shape)
#         dummy_question = torch.randint(0,VOCAB_SIZE-1,(QUES_LEN*BATCH_SIZE,), device=device).view(BATCH_SIZE,QUES_LEN).long()
#     #     print (dummy_question.shape)
#         glove_file = torch.randn((VOCAB_SIZE, EMBEDDING_DIM))

#     #     print("LSTM with batches")
#         ques_model = Encoder(QUES_LEN, macros, glove_file).cuda(device)
#         para_model = Encoder(QUES_LEN, macros, glove_file).cuda(device)
#         ques_hidden = ques_model.init_hidden(BATCH_SIZE)
#         para_hidden = para_model.init_hidden(BATCH_SIZE)
#         ques_embedded,hidden_ques = ques_model(dummy_question,ques_hidden)
#         para_embedded,hidden_para = para_model(dummy_para,para_hidden)
        
#         print (ques_embedded.shape) # question_length,batch,embedding_dim
#         print (para_embedded.shape) # para_length,batch,embedding_dim
#         print (hidden_para[0].shape,hidden_para[1].shape)

### Match LSTM

Use a match LSTM to compute a **summarized sequential vector** for the paragraph w.r.t the question.

Consider the summarized vector ($H^r$) as the output of a new decoder, where the inputs are $H^p, H^q$ computed above. 

1. Attend the para word $i$ with the entire question ($H^q$)
  
    1. $\vec{G}_i = \tanh(W^q H^q + repeat(W^p h^p_i + W^r \vec{h}^r_{i-1} + b^p))$
    
    2. *Computing it*: Here, $\vec{G}_i$ is equivalent to `energy`, computed differently.
    
    3. Use a linear layer to compute the content within the $repeat$ fn.
    
    4. Add with another linear (without bias) with $H_q$
    
    5. $tanh$ the bloody thing
  
  
2. Softmax over it to get $\alpha$ weights.

    1. $\vec{\alpha_i} = softmax(w^t\vec{G}_i + repeat(b))$
    
3. Use the attention weight vector $\vec{\alpha_i}$ to obtain a weighted version of the question and concat it with the current token of the passage to form a vector $\vec{z_i}$

4. Use $\vec{z_i}$ to compute the desired $h^r_i$:

    1. $ h^r_i = LSTM(\vec{z_i}, h^r_{i-1}) $
    


In [4]:
# class MatchLSTMEncoder(nn.Module):
    
#     def __init__(self, macros, device):
        
#         super(MatchLSTMEncoder, self).__init__()
        
#         self.hidden_dim = macros['hidden_dim']
#         self.ques_len = macros['ques_len']
#         self.batch_size = macros['batch_size']
#         self.debug = macros['debug']    
        
#         # Catch lens and params
#         self.lin_g_repeat_a_dense = nn.Linear(2*self.hidden_dim, self.hidden_dim)
#         self.lin_g_repeat_b_dense = nn.Linear(self.hidden_dim, self.hidden_dim, bias=False)
#         self.lin_g_nobias = nn.Linear(2*self.hidden_dim, self.hidden_dim, bias=False)
        
#         self.alpha_i_w = nn.Parameter(torch.rand((self.hidden_dim, 1)))
#         self.alpha_i_b = nn.Parameter(torch.rand((1)))
        
#         self.lstm_summary = nn.LSTM((self.ques_len+1)*2*self.hidden_dim, self.hidden_dim)
                                      
    
#     def forward(self, H_p, h_ri, H_q, hidden, device):
#         """
#             Ideally, we would have manually unrolled the lstm 
#             but due to memory constraints, we do it in the module.
#         """
        
#         # Find the batchsize
#         batch_size = H_p.shape[1]
        
#         H_r = torch.empty((0, batch_size, self.hidden_dim), device=device, dtype=torch.float)
#         H_r = torch.cat((H_r, h_ri), dim=0)
        
#         if self.debug > 4:
#             print( "H_p:\t\t\t", H_p.shape)
#             print( "H_q:\t\t\t", H_q.shape)
#             print( "h_ri:\t\t\t", h_ri.shape)
#             print( "H_r:\t\t\t", H_r.shape)
#             print( "hid:\t\t\t", hidden.shape)
        
#         for i in range(H_p.shape[0]):
            
#             # We call the (W^P.H^P + W^rh^r_i-1 + b^P) as lin_repeat_input.
            
#             # We first write out its two components as
#             lin_repeat_input_a = self.lin_g_repeat_a_dense(H_p[i].view(1, batch_size, -1))
#             if self.debug > 4: print("lin_repeat_input_a:\t", lin_repeat_input_a.shape)
            
#             lin_repeat_input_b = self.lin_g_repeat_b_dense(H_r[i].view(1, batch_size, -1))
#             if self.debug > 4: print("lin_repeat_input_b:\t", lin_repeat_input_b.shape)
            
#             # Add the two terms up
#             lin_repeat_input_a.add_(lin_repeat_input_b)
# #             if self.debug > 4: print("lin_g_input_b unrepeated:", lin_g_input_b.shape)

#             lin_g_input_b = lin_repeat_input_a.repeat(H_q.shape[0], 1, 1)
#             if self.debug > 4: print("lin_g_input_b:\t\t", lin_g_input_b.shape)

#             # lin_g_input_a = self.lin_g_nobias.matmul(H_q.view(-1, self.ques_len, self.hidden_dim)) #self.lin_g_nobias(H_q)
#             lin_g_input_a =  self.lin_g_nobias(H_q)
#             if self.debug > 4: print("lin_g_input_a:\t\t", lin_g_input_a.shape)

#             G_i = F.tanh(lin_g_input_a + lin_g_input_b)
#             if self.debug > 4: print("G_i:\t\t\t", G_i.shape)
#             # Note; G_i should be a 1D vector over ques_len

#             # Attention weights
#             alpha_i_input_a = G_i.view(batch_size, -1, self.hidden_dim).matmul(self.alpha_i_w).view(batch_size, 1, -1)
#             if self.debug > 4: print("alpha_i_input_a:\t", alpha_i_input_a.shape)

#             alpha_i_input = alpha_i_input_a.add_(self.alpha_i_b.view(-1,1,1).repeat(1,1,self.ques_len))
#             if self.debug > 4: print("alpha_i_input:\t\t", alpha_i_input.shape)

#             # Softmax over alpha inputs
#             alpha_i = F.softmax(alpha_i_input, dim=-1)
#             if self.debug > 4: print("alpha_i:\t\t", alpha_i.shape)

#             # Weighted summary of question with alpha    
#             z_i_input_b = (
#                             H_q.view(batch_size, self.ques_len, -1) *
#                            (alpha_i.view(batch_size, self.ques_len, -1).repeat(1, 1, 2*self.hidden_dim))
#                           ).view(self.ques_len,batch_size, -1)
#             if self.debug > 4: print("z_i_input_b:\t\t", z_i_input_b.shape)

#             z_i = torch.cat((H_p[i].view(1, batch_size, -1), z_i_input_b), dim=0)
#             if self.debug > 4: print("z_i:\t\t\t", z_i.shape)

#             # Pass z_i, h_ri to the LSTM 
# #             lstm_input = torch.cat((z_i.view(1, batch_size,-1), H_r[i].view(1, batch_size, -1)), dim=2)
# #             if self.debug > 4: print("lstm_input:\t\t", lstm_input.shape)

#             # Take input from LSTM, concat in H_r and nullify the temp var.
#             h_ri, (_, hidden) = self.lstm_summary(z_i.view(1, batch_size, -1), 
#                                              (H_r[i].view(1,batch_size, -1), hidden))
#             if self.debug > 4:
#                 print("newh_ri:\t\t", h_ri.shape)
#                 print("newhidden:\t\t", hidden.shape)
#             H_r = torch.cat((H_r, h_ri), dim=0)
# #             h_ri = None
            
#             if self.debug > 4:
#                 print("\tH_r:\t\t\t", H_r.shape)
# #                 print("hidden new:\t\t", hidden[0].shape, hidden[1].shape)

#         return H_r[1:]
    
#     def init_hidden(self, batch_size, device):
#         # Before we've done anything, we dont have any hidden state.
#         # Refer to the Pytorch documentation to see exactly
#         # why they have this dimensionality.
#         # The axes semantics are (num_layers, minibatch_size, hidden_dim)
#         return torch.zeros((1, batch_size, self.hidden_dim), device=device)
# #                 torch.zeros((1, batch_size, self.hidden_dim), device=device))

# # with torch.no_grad():
# #     model = MatchLSTMEncoder(HIDDEN_DIM, QUES_LEN)
# #     h_pi = torch.randn(1, BATCH_SIZE, HIDDEN_DIM)
# #     h_ri = torch.randn(1, BATCH_SIZE, HIDDEN_DIM)
# #     hidden = model.init_hidden()
# #     H_q = torch.randn(QUES_LEN, BATCH_SIZE, HIDDEN_DIM)
    
# #     op, hid = model(h_pi, h_ri, H_q, hidden)
    
# #     print("\nDone:op", op.shape)
# #     print("Done:hid", hid[0].shape, hid[1].shape)

# if DEBUG >= 4:
#     with torch.no_grad():
        
#         macros = {
#             "ques_len": QUES_LEN,
#             "hidden_dim": HIDDEN_DIM, 
#             "vocab_size": VOCAB_SIZE, 
#             "batch_size": BATCH_SIZE,
#             "para_len": PARA_LEN,
#             "embedding_dim": EMBEDDING_DIM,
#             "lr": LR,
#             "debug":5,
#             "device":device
#         }
            
#         matchLSTMEncoder = MatchLSTMEncoder(macros).cuda(device)
#         hidden = matchLSTMEncoder.init_hidden(BATCH_SIZE)
#         para_embedded = torch.rand((PARA_LEN, BATCH_SIZE, 2*HIDDEN_DIM), device=device)
#         ques_embedded = torch.rand((QUES_LEN, BATCH_SIZE, 2*HIDDEN_DIM), device=device)
#         h_ri = torch.randn(1, BATCH_SIZE, HIDDEN_DIM, device=self.device)
#     #     if DEBUG:
#     #         print ("init h_ri shape is: ", h_ri.shape)
#     #         print ("the para length is ", len(para_embedded))
#         H_r = matchLSTMEncoder(para_embedded.view(-1,BATCH_SIZE,2*HIDDEN_DIM),
#                                h_ri, 
#                                ques_embedded, 
#                                hidden)
#         print("H_r: ", H_r.shape)
        
        
        

### Pointer Network

Using a ptrnet over $H_r$ to unfold and get most probable spans.
We use the **boundary model** to do that (predict start and end of seq).

A simple energy -> softmax -> decoder. Where softmaxed energy is supervised.

In [5]:
# class PointerDecoder(nn.Module):
    
#     def __init__(self, macros, device):
#         super(PointerDecoder, self).__init__()
        
#         # Keep args
#         self.hidden_dim = macros['hidden_dim']
#         self.batch_size = macros['batch_size']
#         self.para_len = macros['para_len']
#         self.debug = macros['debug']
        
#         self.lin_f_repeat = nn.Linear(self.hidden_dim, self.hidden_dim)
#         self.lin_f_nobias = nn.Linear(self.hidden_dim, self.hidden_dim, bias=False)
        
#         self.beta_k_w = nn.Parameter(torch.randn(self.hidden_dim, 1))
#         self.beta_k_b = nn.Parameter(torch.randn(1))
        
#         self.lstm = nn.LSTM(self.hidden_dim*self.para_len, self.hidden_dim)

    
#     def init_hidden(self, batch_size, device):
#         # Before we've done anything, we dont have any hidden state.
#         # Refer to the Pytorch documentation to see exactly
#         # why they have this dimensionality.
#         # The axes semantics are (num_layers, minibatch_size, hidden_dim)
#         return torch.zeros((1, batch_size, self.hidden_dim), device=device)
# #                 torch.zeros((1, batch_size, self.hidden_dim), device=device))
    
#     def forward(self, h_ak, H_r, hidden):
        
#         # h_ak (current decoder's last op) (1,batch,hiddendim)
#         # H_r (weighted summary of para) (P, batch, hiddendim)
#         batch_size = H_r.shape[1]
        
#         if self.debug > 4:
#             print("h_ak:\t\t\t", h_ak.shape)
#             print("H_r:\t\t\t", H_r.shape)
#             print("hidden:\t\t\t", hidden.shape)
            
#         # Prepare inputs for the tanh used to compute energy
#         f_input_b = self.lin_f_repeat(h_ak)
#         if self.debug > 4: print("f_input_b unrepeated:  ", f_input_b.shape)
        
#         #H_r shape is ([PARA_LEN, BATCHSIZE, EmbeddingDIM])
#         f_input_b = f_input_b.repeat(H_r.shape[0], 1, 1)
#         if self.debug > 4: print("f_input_b repeated:\t", f_input_b.shape)
            
#         f_input_a = self.lin_f_nobias(H_r)
#         if self.debug > 4: print("f_input_a:\t\t", f_input_a.shape)
            
#         # Send it off to tanh now
#         F_k = F.tanh(f_input_a+f_input_b)
#         if self.debug > 4: print("F_k:\t\t\t", F_k.shape) #PARA_LEN,BATCHSIZE,EmbeddingDim
            
#         # Attention weights
#         beta_k_input_a = F_k.view(batch_size, -1, self.hidden_dim).matmul(self.beta_k_w).view(batch_size, 1, -1)
#         if self.debug > 4: print("beta_k_input_a:\t\t", beta_k_input_a.shape)
            
#         beta_k_input = beta_k_input_a.add_(self.beta_k_b.repeat(1,1,self.para_len))
#         if self.debug > 4: print("beta_k_input:\t\t", beta_k_input.shape)
            
#         beta_k = F.softmax(beta_k_input, dim=-1)
#         if self.debug > 4: print("beta_k:\t\t\t", beta_k.shape)
            
#         lstm_input_a = H_r.view(batch_size, self.para_len, -1) * (beta_k.view(batch_size, self.para_len, -1).repeat(1,1,self.hidden_dim))
#         if self.debug > 4: print("lstm_input_a:\t\t", lstm_input_a.shape)
            
# #         lstm_input = torch.cat((lstm_input_a.view(1, batch_size,-1), h_ak.view(1, batch_size, -1)), dim=2)
# #         if self.debug > 4: print("lstm_input:\t\t", lstm_input.shape)
        
#         h_ak, (_, hidden) = self.lstm(lstm_input_a.view(1, batch_size, -1), (h_ak, hidden))
        
#         return h_ak, hidden, F.log_softmax(beta_k_input, dim=-1)
            
# if DEBUG > 4:
#     with torch.no_grad():
#         macros = {
#             "ques_len": QUES_LEN,
#             "hidden_dim": HIDDEN_DIM, 
#             "vocab_size": VOCAB_SIZE, 
#             "batch_size": BATCH_SIZE,
#             "para_len": PARA_LEN,
#             "embedding_dim": EMBEDDING_DIM,
#             "lr": LR,
#             "debug":5,
#             "device":device
#         }
        
#         pointerDecoder = PointerDecoder(macros).cuda(device)
#         h_ak = torch.randn(1,BATCH_SIZE,HIDDEN_DIM, device=device)
#         H_r = torch.randn(PARA_LEN, BATCH_SIZE, HIDDEN_DIM, device=device)
#         hidden = pointerDecoder.init_hidden(BATCH_SIZE)
#         h_ak, hidden, beta_k = pointerDecoder(h_ak, H_r, hidden)
#         print (beta_k.shape)

# Pull the real data from disk.

Files stored in `./data/squad/train.ids.*`
Pull both train and test.

In [6]:
def _read_id_rows(path):
    """Read one whitespace-separated list of ints per line of *path*.

    Returns a 1-D object array whose elements are Python lists. Packing the
    rows explicitly (instead of ``np.asarray(list_of_lists)``) keeps rows of
    different lengths legal on every NumPy version and still allows the
    fancy-indexing used for shuffling.
    """
    # `with` closes the handle deterministically instead of leaking it.
    with open(path) as handle:
        rows = [[int(tok) for tok in line.split()] for line in handle]

    packed = np.empty(len(rows), dtype=object)
    for pos, row in enumerate(rows):
        packed[pos] = row
    return packed


def _pad_examples(paras, ques, spans, para_len, ques_len):
    """Pad/truncate one split and one-hot encode its answer spans.

    Examples whose span start or end index is >= ``para_len`` are dropped.
    Returns ``(P, Q, Y_start, Y_end, n_dropped)`` where the four matrices are
    float arrays (``np.zeros`` default dtype) with one row per kept example.
    """
    P = np.zeros((len(paras), para_len))
    Q = np.zeros((len(ques), ques_len))
    Y_start = np.zeros((len(paras), para_len))
    Y_end = np.zeros((len(paras), para_len))

    dropped = []    # row indices to remove once the loop is done
    for i in range(len(paras)):
        p, q, y = paras[i], ques[i], spans[i]

        # The answer must lie inside the truncated paragraph to be learnable.
        if y[0] >= para_len or y[1] >= para_len:
            dropped.append(i)
            continue

        p_keep = min(para_len, len(p))
        q_keep = min(ques_len, len(q))
        P[i, :p_keep] = p[:p_keep]
        Q[i, :q_keep] = q[:q_keep]
        Y_start[i, y[0]] = 1
        Y_end[i, y[1]] = 1

    P = np.delete(P, dropped, axis=0)
    Q = np.delete(Q, dropped, axis=0)
    Y_start = np.delete(Y_start, dropped, axis=0)
    Y_end = np.delete(Y_end, dropped, axis=0)
    return P, Q, Y_start, Y_end, len(dropped)


def prepare_data(data_loc, macros, crop=None):
    """
        Load the id-encoded data found in `data_loc` and turn it into padded matrices.

        Files read from `data_loc`:
            train.ids.question, train.ids.context, train.span,
            val.ids.question, val.ids.context, val.span   (returned as the "test" split),
            glove.new.trimmed.300.npy                     (embedding matrix)

        Inputs:
            data_loc: str, folder containing the files above
            macros: dict; the keys 'para_len', 'ques_len' and 'debug' are read
            crop: optional int. If truthy, `crop` examples are drawn from each split
                  with np.random.choice (default arguments, i.e. with replacement)
                  instead of shuffling the full split.

        Examples whose answer span starts or ends at index >= para_len are dropped.

        **return_type**: np matrices
            train_P, train_Q, train_Y_start, train_Y_end,
            test_P, test_Q, test_Y_start, test_Y_end, vectors
            (Y_* are one-hot rows of width para_len)
    """

    # Unpacking macros
    para_len = macros['para_len']
    ques_len = macros['ques_len']

    train_q = _read_id_rows(os.path.join(data_loc, 'train.ids.question'))
    train_p = _read_id_rows(os.path.join(data_loc, 'train.ids.context'))
    train_y = _read_id_rows(os.path.join(data_loc, 'train.span'))

    test_q = _read_id_rows(os.path.join(data_loc, 'val.ids.question'))
    test_p = _read_id_rows(os.path.join(data_loc, 'val.ids.context'))
    test_y = _read_id_rows(os.path.join(data_loc, 'val.span'))

    if macros['debug'] > 3:
        print("Train Q: ", train_q.shape)
        print("Train P: ", train_p.shape)
        print("Train Y: ", train_y.shape)
        print("Test Q: ", test_q.shape)
        print("Test P: ", test_p.shape)
        print("Test Y: ", test_y.shape)

    # Shuffle (or sub-sample) the examples. The order of the RNG calls is kept
    # as train-then-test so seeded runs stay reproducible.
    if crop:
        index_train = np.random.choice(np.arange(len(train_p)), crop)
        index_test = np.random.choice(np.arange(len(test_p)), crop)
    else:
        index_train, index_test = np.arange(len(train_p)), np.arange(len(test_p))
        np.random.shuffle(index_train)
        np.random.shuffle(index_test)

    train_p, train_q, train_y = train_p[index_train], train_q[index_train], train_y[index_train]
    test_p, test_q, test_y = test_p[index_test], test_q[index_test], test_y[index_test]

    if macros['debug'] >= 5:
        print("Max q len: ", max(len(q) for q in train_q))

    # Pad and prepare
    train_P, train_Q, train_Y_start, train_Y_end, n_crop_train = _pad_examples(
        train_p, train_q, train_y, para_len, ques_len)
    test_P, test_Q, test_Y_start, test_Y_end, n_crop_test = _pad_examples(
        test_p, test_q, test_y, para_len, ques_len)

    if macros['debug'] >= 1:
        print("Train Q: ", train_Q.shape)
        print("Train P: ", train_P.shape)
        print("Train Y: ", train_Y_start.shape)
        print("Test Q: ", test_Q.shape)
        print("Test P: ", test_P.shape)
        print("Test Y: ", test_Y_start.shape)
        print("Crop_train: ", n_crop_train)
        print("Crop_test: ", n_crop_test)

    # Let's free up some memory before the embedding matrix is loaded
    del train_p, train_q, train_y, test_p, test_q, test_y

    # Load embedding matrix
    vectors = np.load(os.path.join(data_loc, 'glove.new.trimmed.300.npy'))

    return train_P, train_Q, train_Y_start, train_Y_end, test_P, test_Q, test_Y_start, test_Y_end, vectors

In [7]:
# macros = {
#     "ques_len": QUES_LEN,
#     "hidden_dim": HIDDEN_DIM, 
#     "vocab_size": VOCAB_SIZE, 
#     "batch_size": BATCH_SIZE,
#     "para_len": PARA_LEN,
#     "embedding_dim": EMBEDDING_DIM,
#     "debug": 5
# } 

# a = prepare_data(DATA_LOC, macros=macros, crop=None)

# Training, and running the model
- Write a train fn
- Write a training loop invoking it
- Fill in real data

----------

Feats:
- Function to test every n epochs.
- Report train accuracy every epoch
- Store the train, test accuracy for every instance.


In [8]:
# Function to save the model
def save_model(loc, models, epochs=0, optimizer=None):
    """
        Serialise every model in `models` to `<loc>/<model_name>.torch` via torch.save.

        Input:
            loc: str of the folder where the models are to be saved
                 (created if it does not exist yet)
            models: dict of 'model_name': model_object; exactly four entries expected
            epochs, optimizer: int, torch.optim object (accepted but discarded right now).

        Raises:
            AssertionError if `models` is not a dict holding exactly four entries.
    """

    # Assumes four models. Doesn't save device/epochs/optimizer right now.
    assert isinstance(models, dict) and len(models) == 4

    # torch.save writes to the given path as-is, so make sure the target
    # folder is there before the first write.
    if not os.path.isdir(loc):
        os.makedirs(loc)

    for name, model in models.items():
        torch.save(model, os.path.join(loc, name + '.torch'))
    

In [9]:
def train(para_batch,
          ques_batch,
          answer_start_batch,
          answer_end_batch,
          ques_model,
          para_model,
          mlstm_model,
          pointer_decoder_model,
          optimizer, 
          loss_fn,
          macros,
          debug=2):

    """
    Run a single optimisation step (forward, loss, backward, optimizer.step) on one batch.

    :param para_batch: paragraphs (batch, max_seq_len_para) 
    :param ques_batch: questions corresponding to para (batch, max_seq_len_ques)
    :param answer_start_batch: one-hot vector denoting pos of span start; `.max(dim=2)` is
        taken on it below, so it must be 3-D (presumably (batch, 1, max_seq_len_para) - confirm with caller)
    :param answer_end_batch: one-hot vector denoting pos of span end, same layout as the start batch
    
    # Models
    :param ques_model: model to encode ques
    :param para_model: model to encode para
    :param mlstm_model: model to match para, ques to get para summary
    :param pointer_decoder_model: model to get a pointer over start and end span pointer
    
    # Loss and Optimizer.
    :param loss_fn: callable(prediction, target); called with predictions reshaped to
        (-1, para_len) and class-index targets of shape (batch,)
    :param optimizer: object exposing zero_grad() and step()
    
    # Misc
    :param macros: dict; reads 'batch_size', 'hidden_dim' and 'para_len'
    :param debug: int verbosity; >= 2 prints shapes and progress banners
    
    :return: (beta_k_start, beta_k_end, loss) on success.
        If any Exception is raised inside the step, its traceback is printed and
        None is returned instead. KeyboardInterrupt / SystemExit are NOT swallowed,
        so a running training loop can still be interrupted.
    
    
    NOTE: When using MSE, 
        - target labels are one-hot
        - target label is float tensor
        - shape (batch, 1, len)
        
        When using CrossEntropy
        - target is not onehot
        - long
        - shape (batch, )

    NOTE(review): hidden states are sized from macros['batch_size'], not from the
    batch actually passed in; a smaller final batch will fail in the `.view` below.
    """
    try:

        if debug >=2: 
            print("\tpara_batch:\t\t", para_batch.shape)
            print("\tques_batch:\t\t", ques_batch.shape)
            print("\tanswer_start_batch:\t", answer_start_batch.shape)
            print("\tanswer_end_batch:\t\t", answer_end_batch.shape)

        # Wiping all gradients
        optimizer.zero_grad()

        # Initializing all hidden states.
        hidden_quesenc = ques_model.init_hidden(macros['batch_size'], device)
        hidden_paraenc = para_model.init_hidden(macros['batch_size'], device)
        hidden_mlstm = mlstm_model.init_hidden(macros['batch_size'], device)
        hidden_ptrnet = pointer_decoder_model.init_hidden(macros['batch_size'], device)
        h_ri = torch.zeros((1, macros['batch_size'], macros['hidden_dim']), dtype=torch.float, device=device)
        h_ak = torch.zeros((1, macros['batch_size'], macros['hidden_dim']), dtype=torch.float, device=device)
        if debug >= 2: print("------------Instantiated hidden states------------")

        # Passing the data through the LSTM pre-processing layer
        H_q, ques_model_hidden = ques_model(ques_batch, hidden_quesenc, device=device)
        H_p, para_model_hidden = para_model(para_batch, hidden_paraenc, device=device)
        if debug >= 2: 
            print("\tH_q:\t\t", H_q.shape)
            print("\tH_p:\t\t", H_p.shape)
            print("\tH_ri:\t\t", h_ri.shape)
            print("------------Encoded hidden states------------")

        H_r = mlstm_model(H_p.view(-1, macros['batch_size'], 2*macros['hidden_dim']), h_ri, H_q, hidden_mlstm, device=device)
        if debug >= 2: print("------------Passed through matchlstm------------")

        # Passing the paragraph summary through the pointer network twice:
        # the first call scores the span start, the second (fed the updated
        # h_ak / hidden state) scores the span end.
        h_ak, hidden_ptrnet, beta_k_start = pointer_decoder_model(h_ak, H_r, hidden_ptrnet, device=device)
        h_ak, hidden_ptrnet, beta_k_end = pointer_decoder_model(h_ak, H_r, hidden_ptrnet, device=device)
        if debug >= 2: print("------------Passed through pointernet------------")


        # For crossentropy: turn the one-hot targets into flat class indices
        _, answer_start_batch = answer_start_batch.max(dim=2)
        _, answer_end_batch = answer_end_batch.max(dim=2)
        answer_start_batch = answer_start_batch.view(-1).long()
        answer_end_batch = answer_end_batch.view(-1).long()
        
        # Calculate Loss (start loss + end loss)
        loss = loss_fn(beta_k_start.view(-1, macros['para_len']), answer_start_batch)
        loss += loss_fn(beta_k_end.view(-1, macros['para_len']), answer_end_batch)
        if debug >= 2: print("------------Calculated loss------------")

        loss.backward()
        if debug >= 2: print("------------Calculated Gradients------------")

        # Optimization step
        optimizer.step()
        if debug >= 2: print("------------Updated weights.------------")
            
        return beta_k_start, beta_k_end, loss
    
    except Exception:
        # Best-effort: report the failure and hand None back to the caller.
        # Only Exception is caught so Ctrl-C / SystemExit still propagate.
        traceback.print_exc()
        return None

In [10]:
# Predict function (no grad, no eval)
def predict(para_batch,
            ques_batch,
            ques_model,
            para_model,
            mlstm_model,
            pointer_decoder_model,
            macros,
            loss_fn=None,
            debug=DEBUG):
    """
        Forward pass only: run the four models on a batch of P&Q's under torch.no_grad().
        Does not convert to strings, gives the direct model output.

        Expects:
            para_batch, ques_batch: the data; the batch size is read from ques_batch.shape[0]
            four models: ques_model, para_model, mlstm_model, pointer_decoder_model
            macros: dict, only 'hidden_dim' is read
            loss_fn: accepted but unused right now
            debug: int verbosity; >= 2 prints shapes and progress banners

        Returns:
            (beta_k_start, beta_k_end, 0.0) -- the third entry is a placeholder
            for a loss value that is not computed here.
    """

    n_examples = ques_batch.shape[0]
    hid = macros['hidden_dim']
    verbose = debug >= 2

    if verbose:
        print("\tpara_batch:\t\t", para_batch.shape)
        print("\tques_batch:\t\t", ques_batch.shape)

    with torch.no_grad():

        # Fresh hidden states for every sub-model, sized to this batch.
        ques_state = ques_model.init_hidden(n_examples, device)
        para_state = para_model.init_hidden(n_examples, device)
        match_state = mlstm_model.init_hidden(n_examples, device)
        ptr_state = pointer_decoder_model.init_hidden(n_examples, device)
        zeros_shape = (1, n_examples, hid)
        h_ri = torch.zeros(zeros_shape, dtype=torch.float, device=device)
        h_ak = torch.zeros(zeros_shape, dtype=torch.float, device=device)
        if verbose:
            print("------------Instantiated hidden states------------")

        # Encode question and paragraph.
        H_q, _ques_hidden = ques_model(ques_batch, ques_state, device)
        H_p, _para_hidden = para_model(para_batch, para_state, device)
        if verbose:
            print("\tH_q:\t\t", H_q.shape)
            print("\tH_p:\t\t", H_p.shape)
            print("\tH_ri:\t\t", h_ri.shape)
            print("------------Encoded hidden states------------")

        # Match-LSTM summary of the paragraph w.r.t. the question.
        para_view = H_p.view(-1, n_examples, 2 * hid)
        H_r = mlstm_model(para_view, h_ri, H_q, match_state, device)
        if verbose:
            print("------------Passed through matchlstm------------")

        # Two pointer-net steps: first gives the start scores, second the end scores.
        h_ak, ptr_state, beta_k_start = pointer_decoder_model(h_ak, H_r, ptr_state, device)
        end_step = pointer_decoder_model(h_ak, H_r, ptr_state, device)
        beta_k_end = end_step[2]
        if verbose:
            print("------------Passed through pointernet------------")

        return (beta_k_start, beta_k_end, 0.0)


In [11]:
# Eval function (no grad no eval no nothing)
def eval(y_cap, y, metrics=None):
    """ 
        Compute evaluation metrics for predicted answer spans.
        Only the exact-match (em) metric is implemented so far (more: TODO).

        NOTE: the name shadows the builtin `eval`; it is kept because callers
        in this file use it.

        Inputs:
        - y_cap: pair (start, end) of 2-D score tensors; the predicted position
                 of each example is the argmax over dim 1.
        - y: pair (start, end) of 2-D tensors laid out like y_cap; the gold
             position of each example is the argmax over dim 1.
        - metrics: optional dict whose keys select the metrics to compute.
                   Defaults to a fresh {'em': None} on every call. A dict
                   passed by the caller is updated in place.

        Returns:
        - the metrics dict; metrics['em'] is the fraction of examples whose
          predicted start AND end positions both equal the gold ones.
    """

    # A fresh dict per call: a mutable default would be shared between calls,
    # so a later call would overwrite the result returned by an earlier one.
    if metrics is None:
        metrics = {'em': None}

    pred_start, pred_end = torch.argmax(y_cap[0], dim=1), torch.argmax(y_cap[1], dim=1)
    gold_start, gold_end = torch.argmax(y[0], dim=1), torch.argmax(y[1], dim=1)

    if "em" in metrics:
        # An example counts only when both span boundaries are correct.
        exact = gold_start.eq(pred_start) & gold_end.eq(pred_end)
        metrics['em'] = exact.sum().item() / float(gold_start.shape[0])

    if DEBUG >= 3: 
        print("Test performance: ", metrics)
        print("------------Evaluated------------")

    return metrics

if DEBUG >= 5:
    # Smoke test for eval(): two examples given as (start, end) pairs.
    # The first example matches on both boundaries, the second has a
    # different start position, so the printed em should be 0.5.
    metrics = {'em': None}
    y = (
        torch.tensor([[0, 0, 3, 0], [0, 2, 0, 0]]),
        torch.tensor([[0, 0, 0, 3], [0, 0, 0, 3]]),
    )
    y_cap = (
        torch.tensor([[0, 0, 3, 0], [0, 0, 3, 0]]),
        torch.tensor([[0, 0, 0, 3], [0, 0, 0, 3]]),
    )
    print(eval(y_cap, y))

In [12]:
def training_loop(_models, _data, _macros, _epochs, _save=0, _test_eval=0, _train_eval=0, _debug=2):
    """
        Train the question encoder, paragraph encoder, match-LSTM and pointer
        decoder jointly, optionally evaluating and saving along the way.

        Per epoch:
            > draw len(train) // batch_size random mini-batches (indices are
              sampled with replacement) and pass each one to train()
            > record the batch loss (and the batch EM if _train_eval)
            > if _test_eval and epoch % _test_eval == 0: run predict() on the
              test split and record its EM
            > save the models according to _save

        Inputs:
        - _models: [ques_model, para_model, mlstm_model, pointer_decoder_model]
        - _data: dict with 'train' and 'test' entries, each holding
                 'P', 'Q', 'Ys' and 'Ye'
        - _macros: dict; the keys read here are 'batch_size', 'para_len',
                   'lr', 'debug' and 'save_model_loc'
        - _epochs: number of epochs to run
        - _save: int
            > 0: dont
            > 1+: save every _save-th epoch
            > -1: save whenever the test EM beats the best one seen so far
                  (turned into 1 if test evals dont happen)
        - _test_eval: int; 0 disables test evaluation, k runs it every k-th epoch
        - _train_eval: truthy to compute EM on every training mini-batch
        - _debug: not used by this function; kept so existing callers keep
                  working. train()/predict() receive _macros['debug'].

        Returns:
        - (train_losses, train_em, test_losses, test_em)
            > train_losses: one list of batch losses per epoch
            > train_em: one list of batch EMs per epoch (only if _train_eval)
            > test_losses, test_em: one entry per test evaluation

        A Ctrl+C or an exception inside the loop stops training; the exception
        traceback is printed and whatever was collected so far is returned.
    """

    # Unpack data
    train_P = _data['train']['P']
    train_Q = _data['train']['Q']
    train_Y_start = _data['train']['Ys']
    train_Y_end = _data['train']['Ye']
    test_P = _data['test']['P']
    test_Q = _data['test']['Q']
    test_Y_start = _data['test']['Ys']
    test_Y_end = _data['test']['Ye']

    ques_model, para_model, mlstm_model, pointer_decoder_model = _models

    # Everything configurable comes from the _macros argument, never from
    # module-level globals, so the caller's settings are the ones applied.
    batch_size = _macros['batch_size']
    para_len = _macros['para_len']
    n_batches = len(train_P) // batch_size

    # "Save best" relies on test evaluations; without them fall back to
    # saving on every epoch, as the contract above promises.
    if _save == -1 and not _test_eval:
        _save = 1

    # Instantiate loss and optimizer. Encoder parameters with
    # requires_grad=False are left out of the optimizer.
    loss_fn = nn.NLLLoss()
    trainable = [p for p in ques_model.parameters() if p.requires_grad]
    trainable += [p for p in para_model.parameters() if p.requires_grad]
    trainable += list(mlstm_model.parameters())
    trainable += list(pointer_decoder_model.parameters())
    optimizer = optim.Adam(trainable, lr=_macros['lr'])

    # Losses
    train_losses = []
    train_em = []
    test_losses = []
    test_em = []
    best_test_em = 0.0
    found_best_test_em = False

    try:

        # Training Loop
        for epoch in range(_epochs):
            print("Epoch: ", epoch, "/", _epochs)

            epoch_loss = []
            epoch_train_em = []
            epoch_time = time.time()

            for batch_idx in range(n_batches):

                batch_time = time.time()

                # Sample batch and train on it
                sample_index = np.random.randint(0, len(train_P), batch_size)

                y_cap_start, y_cap_end, loss = train(
                    para_batch = torch.tensor(train_P[sample_index], dtype=torch.long, device=device),
                    ques_batch = torch.tensor(train_Q[sample_index], dtype=torch.long, device=device),
                    answer_start_batch = torch.tensor(train_Y_start[sample_index], dtype=torch.float, device=device).view(batch_size, 1, para_len),
                    answer_end_batch = torch.tensor(train_Y_end[sample_index], dtype=torch.float, device=device).view(batch_size, 1, para_len),
                    ques_model = ques_model,
                    para_model = para_model,
                    mlstm_model = mlstm_model,
                    pointer_decoder_model = pointer_decoder_model,
                    optimizer = optimizer,
                    loss_fn = loss_fn,
                    macros = _macros,
                    debug = _macros['debug']
                )

                if _train_eval:

                    # Calculate train accuracy for this minibatch
                    metrics = eval(
                        y=(torch.tensor(train_Y_start[sample_index], dtype=torch.long, device=device).view(-1, para_len),
                           torch.tensor(train_Y_end[sample_index], dtype=torch.long, device=device).view(-1, para_len)),
                        y_cap=[y_cap_start.squeeze(), y_cap_end.squeeze()])

                    epoch_train_em.append(metrics['em'])

                batch_loss = loss.item()
                epoch_loss.append(batch_loss)

                print("Batch:\t%d" % batch_idx, "/%d\t: " % n_batches,
                      str("%s" % (time.time() - batch_time))[:8],
                      str("\t\b%s" % (time.time() - epoch_time))[:10],
                      "\tl:%f" % batch_loss,
                      "\tem:%f" % epoch_train_em[-1] if _train_eval else "")

            train_losses.append(epoch_loss)

            if _train_eval: train_em.append(epoch_train_em)
            if _test_eval and epoch % _test_eval == 0:

                # NOTE(review): the whole test split goes through predict() in
                # a single call; confirm predict() supports a batch dimension
                # other than the configured batch size.
                y_cap_start, y_cap_end, test_loss = predict(
                    para_batch = torch.tensor(test_P, dtype=torch.long, device=device),
                    ques_batch = torch.tensor(test_Q, dtype=torch.long, device=device),
                    ques_model = ques_model,
                    para_model = para_model,
                    mlstm_model = mlstm_model,
                    pointer_decoder_model = pointer_decoder_model,
                    macros = _macros,
                    loss_fn = loss_fn,
                    debug = _macros['debug']
                )
                metrics = eval(
                    y=(torch.tensor(test_Y_start, dtype=torch.long, device=device).view(-1, para_len),
                       torch.tensor(test_Y_end, dtype=torch.long, device=device).view(-1, para_len)),
                    y_cap=[y_cap_start.squeeze(), y_cap_end.squeeze()])

                test_losses.append(test_loss)
                test_em.append(metrics['em'])

                # Check if we outperformed the best one.
                if metrics['em'] > best_test_em:
                    found_best_test_em = True
                    best_test_em = metrics['em']

            # Saving logic: periodic (_save > 0) or on a new best test EM (_save == -1)
            if (_save > 0 and epoch % _save == 0) or \
               (_save == -1 and found_best_test_em):
                models = { 'ques_model': ques_model,
                           'para_model': para_model,
                           'mlstm_model':  mlstm_model,
                           'pointer_decoder_model': pointer_decoder_model
                         }

                save_model(_macros['save_model_loc'], models,
                           epochs=epoch,
                           optimizer=optimizer)

                print("Saving new model on epoch %d" % epoch)

            # Reset flags
            found_best_test_em = False

            # At the end of every epoch, do print the average epoch loss, and other stat
            print("\nEpoch performance: ",
                  "%ssec" % str(time.time() - epoch_time)[:6],
                  "Trl:%f" % np.mean(epoch_loss, axis=0),
                  "\tTrem:%f" % np.mean(epoch_train_em) if _train_eval and epoch % _train_eval == 0 else "",
                  "\tTeem:%f\n" % test_em[-1] if _test_eval and epoch % _test_eval == 0 else "\n")

    except KeyboardInterrupt:

        # someone called a ctrl+c on it. Let's return the things computed so far at least.
        print("Found keyboard interrupt. Stopping training loop")

    except Exception:
        # Report the failure but still hand back the partial results.
        # SystemExit and friends are deliberately allowed to propagate.
        traceback.print_exc()

    return train_losses, train_em, test_losses, test_em
            
            

In [13]:
def visualize_loss(loss, _label="Some label", _only_epoch=True):
    """
        Fn to visualize loss (or any other per-epoch / per-batch metric).

        Inputs:
        - loss: either
            - [num, num, ...] for epoch level stuff
            - [ [num, num], [num, num] ] for batch level data
              (one inner list or tuple per epoch)
        - _label: y-axis label of the plot
        - _only_epoch: only used for batch level data
            > True: plot one point per epoch, the SUM of that epoch's values
            > False: plot every batch value, epochs concatenated in order

        Empty input, or entries that are neither numbers nor lists/tuples,
        are reported with a printed message and nothing is plotted.
    """
    # Local import keeps this cell self-contained.
    import numbers

    if len(loss) == 0:
        print("visualize_loss: nothing to plot for '%s'" % _label)
        return

    first = loss[0]

    # Detect input format
    if isinstance(first, (list, tuple)):
        if _only_epoch:
            series = [sum(x) for x in loss]
        else:
            series = [y for x in loss for y in x]
    elif isinstance(first, numbers.Number):
        # numbers.Number covers int/float on Python 2 and 3 (including
        # Python 2 long) as well as numpy scalar types.
        series = loss
    else:
        print("visualize_loss: unsupported entry type '%s' for '%s'"
              % (type(first).__name__, _label))
        return

    plt.rcParams['figure.figsize'] = [15, 8]
    plt.plot(series)
    plt.ylabel(_label)
    plt.show()

## Orchestrator

One cell which instantiates and runs everything

In [14]:
"""
    Orchestrator cell.

    > collects the global macros into one dict
    > prepares the train / test data and the embedding vectors
    > later cells hand the models and this data to the training loop,
      visualize the losses and reload the saved models

No other function but this one ever sees global macros!
"""
macros = dict(
    ques_len=QUES_LEN,
    hidden_dim=HIDDEN_DIM,
    vocab_size=VOCAB_SIZE,
    batch_size=BATCH_SIZE,
    para_len=PARA_LEN,
    embedding_dim=EMBEDDING_DIM,
    lr=LR,
    debug=DEBUG,
    save_model_loc=MODEL_LOC,
    # device=device
)

# prepare_data returns the four train arrays, the four test arrays and the
# embedding vectors, in this order.
(_train_p, _train_q, _train_ys, _train_ye,
 _test_p, _test_q, _test_ys, _test_ye,
 vectors) = prepare_data(DATA_LOC, macros, crop=CROP)

data = {
    'train': {'P': _train_p, 'Q': _train_q, 'Ys': _train_ys, 'Ye': _train_ye},
    'test': {'P': _test_p, 'Q': _test_q, 'Ys': _test_ys, 'Ye': _test_ye},
}

Train Q:  (585, 30)
Train P:  (585, 200)
Train Y:  (585, 200)
Test Q:  (585, 30)
Test P:  (585, 200)
Test Y:  (585, 200)
Crop_train:  7
Crop_test:  7


In [15]:
# Instantiate the four sub-models and move each of them to the GPU `device`.
# Both encoders are built from the same `vectors`; they differ only in the
# length argument (QUES_LEN vs PARA_LEN).
ques_model = Encoder(QUES_LEN, macros, vectors, device).cuda(device)
para_model = Encoder(PARA_LEN, macros, vectors, device).cuda(device)
mlstm_model = MatchLSTMEncoder(macros, device).cuda(device)
pointer_decoder_model = PointerDecoder(macros, device).cuda(device)

In [None]:
# Run training. `op` receives (train_losses, train_em, test_losses, test_em).
#   _save=-1      -> save the models whenever the test EM improves
#   _test_eval=1  -> evaluate on the test split after every epoch
#   _train_eval=1 -> also compute EM on every training mini-batch
op = training_loop(
    _models=[ques_model, para_model, mlstm_model, pointer_decoder_model],
    _data=data,
    _macros=macros,
    _epochs=EPOCHS,
    _save=-1,
    _test_eval=1,
    _train_eval=1,
    _debug=macros['debug'],
)

Epoch:  0 / 300
Batch:	0 /13	:  2.899268 2.899290 	l:10.599597 	em:0.000000
Batch:	1 /13	:  2.098588 4.998671 	l:10.601473 	em:0.000000
Batch:	2 /13	:  1.224442 6.224090 	l:16.737503 	em:0.000000
Batch:	3 /13	:  1.110588 7.336225 	l:11.116800 	em:0.000000
Batch:	4 /13	:  1.148734 8.485335 	l:11.286239 	em:0.000000
Batch:	5 /13	:  1.208545 9.695005 	l:11.417948 	em:0.000000
Batch:	6 /13	:  1.416246 11.11267 	l:11.204296 	em:0.000000
Batch:	7 /13	:  1.338632 12.45235 	l:11.151123 	em:0.000000
Batch:	8 /13	:  1.203566 13.65629 	l:10.702057 	em:0.000000
Batch:	9 /13	:  1.236538 14.89353 	l:10.615250 	em:0.000000
Batch:	10 /13	:  1.089864 15.98459 	l:10.679499 	em:0.000000
Batch:	11 /13	:  1.202126 17.18744 	l:10.677469 	em:0.000000
Batch:	12 /13	:  1.219276 18.40701 	l:10.562300 	em:0.000000

Epoch performance:  20.184sec Trl:11.334735 	Trem:0.000000 	Teem:0.000000

Epoch:  1 / 300
Batch:	0 /13	:  1.288601 1.288634 	l:10.558344 	em:0.000000
Batch:	1 /13	:  1.295498 2.585401 	l:10.593222 	e

Batch:	6 /13	:  1.629773 10.16144 	l:10.146042 	em:0.000000
Batch:	7 /13	:  1.264277 11.42599 	l:10.143568 	em:0.000000
Batch:	8 /13	:  1.268383 12.69583 	l:10.177079 	em:0.000000
Batch:	9 /13	:  1.224024 13.92065 	l:10.193758 	em:0.000000
Batch:	10 /13	:  1.100074 15.03355 	l:10.274378 	em:0.000000
Batch:	11 /13	:  1.246199 16.27998 	l:9.976515 	em:0.000000
Batch:	12 /13	:  1.214328 17.49484 	l:10.207054 	em:0.000000

Epoch performance:  18.031sec Trl:10.180516 	Trem:0.000000 	Teem:0.000000

Epoch:  10 / 300
Batch:	0 /13	:  1.317121 1.317137 	l:10.153084 	em:0.000000
Batch:	1 /13	:  1.185134 2.502820 	l:9.943476 	em:0.000000
Batch:	2 /13	:  1.272467 3.775929 	l:10.317671 	em:0.000000
Batch:	3 /13	:  1.306425 5.082540 	l:10.136183 	em:0.000000
Batch:	4 /13	:  1.214663 6.298072 	l:10.235573 	em:0.000000
Batch:	5 /13	:  1.206660 7.505429 	l:10.027359 	em:0.000000
Batch:	6 /13	:  1.162761 8.669305 	l:10.242208 	em:0.000000
Batch:	7 /13	:  1.244574 9.914696 	l:9.892371 	em:0.000000
Batch:	


Epoch performance:  16.634sec Trl:9.166313 	Trem:0.000000 	Teem:0.000000

Epoch:  19 / 300
Batch:	0 /13	:  1.191817 1.191838 	l:8.954819 	em:0.000000
Batch:	1 /13	:  1.218003 2.410455 	l:8.967849 	em:0.000000
Batch:	2 /13	:  1.201412 3.612701 	l:8.913912 	em:0.000000
Batch:	3 /13	:  1.200503 4.813416 	l:9.103481 	em:0.000000
Batch:	4 /13	:  1.257920 6.071951 	l:8.912699 	em:0.000000
Batch:	5 /13	:  1.266173 7.338605 	l:8.579823 	em:0.000000
Batch:	6 /13	:  1.255811 8.595403 	l:9.315403 	em:0.000000
Batch:	7 /13	:  1.328448 9.924140 	l:8.925169 	em:0.000000
Batch:	8 /13	:  1.238105 11.16472 	l:9.704403 	em:0.000000
Batch:	9 /13	:  1.325431 12.49043 	l:9.150646 	em:0.000000
Batch:	10 /13	:  1.340656 13.83174 	l:8.806993 	em:0.000000
Batch:	11 /13	:  1.252622 15.08485 	l:8.976320 	em:0.000000
Batch:	12 /13	:  1.240444 16.32616 	l:9.013291 	em:0.000000

Epoch performance:  16.861sec Trl:9.024985 	Trem:0.000000 	Teem:0.000000

Epoch:  20 / 300
Batch:	0 /13	:  1.199485 1.199501 	l:9.031777 

Batch:	6 /13	:  1.278530 8.810286 	l:8.814169 	em:0.000000
Batch:	7 /13	:  1.675161 10.48594 	l:8.302948 	em:0.000000
Batch:	8 /13	:  1.498485 11.98517 	l:8.300072 	em:0.000000
Batch:	9 /13	:  1.343045 13.32871 	l:8.577065 	em:0.000000
Batch:	10 /13	:  1.268935 14.59860 	l:8.194869 	em:0.022222
Batch:	11 /13	:  1.213076 15.81195 	l:8.561716 	em:0.000000
Batch:	12 /13	:  1.338537 17.15098 	l:8.463583 	em:0.000000

Epoch performance:  17.682sec Trl:8.506818 	Trem:0.001709 	Teem:0.000000

Epoch:  29 / 300
Batch:	0 /13	:  1.334999 1.335021 	l:8.615215 	em:0.000000
Batch:	1 /13	:  1.348253 2.683615 	l:8.661180 	em:0.000000
Batch:	2 /13	:  1.283866 3.967972 	l:8.382283 	em:0.000000
Batch:	3 /13	:  1.182403 5.151139 	l:8.780975 	em:0.000000
Batch:	4 /13	:  1.220479 6.371823 	l:8.249283 	em:0.022222
Batch:	5 /13	:  1.167196 7.540329 	l:8.417465 	em:0.000000
Batch:	6 /13	:  1.176419 8.718096 	l:8.695539 	em:0.000000
Batch:	7 /13	:  1.236386 9.954911 	l:8.904514 	em:0.000000
Batch:	8 /13	:  1.26

Batch:	0 /13	:  1.208418 1.208437 	l:8.189112 	em:0.044444
Batch:	1 /13	:  1.187083 2.396034 	l:7.846777 	em:0.022222
Batch:	2 /13	:  1.186818 3.585213 	l:8.384945 	em:0.022222
Batch:	3 /13	:  1.244778 4.830187 	l:8.212650 	em:0.022222
Batch:	4 /13	:  1.173564 6.006362 	l:9.082903 	em:0.000000
Batch:	5 /13	:  1.262842 7.269747 	l:7.940022 	em:0.000000
Batch:	6 /13	:  1.267364 8.538008 	l:8.137354 	em:0.044444
Batch:	7 /13	:  1.587845 10.12637 	l:8.213108 	em:0.022222
Batch:	8 /13	:  1.364126 11.49139 	l:7.382912 	em:0.044444
Batch:	9 /13	:  1.301906 12.79398 	l:8.252935 	em:0.000000
Batch:	10 /13	:  1.268820 14.06402 	l:7.739686 	em:0.022222
Batch:	11 /13	:  1.221064 15.28605 	l:8.968601 	em:0.022222
Batch:	12 /13	:  1.265130 16.55171 	l:8.375381 	em:0.022222

Epoch performance:  17.088sec Trl:8.209722 	Trem:0.022222 	Teem:0.000000

Epoch:  39 / 300
Batch:	0 /13	:  1.184944 1.184961 	l:7.846654 	em:0.044444
Batch:	1 /13	:  1.217423 2.403501 	l:8.080296 	em:0.000000
Batch:	2 /13	:  1.29

Batch:	8 /13	:  1.191896 11.19995 	l:7.661027 	em:0.000000
Batch:	9 /13	:  1.261132 12.46136 	l:7.414926 	em:0.066667
Batch:	10 /13	:  1.231427 13.69401 	l:7.735720 	em:0.044444
Batch:	11 /13	:  1.206017 14.90061 	l:7.761703 	em:0.022222
Batch:	12 /13	:  1.242202 16.14340 	l:7.844460 	em:0.044444

Epoch performance:  16.677sec Trl:7.848049 	Trem:0.022222 	Teem:0.000000

Epoch:  48 / 300
Batch:	0 /13	:  1.303324 1.303343 	l:7.683990 	em:0.000000
Batch:	1 /13	:  1.305784 2.609755 	l:7.365055 	em:0.088889
Batch:	2 /13	:  1.166458 3.776457 	l:7.263144 	em:0.022222
Batch:	3 /13	:  1.353377 5.130269 	l:7.271157 	em:0.044444
Batch:	4 /13	:  1.226620 6.357568 	l:8.305884 	em:0.022222
Batch:	5 /13	:  1.162201 7.520509 	l:7.489197 	em:0.022222
Batch:	6 /13	:  1.186228 8.707853 	l:6.982262 	em:0.044444
Batch:	7 /13	:  1.261159 9.969650 	l:7.689156 	em:0.000000
Batch:	8 /13	:  1.327145 11.29728 	l:7.981586 	em:0.022222
Batch:	9 /13	:  1.636858 12.93541 	l:7.011295 	em:0.044444
Batch:	10 /13	:  1.4

Batch:	1 /13	:  1.116771 2.414994 	l:7.004052 	em:0.022222
Batch:	2 /13	:  1.194713 3.610310 	l:7.095398 	em:0.044444
Batch:	3 /13	:  1.173743 4.784820 	l:7.178915 	em:0.044444
Batch:	4 /13	:  1.237162 6.022219 	l:7.735615 	em:0.022222
Batch:	5 /13	:  1.425894 7.448781 	l:6.824162 	em:0.066667
Batch:	6 /13	:  1.187648 8.636726 	l:7.195035 	em:0.088889
Batch:	7 /13	:  1.205189 9.842931 	l:7.197079 	em:0.044444
Batch:	8 /13	:  1.229902 11.07312 	l:7.488467 	em:0.066667
Batch:	9 /13	:  1.196325 12.27233 	l:7.190402 	em:0.066667
Batch:	10 /13	:  1.255527 13.52882 	l:6.514241 	em:0.088889
Batch:	11 /13	:  1.230661 14.76003 	l:6.350024 	em:0.022222
Batch:	12 /13	:  1.197080 15.95757 	l:6.341732 	em:0.066667

Epoch performance:  16.496sec Trl:7.056477 	Trem:0.054701 	Teem:0.000000

Epoch:  58 / 300
Batch:	0 /13	:  1.159960 1.159981 	l:6.961891 	em:0.133333
Batch:	1 /13	:  1.243587 2.404264 	l:7.294584 	em:0.022222
Batch:	2 /13	:  1.290464 3.695657 	l:7.335271 	em:0.044444
Batch:	3 /13	:  1.18

Batch:	9 /13	:  1.611346 12.59626 	l:7.386455 	em:0.000000
Batch:	10 /13	:  1.194097 13.79104 	l:6.816950 	em:0.088889
Batch:	11 /13	:  1.239887 15.03181 	l:5.948918 	em:0.022222
Batch:	12 /13	:  1.161570 16.19420 	l:7.105240 	em:0.066667

Epoch performance:  16.728sec Trl:6.678368 	Trem:0.066667 	Teem:0.000000

Epoch:  67 / 300
Batch:	0 /13	:  1.379084 1.379101 	l:6.961443 	em:0.066667
Batch:	1 /13	:  1.204032 2.583961 	l:6.432675 	em:0.111111
Batch:	2 /13	:  1.185533 3.770380 	l:6.509604 	em:0.177778
Batch:	3 /13	:  1.328209 5.099201 	l:6.391733 	em:0.088889
Batch:	4 /13	:  1.148263 6.248287 	l:6.835643 	em:0.044444
Batch:	5 /13	:  1.335870 7.584961 	l:6.237793 	em:0.022222
Batch:	6 /13	:  1.264353 8.849849 	l:6.721598 	em:0.044444
Batch:	7 /13	:  1.215543 10.06590 	l:6.228030 	em:0.133333
Batch:	8 /13	:  1.170289 11.23667 	l:6.539603 	em:0.088889
Batch:	9 /13	:  1.255450 12.49269 	l:6.015684 	em:0.133333
Batch:	10 /13	:  1.216995 13.71051 	l:6.059768 	em:0.088889
Batch:	11 /13	:  1.

Batch:	2 /13	:  1.235563 3.849728 	l:6.714953 	em:0.044444
Batch:	3 /13	:  1.299339 5.149561 	l:6.024949 	em:0.111111
Batch:	4 /13	:  1.291872 6.442435 	l:5.489487 	em:0.022222
Batch:	5 /13	:  1.325559 7.770185 	l:6.314895 	em:0.088889
Batch:	6 /13	:  1.280632 9.051473 	l:6.876702 	em:0.066667
Batch:	7 /13	:  1.225950 10.27811 	l:5.839423 	em:0.088889
Batch:	8 /13	:  1.195107 11.47454 	l:5.945044 	em:0.133333
Batch:	9 /13	:  1.249892 12.72524 	l:5.917655 	em:0.111111
Batch:	10 /13	:  1.452963 14.17847 	l:5.711803 	em:0.133333
Batch:	11 /13	:  1.469702 15.64889 	l:6.104566 	em:0.044444
Batch:	12 /13	:  1.190299 16.84049 	l:6.036381 	em:0.066667

Epoch performance:  17.378sec Trl:6.127960 	Trem:0.083761 	Teem:0.000000

Epoch:  77 / 300
Batch:	0 /13	:  1.335264 1.335281 	l:6.661150 	em:0.133333
Batch:	1 /13	:  1.268509 2.604518 	l:6.416002 	em:0.088889
Batch:	2 /13	:  1.310543 3.915302 	l:5.776568 	em:0.177778
Batch:	3 /13	:  1.274667 5.190840 	l:6.713096 	em:0.044444
Batch:	4 /13	:  1.28

Batch:	10 /13	:  1.189327 14.36589 	l:5.271995 	em:0.200000
Batch:	11 /13	:  1.157418 15.52413 	l:5.949439 	em:0.066667
Batch:	12 /13	:  1.614816 17.13949 	l:5.725629 	em:0.177778

Epoch performance:  17.676sec Trl:5.893518 	Trem:0.107692 	Teem:0.000000

Epoch:  86 / 300
Batch:	0 /13	:  1.545009 1.545036 	l:5.632473 	em:0.088889
Batch:	1 /13	:  1.453027 2.998753 	l:5.657427 	em:0.155556
Batch:	2 /13	:  1.312621 4.312324 	l:5.193488 	em:0.111111
Batch:	3 /13	:  1.315919 5.628501 	l:5.504465 	em:0.111111
Batch:	4 /13	:  1.162839 6.791596 	l:5.493328 	em:0.133333
Batch:	5 /13	:  1.249315 8.041601 	l:5.474663 	em:0.222222
Batch:	6 /13	:  1.110724 9.152569 	l:5.695811 	em:0.133333
Batch:	7 /13	:  1.251112 10.40413 	l:5.481245 	em:0.066667
Batch:	8 /13	:  1.168226 11.57331 	l:5.154232 	em:0.133333
Batch:	9 /13	:  1.183150 12.75731 	l:6.009338 	em:0.022222
Batch:	10 /13	:  1.325384 14.08335 	l:5.624886 	em:0.133333
Batch:	11 /13	:  1.182880 15.26688 	l:5.903529 	em:0.155556
Batch:	12 /13	:  1

Batch:	3 /13	:  1.171072 4.882639 	l:4.715835 	em:0.177778
Batch:	4 /13	:  1.361136 6.244310 	l:5.037410 	em:0.244444
Batch:	5 /13	:  1.306435 7.551969 	l:5.342187 	em:0.155556
Batch:	6 /13	:  1.274074 8.826258 	l:4.653421 	em:0.200000
Batch:	7 /13	:  1.164460 9.991318 	l:5.082252 	em:0.266667
Batch:	8 /13	:  1.253479 11.24532 	l:5.206324 	em:0.200000
Batch:	9 /13	:  1.181699 12.42729 	l:4.780197 	em:0.266667
Batch:	10 /13	:  1.256238 13.68444 	l:4.803442 	em:0.288889
Batch:	11 /13	:  1.328116 15.01306 	l:5.983500 	em:0.133333
Batch:	12 /13	:  1.197741 16.21153 	l:5.110669 	em:0.155556

Epoch performance:  16.749sec Trl:5.091225 	Trem:0.215385 	Teem:0.000000

Epoch:  96 / 300
Batch:	0 /13	:  1.177964 1.177985 	l:4.587953 	em:0.288889
Batch:	1 /13	:  1.299376 2.477761 	l:4.565787 	em:0.222222
Batch:	2 /13	:  1.227925 3.706420 	l:5.467566 	em:0.133333
Batch:	3 /13	:  1.144292 4.851366 	l:4.492886 	em:0.355556
Batch:	4 /13	:  1.316185 6.169875 	l:4.705434 	em:0.222222
Batch:	5 /13	:  1.28

Batch:	11 /13	:  1.189896 14.88834 	l:5.529966 	em:0.133333
Batch:	12 /13	:  1.205585 16.09448 	l:4.955268 	em:0.155556

Epoch performance:  16.631sec Trl:4.858304 	Trem:0.200000 	Teem:0.000000

Epoch:  105 / 300
Batch:	0 /13	:  1.311851 1.311866 	l:4.701422 	em:0.266667
Batch:	1 /13	:  1.236974 2.549287 	l:5.242278 	em:0.222222
Batch:	2 /13	:  1.244404 3.794600 	l:4.643709 	em:0.244444
Batch:	3 /13	:  1.207047 5.002568 	l:3.906671 	em:0.333333
Batch:	4 /13	:  1.145272 6.149025 	l:4.561994 	em:0.288889
Batch:	5 /13	:  1.314615 7.464228 	l:5.159281 	em:0.200000
Batch:	6 /13	:  1.166702 8.631150 	l:5.453938 	em:0.177778
Batch:	7 /13	:  1.162693 9.794074 	l:5.307329 	em:0.200000
Batch:	8 /13	:  1.245194 11.03994 	l:3.907757 	em:0.311111
Batch:	9 /13	:  1.252132 12.29281 	l:4.388330 	em:0.244444
Batch:	10 /13	:  1.261307 13.55459 	l:4.710231 	em:0.222222
Batch:	11 /13	:  1.170108 14.72588 	l:4.670554 	em:0.200000
Batch:	12 /13	:  1.197556 15.92490 	l:3.793766 	em:0.222222

Epoch performanc

Batch:	4 /13	:  1.160604 5.969398 	l:5.349213 	em:0.155556
Batch:	5 /13	:  1.254004 7.223613 	l:4.194384 	em:0.311111
Batch:	6 /13	:  1.178080 8.402513 	l:4.104772 	em:0.244444
Batch:	7 /13	:  1.310388 9.713500 	l:4.167998 	em:0.288889
Batch:	8 /13	:  1.322471 11.03622 	l:4.045462 	em:0.355556
Batch:	9 /13	:  1.232204 12.26859 	l:3.029608 	em:0.444444
Batch:	10 /13	:  1.165148 13.43449 	l:3.941019 	em:0.311111
Batch:	11 /13	:  1.172557 14.60768 	l:3.625957 	em:0.355556
Batch:	12 /13	:  1.222868 15.83127 	l:3.988453 	em:0.288889

Epoch performance:  16.368sec Trl:4.152866 	Trem:0.297436 	Teem:0.000000

Epoch:  115 / 300
Batch:	0 /13	:  1.163897 1.163913 	l:4.124322 	em:0.333333
Batch:	1 /13	:  1.283920 2.448961 	l:4.636970 	em:0.266667
Batch:	2 /13	:  1.582226 4.031661 	l:4.827706 	em:0.333333
Batch:	3 /13	:  1.316382 5.348277 	l:4.536469 	em:0.266667
Batch:	4 /13	:  1.186817 6.535356 	l:4.082763 	em:0.266667
Batch:	5 /13	:  1.280141 7.816418 	l:4.152645 	em:0.244444
Batch:	6 /13	:  1.2

Batch:	12 /13	:  1.292616 16.69010 	l:5.035965 	em:0.244444

Epoch performance:  17.229sec Trl:4.038066 	Trem:0.319658 	Teem:0.000000

Epoch:  124 / 300
Batch:	0 /13	:  1.207754 1.207770 	l:4.465027 	em:0.288889
Batch:	1 /13	:  1.245075 2.453475 	l:4.425053 	em:0.333333
Batch:	2 /13	:  1.276560 3.730648 	l:3.605652 	em:0.244444
Batch:	3 /13	:  1.279208 5.010171 	l:4.128536 	em:0.244444
Batch:	4 /13	:  1.172652 6.183339 	l:3.304970 	em:0.355556
Batch:	5 /13	:  1.327805 7.511442 	l:4.046235 	em:0.244444
Batch:	6 /13	:  1.191879 8.703949 	l:4.165912 	em:0.311111
Batch:	7 /13	:  1.250766 9.955603 	l:4.260337 	em:0.311111
Batch:	8 /13	:  1.249651 11.20787 	l:3.402171 	em:0.266667
Batch:	9 /13	:  1.191159 12.40024 	l:4.313380 	em:0.266667
Batch:	10 /13	:  1.265436 13.66617 	l:3.857375 	em:0.333333
Batch:	11 /13	:  1.204451 14.87138 	l:4.341208 	em:0.266667
Batch:	12 /13	:  1.450906 16.32281 	l:4.332821 	em:0.288889

Epoch performance:  16.860sec Trl:4.049898 	Trem:0.288889 	Teem:0.000000

Ep

Batch:	5 /13	:  1.225028 7.410030 	l:3.768864 	em:0.333333
Batch:	6 /13	:  1.189455 8.600041 	l:3.606367 	em:0.266667
Batch:	7 /13	:  1.266394 9.866741 	l:3.637654 	em:0.311111
Batch:	8 /13	:  1.145547 11.01297 	l:3.910313 	em:0.266667
Batch:	9 /13	:  1.180571 12.19449 	l:4.326208 	em:0.288889
Batch:	10 /13	:  1.202994 13.39808 	l:3.938955 	em:0.266667
Batch:	11 /13	:  1.174896 14.57340 	l:3.752193 	em:0.355556
Batch:	12 /13	:  1.224117 15.79812 	l:4.138731 	em:0.288889

Epoch performance:  16.335sec Trl:3.928659 	Trem:0.314530 	Teem:0.000000

Epoch:  134 / 300
Batch:	0 /13	:  1.178073 1.178090 	l:3.476314 	em:0.422222
Batch:	1 /13	:  1.277471 2.456722 	l:4.279392 	em:0.333333
Batch:	2 /13	:  1.289844 3.747673 	l:4.260713 	em:0.355556
Batch:	3 /13	:  1.244493 4.992980 	l:4.241673 	em:0.377778
Batch:	4 /13	:  1.233747 6.227230 	l:3.318808 	em:0.377778
Batch:	5 /13	:  1.280781 7.509170 	l:3.320836 	em:0.244444
Batch:	6 /13	:  1.330099 8.839671 	l:3.321091 	em:0.355556
Batch:	7 /13	:  1.1


Epoch performance:  17.179sec Trl:3.761216 	Trem:0.340171 	Teem:0.000000

Epoch:  143 / 300
Batch:	0 /13	:  1.284596 1.284622 	l:4.031110 	em:0.333333
Batch:	1 /13	:  1.259681 2.545379 	l:3.617400 	em:0.333333
Batch:	2 /13	:  1.209380 3.755601 	l:4.355881 	em:0.333333
Batch:	3 /13	:  1.236431 4.993077 	l:2.478930 	em:0.577778
Batch:	4 /13	:  1.200852 6.194401 	l:3.868932 	em:0.311111
Batch:	5 /13	:  1.205169 7.400103 	l:4.042234 	em:0.333333
Batch:	6 /13	:  1.309128 8.710102 	l:3.524354 	em:0.444444
Batch:	7 /13	:  1.233728 9.944656 	l:3.589455 	em:0.444444
Batch:	8 /13	:  1.174327 11.11987 	l:3.227040 	em:0.444444
Batch:	9 /13	:  1.216033 12.33613 	l:2.966758 	em:0.377778
Batch:	10 /13	:  1.362843 13.69991 	l:3.297104 	em:0.422222
Batch:	11 /13	:  1.305873 15.02625 	l:3.862188 	em:0.355556
Batch:	12 /13	:  1.294677 16.32157 	l:4.005174 	em:0.333333

Epoch performance:  16.861sec Trl:3.605120 	Trem:0.388034 	Teem:0.000000

Epoch:  144 / 300
Batch:	0 /13	:  1.336136 1.336153 	l:3.68391

Batch:	6 /13	:  1.169108 9.056172 	l:3.430406 	em:0.311111
Batch:	7 /13	:  1.145946 10.20289 	l:3.185239 	em:0.288889
Batch:	8 /13	:  1.217787 11.42167 	l:2.815740 	em:0.466667
Batch:	9 /13	:  1.191339 12.61394 	l:2.950340 	em:0.466667
Batch:	10 /13	:  1.266872 13.88143 	l:4.225254 	em:0.266667
Batch:	11 /13	:  1.275022 15.15707 	l:3.093291 	em:0.466667
Batch:	12 /13	:  1.315347 16.47270 	l:3.026332 	em:0.533333

Epoch performance:  17.008sec Trl:3.302817 	Trem:0.415385 	Teem:0.000000

Epoch:  153 / 300
Batch:	0 /13	:  1.237254 1.237272 	l:2.924236 	em:0.444444
Batch:	1 /13	:  1.263589 2.501828 	l:3.592330 	em:0.400000
Batch:	2 /13	:  1.208928 3.710993 	l:2.490006 	em:0.444444
Batch:	3 /13	:  1.312824 5.025115 	l:3.585414 	em:0.355556
Batch:	4 /13	:  1.180083 6.205634 	l:3.538699 	em:0.377778
Batch:	5 /13	:  1.230077 7.436008 	l:2.042181 	em:0.511111
Batch:	6 /13	:  1.203047 8.639847 	l:3.441142 	em:0.377778
Batch:	7 /13	:  1.219883 9.860626 	l:2.974392 	em:0.400000
Batch:	8 /13	:  1.2


Epoch performance:  16.784sec Trl:3.250296 	Trem:0.406838 	Teem:0.000000

Epoch:  162 / 300
Batch:	0 /13	:  1.541599 1.541617 	l:2.677903 	em:0.444444
Batch:	1 /13	:  1.262814 2.805188 	l:3.035177 	em:0.400000
Batch:	2 /13	:  1.250750 4.056433 	l:3.386211 	em:0.377778
Batch:	3 /13	:  1.144889 5.202807 	l:3.338682 	em:0.311111
Batch:	4 /13	:  1.169532 6.372839 	l:3.074247 	em:0.422222
Batch:	5 /13	:  1.208364 7.581678 	l:2.802576 	em:0.488889
Batch:	6 /13	:  1.185503 8.779905 	l:3.233363 	em:0.400000
Batch:	7 /13	:  1.251642 10.03255 	l:3.366601 	em:0.511111
Batch:	8 /13	:  1.306156 11.33955 	l:3.358422 	em:0.400000
Batch:	9 /13	:  1.258062 12.59815 	l:3.341055 	em:0.355556
Batch:	10 /13	:  1.184144 13.78272 	l:2.589741 	em:0.466667
Batch:	11 /13	:  1.210204 14.99320 	l:2.833583 	em:0.333333
Batch:	12 /13	:  1.339417 16.33281 	l:3.167778 	em:0.355556

Epoch performance:  16.869sec Trl:3.092718 	Trem:0.405128 	Teem:0.000000

Epoch:  163 / 300
Batch:	0 /13	:  1.345446 1.345461 	l:3.51681

Batch:	6 /13	:  1.203603 8.860396 	l:2.900811 	em:0.488889
Batch:	7 /13	:  1.290712 10.15155 	l:3.157223 	em:0.444444
Batch:	8 /13	:  1.280004 11.43270 	l:3.017657 	em:0.377778
Batch:	9 /13	:  1.197978 12.63176 	l:2.393865 	em:0.466667
Batch:	10 /13	:  1.223631 13.85664 	l:3.680150 	em:0.377778
Batch:	11 /13	:  1.215806 15.07301 	l:2.962487 	em:0.466667
Batch:	12 /13	:  1.291309 16.36517 	l:2.154743 	em:0.533333

Epoch performance:  16.905sec Trl:2.976336 	Trem:0.435897 	Teem:0.000000

Epoch:  172 / 300
Batch:	0 /13	:  1.220772 1.220790 	l:2.996270 	em:0.333333
Batch:	1 /13	:  1.201365 2.422387 	l:2.422991 	em:0.533333
Batch:	2 /13	:  1.159075 3.582309 	l:3.068401 	em:0.400000
Batch:	3 /13	:  1.282635 4.865514 	l:2.139909 	em:0.488889
Batch:	4 /13	:  1.207340 6.073531 	l:2.909626 	em:0.533333
Batch:	5 /13	:  1.245405 7.319589 	l:3.402286 	em:0.466667
Batch:	6 /13	:  1.188973 8.509588 	l:2.176204 	em:0.466667
Batch:	7 /13	:  1.262614 9.772964 	l:3.248294 	em:0.400000
Batch:	8 /13	:  1.2

#### Visualizations

So far, we plot the training loss.
Shall we superimpose test loss on it too? We don't calculate test loss per batch though (fortunately).

In [None]:
# Visualizations
# op is (train_losses, train_em, test_losses, test_em) as returned by
# training_loop. A series is empty when its evaluation was switched off
# (or training stopped before the first epoch finished), so skip those
# instead of crashing inside the plotting helper.
for _title, _series, _label in (
        ("Training Loss", op[0], "train loss"),
        ("Training EM", op[1], "train em"),
        ("Testing EM", op[3], "test em")):
    print(_title)
    if len(_series) > 0:
        visualize_loss(_series, _label, _only_epoch=True)
    else:
        print("\t(nothing recorded)")


# print(op[1])


In [None]:
# # Testing (temp)
# models = { 'ques_model': ques_model,
#            'para_model': para_model,
#            'mlstm_model':  mlstm_model,
#            'pointer_decoder_model': pointer_decoder_model
#          }
# save_model(loc=macros['save_model_loc'], models=models, epochs=0)

In [None]:
# Reload the four saved sub-models from the model directory and print each
# of them as a sanity check.
_model_dir = macros['save_model_loc']

ques_model = torch.load(os.path.join(_model_dir, 'ques_model.torch'))
print("Ques Model\n", ques_model)

para_model = torch.load(os.path.join(_model_dir, 'para_model.torch'))
print("Para Model\n", para_model)

mlstm_model = torch.load(os.path.join(_model_dir, 'mlstm_model.torch'))
print("MLSTM Model\n", mlstm_model)

pointer_decoder_model = torch.load(os.path.join(_model_dir, 'pointer_decoder_model.torch'))
print("Pointer Decoder model\n", pointer_decoder_model)

# Dummy inputs for exercising predict(): 3 random questions of 30 token ids
# and 3 random paragraphs of 200 token ids, ids drawn from [0, len(vectors)).
# NOTE(review): confirm predict() accepts a batch of 3 rather than the
# configured batch size.
q = np.random.randint(0, len(vectors), (3, 30))
p = np.random.randint(0, len(vectors), (3, 200))

_para_ids = torch.tensor(p, dtype=torch.long, device=device)
_ques_ids = torch.tensor(q, dtype=torch.long, device=device)

y_cap_start, y_cap_end, _ = predict(
    _para_ids,
    _ques_ids,
    ques_model=ques_model,
    para_model=para_model,
    mlstm_model=mlstm_model,
    pointer_decoder_model=pointer_decoder_model,
    macros=macros,
    debug=macros['debug'],
)

In [None]:
# Predicted start position of every dummy example (argmax over dim 1).
y_cap_start.squeeze().argmax(dim=1)