In [26]:
import json
import random
import numpy as np
from collections import Counter
import pickle as pickle
import scipy.stats
import time

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

## Load Data and Embeddings

In [46]:
unwanted_chars = ['\\','.',',','/','\'s']
start = ['<null>']

label_dict = {'neutral':0,'contradiction':1,'entailment':2}

def load_data(path): #load SNLI words
    '''
    Constructs 4 dictionaries with the same key values across the dictionaries
    '''
    #data = []
    excluded = 0
    hypothesis = {}
    premise = {}
    label = {}
    label_enc = {}
    with open(path, 'r') as f:
        #need this so indexing is continuous when sentences are skipped over
        idx = 0
        for i,line in enumerate(f):
            obj = json.loads(line)
            #skip these rows per readme
            if obj["gold_label"] == '-':
                excluded += 1
            else:
                label[idx] = obj["gold_label"]
                label_enc[idx] = label_dict[obj["gold_label"]]
                premise[idx] = obj["sentence1"]
                hypothesis[idx] = obj["sentence2"]
                idx += 1
    print('%s excluded' %excluded)
    return hypothesis, premise, label, label_enc

def load_embeddings(path,words_to_load,emb_dim): #load pre-trained GloVe embeddings
    with open(path) as f:
        loaded_embeddings = np.zeros((words_to_load, emb_dim))
        words = {}
        idx2words = {}
        ordered_words = []
        for i, line in enumerate(f):
            if i >= words_to_load: 
                break
            s = line.split()
            loaded_embeddings[i, :] = np.asarray(s[1:])
            words[s[0]] = i
            idx2words[i] = s[0]
            ordered_words.append(s[0])

    return loaded_embeddings, words, idx2words, ordered_words
            
def add_tokens(idx_mapping, embeddings, emb_dim):
    '''
    This function increases the index of the word to index mapping for GloVe so that
    0: padding index
    1: unk
    2: BoS
    '''
    words_cnt = Counter(idx_mapping)
    increment = Counter(dict.fromkeys(words, 3))
    words_cnt = words_cnt + increment
    words_cnt['<PAD_IDX>'] = 0
    words_cnt['<UNK>'] = 1
    words_cnt['<BoS>'] = 2
    
    #insert embeddings for tokens
    '''
    TO DO: FIX INITILIZATION
    '''
    #<BoS>
    print(embeddings.shape)
    embed = np.insert(embeddings,[0],np.random.rand(300),axis=0)
    print(embed.shape)
    #<UNK>
    embed = np.insert(embed,[0],np.random.rand(300),axis=0)
    print(embed.shape)
    #<PAD_IDX>
    embed = np.insert(embed,[0],np.zeros(300),axis=0)
    print(embed.shape)
    
    return words_cnt, embed

def clean_words(text_list): # Removes characters and makes all words lowercase
    for i,word in enumerate(text_list):
        for ch in unwanted_chars:
            if ch in text_list[i]:
                text_list[i] = text_list[i].replace(ch,'')
            text_list[i] = text_list[i].lower()

def tokenize(text_dict, idx_mapping, pad_len):
    '''
    text_dict: dictionary with index as key, sentence as value
    returns dictionary with the index as key, sentenece mapped to index as value, and padded to pad_len
    
    QUESTION: How should we choose pad_len?  Should we truncate or should we set to the max length of 
    premise and hypothesis?
    '''
    tokenized_data = {}
    for i in range(len(text_dict.keys())):
        text_list = text_dict[i].split()
        clean_words(text_list)
        text_idx = []
        for word in text_list:
            try:
                text_idx.append(idx_mapping[word])
            except KeyError:
                #UNK token
                text_idx.append(1)
                continue
        #insert BoS token
        text_idx.insert(0,2)
        if len(text_idx) > pad_len:
            text_idx = text_idx[:pad_len]
        text_idx = np.concatenate((text_idx,np.zeros(max(pad_len-len(text_idx),0))))
                                    
        tokenized_data[i] = np.array(text_idx).astype(int)
    return tokenized_data

In [None]:
#for lisa's laptop
vocab_size = 50000
emb_dim = 300
num_classes = 3
learning_rate = .05
path = '/Users/Lisa/Documents/Grad School/DS-GA 1101/data/'
glove_path = path+'glove.6B/glove.6B.300d.txt'
text_path = path+'snli_1.0/snli_1.0_train.jsonl'

In [248]:
vocab_size = 50000
emb_dim = 300
num_classes = 3
#do we use learning rate anywhere?
learning_rate = .05
glove_path = 'glove/glove.6B.300d.txt'
text_path = 'snli_1.0/snli_1.0_train.jsonl'

In [32]:
hypothesis, premise, label, label_enc = load_data(text_path)

785 excluded


In [50]:
embeddings, words, idx2words, ordered_words = load_embeddings(glove_path, vocab_size, emb_dim)

In [51]:
#modifies embeddings, words, idx2words in place to add tokens
words, embeddings = add_tokens(words, embeddings, emb_dim)
idx2words = {v:k for k,v in words.items()}

(50000, 300)
(50001, 300)
(50002, 300)
(50003, 300)


In [52]:
h_len = 10
p_len = 15
h_idx = tokenize(hypothesis, words, h_len)
p_idx = tokenize(premise, words, p_len)

In [55]:
fp = path+'pickles/'
with open(fp+'h_idx.pt', 'wb') as f:
    pickle.dump(h_idx, f)
with open(fp+'p_idx.pt', 'wb') as f:
    pickle.dump(p_idx, f)

## Create Batches

In [56]:
#sample indices
idx_list = np.array((0,1,2,3))

In [57]:
h_test_batch = torch.LongTensor(4, h_len)
p_test_batch = torch.LongTensor(4, p_len)
l_test_batch = torch.LongTensor(4,1)

In [58]:
#test batch
for i in idx_list:
    h_test_batch[i] = torch.from_numpy(h_idx[i])
    p_test_batch[i] = torch.from_numpy(p_idx[i])
    l_test_batch[i] = label_enc[i]
h_test_batch = h_test_batch.long()
p_test_batch = p_test_batch.long()
l_test_batch = Variable(l_test_batch)

In [855]:
class DecomposableAttention(nn.Module):
    '''
    Starting with premise (a), we see if the hypothesis (b) is an 
    entailment, a contradiction, or neutral.
    '''
    def __init__(self, glove_emb, batch_size, hidden_size, h_len, p_len, num_classes, dropout=0.2):
        super(DecomposableAttention, self).__init__()
        self.glove = glove_emb
        self.num_embeddings = glove_emb.shape[0]
        self.embedding_dim = glove_emb.shape[1]
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.h_len = h_len
        self.p_len = p_len
        self.num_classes = num_classes
        self.dropout = dropout
        
        self.embed = nn.Embedding(self.num_embeddings, self.embedding_dim)
        
        '''
        MLP LAYERS
        '''
        self.mlp_f = self._mlp_layers(self.hidden_size, self.hidden_size)
        self.mlp_g = self._mlp_layers(2 * self.hidden_size, self.hidden_size)
        self.mlp_h = self._mlp_layers(2 * self.hidden_size, self.hidden_size)
        
        '''
        Linear Layers
        '''
        self.project_h = nn.Linear(self.embedding_dim,self.hidden_size)
        self.project_p = nn.Linear(self.embedding_dim,self.hidden_size)
        self.final_linear = nn.Linear(self.hidden_size,self.num_classes)
        self.init_weights()
    
    def _mlp_layers(self, input_dim, output_dim):
        mlp_layers = []
        mlp_layers.append(nn.Dropout(p=self.dropout))
        mlp_layers.append(nn.Linear(
            input_dim, output_dim, bias=True))
        mlp_layers.append(nn.ReLU())
        mlp_layers.append(nn.Dropout(p=self.dropout))
        mlp_layers.append(nn.Linear(
            output_dim, output_dim, bias=True))
        mlp_layers.append(nn.ReLU())   
        #sequential runs all the layers in order
        return nn.Sequential(*mlp_layers)  
    
    def forward(self, hypothesis, premise, label):
        start_time = time.time()
        
        '''
        Get padding masks
        Need to be LongTensors to avoid overflow with byte tensors
        '''
        h_mask = (hypothesis!=0).long()
        p_mask = (premise!=0).long()
        
        '''
        Embedding layer (only projection layer is trained)
        max length = max length of of hypothesis/premise (respectively) in batch
        Input dim: batch size x max length
        Output dim: batch size x max length x hidden dimensions
        '''
        p_embedded = self.embed(Variable(premise))
        h_embedded = self.embed(Variable(hypothesis))
        #project from embedding dim to hidden dim
        p_projected = self.project_p(p_embedded.view(-1,self.embedding_dim))\
                                         .view(self.batch_size,-1,self.hidden_size)
        h_projected = self.project_h(h_embedded.view(-1,self.embedding_dim))\
                                         .view(self.batch_size,-1,self.hidden_size)
        
        '''
        First Feed Forward Network (F)
        max length = max length of of hypothesis/premise (respectively) in batch
        Input dim: batch size x max length x hidden dimensions
        Output dim: batch size x max length x hidden dimension
        '''
        
        '''
        NEW MULTILAYER PERCEPTRON
        '''
        F_a = self.mlp_f(p_projected.view(-1, self.hidden_size)).view(self.batch_size,-1,self.hidden_size)
        F_b = self.mlp_f(h_projected.view(-1, self.hidden_size)).view(self.batch_size,-1,self.hidden_size)
        
        
        #E dim: batch_size x max len of hypothesis x max len of premise
        #transpose function swaps second and third axis so that F_b is batch size x hidden dim x len premise
        E = torch.matmul(F_a,torch.transpose(F_b,1,2))  
        
        '''
        Attention! 
        Given E, we reweight using the softmax and store in W_beta, W_alpha
        W_beta dim: batch_size x len(premise) x hidden dim
        W_alpha dim: batch_size x len(hypothesis) x hidden dim
        
        OLD:
        W_beta = Variable(torch.Tensor(self.batch_size,self.p_len,self.hidden_size))
        W_alpha = Variable(torch.Tensor(self.batch_size,self.h_len,self.hidden_size))
        for i in range(self.batch_size):
            for j in range(F_b.size()[1]):
                W_beta[i,j] = torch.mm(F.softmax(E[i,j]).view(1,-1),h_projected[i]).data
            for k in range(F_a.size()[1]):
                W_alpha[i,j] = torch.mm(F.softmax(E[i,:,j]).view(1,-1),p_projected[i]).data
        
        '''
        #p_mask is batch_size x p_len
        mask_a = p_mask.unsqueeze(1) #unsqueeze makes it batch_size x 1 x p_len
        mask_a = mask_a.expand_as(E.data.transpose(1, 2)).float() #expands it to (batch_size*h_len)x p_len
        mask_a = Variable(mask_a.view(-1, self.p_len))
        #mask_a.requires_grad = False
        
        mask_b = h_mask.unsqueeze(1) #unsqueeze makes it batch_size x 1 x h_len
        mask_b = mask_b.expand_as(E.data).float()  #expands it to (batch_size*p_len)x h_len
        mask_b = Variable(mask_b.view(-1, self.h_len))
        #mask_b.requires_grad = False
        
        #alpha is softmax over premise
        #dim: batch_size x h_len x p_len
        softmax_alpha = F.softmax(E.transpose(1, 2).contiguous().\
                                  view(-1, E.data.transpose(1, 2).size()[-1]))*mask_a
        #the +1e-13 is from allennlp. something about limiting numerical errors
        softmax_alpha = softmax_alpha / (softmax_alpha.sum(dim=1, keepdim=True) + 1e-13)
        softmax_alpha = softmax_alpha.view(E.data.transpose(1, 2).contiguous().size())
        
        #beta is softmax over the hypothesis
        #dim: batch_size x p_len x h_len
        softmax_beta = F.softmax(E.view(-1, E.data.size()[-1]))*mask_b
        softmax_beta = softmax_beta / (softmax_beta.sum(dim=1, keepdim=True) + 1e-13)
        softmax_beta = softmax_beta.view(E.data.size())
        
        
        '''
        softmax_beta is batch_size x p_len x h_len
        h_projected is batch size x h_len x hidden dimensions
        so W_beta is batch_size x p_len x hidden dim
        
        
        softmax_alpha is batch_size x h_len x p_len
        p_projected is batch size x p_len x hidden dimensions
        so W_alpha is batch size x h_len x hidden dime
        
        '''
        W_beta = torch.bmm(softmax_beta,h_projected)
        W_alpha = torch.bmm(softmax_alpha,p_projected)
        
        
        '''
        Compare
        Open items:
        1) Check that we're concatenating along the right dimensions.  Based on AllenNLP and libowen, 
            concatenated input should be batch size x len(hypothesis/premise) x (2 * embedding dim)
        
        Output:
        v1 dim: batch_size x len(hypothesis) x compare_dim
        v2 dim: batch_size x len(premise) x compare_dim
        '''
        #dim: batch size x len(hypotheis/premise) x (2* hidden dim)
        cat_p_beta = torch.cat((p_projected,W_beta),2)
        cat_h_alpha = torch.cat((h_projected,W_alpha),2)
        
        '''
        MLP with masking
        '''
        v_a = self.mlp_g(cat_p_beta.view(-1, 2*self.hidden_size))
        v_a = v_a.data*p_mask.view(-1).unsqueeze(1).expand_as(v_a).float()
        v_a = v_a.view(self.batch_size,-1,self.hidden_size)
        
        v_b = self.mlp_g(cat_h_alpha.view(-1, 2*self.hidden_size))
        v_b = v_b.data*h_mask.view(-1).unsqueeze(1).expand_as(v_b).float()
        v_b = v_b.view(self.batch_size,-1,self.hidden_size)
        '''
        Aggregate
        Given:
        v_a = output of relu activation on the concatenation of a (premise) and beta
        v_b = output of relu activation on the concatenation of b (hypothesis) and alpha
        '''
        v1 = torch.sum(v_a, dim=1)
        v2 = torch.sum(v_b, dim=1)
        
        H = self.mlp_h(torch.cat((v1,v2),1))
        H = self.final_linear(H)
        out = F.softmax(H)
        
        return out
    
    
    def init_weights(self):
        self.embed.weight.data.copy_(torch.from_numpy(self.glove))
        #does not train embedded weights
        self.embed.weight.requires_grad = False

In [854]:
'''TESTING FOR SINGLE BATCH'''
'''This should go in the training loop once our batch iterator is working:'''
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, da.parameters()), lr=learning_rate)
da = DecomposableAttention(embeddings,batch_size,200,\
                           h_len,p_len,num_classes,dropout=dropout_rate)
optimizer.zero_grad()
out= da(h_test_batch,p_test_batch,l_test_batch)
loss = criterion(out,l_test_batch.view(-1))
loss.backward()
optimizer.step()

In [None]:
def batch_iter(dataset_size, hypothesis, premise, label_enc, batch_size, hLen, pLen):  
    start        = -1 * batch_size
    order        = list(range(dataset_size))
    random.shuffle(order)

    while True:
        start     += batch_size
        if start > dataset_size - batch_size:
            # Start another epoch.
            start = 0
            random.shuffle(order)

        hBatch = torch.LongTensor(batch_size, hLen)
        pBatch = torch.LongTensor(batch_size, pLen)
        lBatch = torch.LongTensor(batch_size, 1)

        idx_list = order[start:start + batch_size]
        i = 0
        for idx in idx_list:
            hBatch[i] = torch.from_numpy(hypothesis[idx])
            pBatch[i] = torch.from_numpy(premise[idx])
            lBatch[i] = label_enc[idx]
            i += 1
            
        hBatch = hBatch.long()
        pBatch = pBatch.long()
        lBatch = Variable(lBatch)

        yield [hBatch, pBatch, lBatch]

In [857]:
def training_loop(dataset_size, batch_size, num_epochs, model, data_iter, optimizer, criterion):
    step = 0
    epoch = 0
    losses = []
    total_batches = int(dataset_size / batch_size)
    start_time = time.time()
    while epoch <= num_epochs:
        hypothesis, premise, label = next(data_iter) 
        #start_time = time.time()
        optimizer.zero_grad()
        output = model(hypothesis, premise, label)
        loss = criterion(output, label.view(-1))
        
        losses.append(loss.data[0])
        loss.backward()
        optimizer.step()
        print('loss: ',loss.data)

        if step % total_batches == 0:
            epoch += 1
            if epoch % 25 == 0:
                print( "Epoch:", (epoch), "Avg Loss:", np.mean(losses)/(total_batches*epoch), \
                      "Elapsed Time: ", (start_time - time.time()))
                start_time = time.time()
        step += 1
        #print("runtime for single batch: %s seconds" % (time.time() - start_time))


In [858]:
num_classes = 3
dropout_rate = .2
batch_size = 4
hidden_size = 200
h_len = 10
p_len = 15
num_epochs  = 10
learning_rate = .05
da = DecomposableAttention(embeddings,batch_size,hidden_size,\
                           h_len,p_len,num_classes,dropout=dropout_rate)

In [859]:
#filters out embedding layer which is not tuned
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, da.parameters()), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [860]:
dataset_size = len(hypothesis)
data_iter = batch_iter(dataset_size, h_idx, p_idx, label_enc, batch_size, h_len, p_len)

In [861]:
training_loop(dataset_size, batch_size, num_epochs, da, data_iter, optimizer, criterion)

loss:  
 1.0894
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.2137
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 0.8014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 0.8014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.05

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 0.8014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.30

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 0.8014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.55

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.05

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 0.80

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.05

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 0.8014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.05

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 0.8014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 0.8014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 0.8014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.05

loss:  
 0.8014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 0.8014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.30

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 1.3014
[torch.FloatTensor of size 1]

loss:  
 0.8014
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.5514
[torch.FloatTensor of size 1]

loss:  
 1.0514
[torch.FloatTensor of size 1]

loss:  
 1.05

KeyboardInterrupt: 

In [271]:
print('approximate minutes for a single epoch: ',(dataset_size/4*0.03537607192993164/60))

approximate minutes for a single epoch:  80.97686044971148
