In [1]:
import torch 
import math 
import numpy as np
import copy
from torch import nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import ast 
from numpy import load
import torch.nn as nn
import random
import time
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [2]:
import collections
#import helper
import numpy as np
#import project_tests as tests
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from tensorflow.keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [3]:
import torch.nn as nn

class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
    def forward(self, x):
        return self.embed(x)

In [4]:
class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_seq_len = 120):
        super().__init__()
        self.d_model = d_model
        
        # create constant 'pe' matrix with values dependant on 
        # pos and i
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
                
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
 
    
    def forward(self, x):
        # make embeddings relatively larger
        x = x * math.sqrt(self.d_model)
        #add constant to embedding
        seq_len = x.size(1)
        x = x + Variable(self.pe[:,:seq_len], \
        requires_grad=False).cuda(1)
        return x

In [5]:
import torch
from   core_qnn.quaternion_layers       import QuaternionLinearAutograd

In [6]:
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()
        
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        
        self.q_linear = QuaternionLinearAutograd(d_model, d_model)
        self.v_linear = QuaternionLinearAutograd(d_model, d_model)
        self.k_linear = QuaternionLinearAutograd(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = QuaternionLinearAutograd(d_model, d_model)
    
    def forward(self, q, k, v, mask=None):
        
        bs = q.size(0)
        
        # perform linear operation and split into h heads
        
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
        
        # transpose to get dimensions bs * h * sl * d_model
       
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)
        # calculate attention using function we will define next
        
        #print(k.shape,q.shape,v.shape)
        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        
        # concatenate heads and put through final linear layer
        concat = scores.transpose(1,2).contiguous()\
        .view(bs, -1, self.d_model)
        
        output = self.out(concat)
    
        return output

In [7]:
def quarternion_multiplication(a, b, transpose=True):
    """ Performs hamilton product between two quarternion sequences.
    a = (r,x,y,z)
    b = (r',x',y',z')
    following:
    (rr' - xx' - yy' - zz')  +
    (rx' + xr' + yz' - zy')i +
    (ry' - xz' + yr' + zx')j +
    (rz' + xy' - yx' + zr')k
    """


    ar, ax, ay, az = torch.chunk(a, chunks=4, dim=-1)
    br, bx, by, bz = torch.chunk(b, chunks=4, dim=-1)
    #print(ar.shape)
    #print(br.shape)

    if transpose==True:
        if len(br.shape)>2:
            #r = torch.matmul(br.transpose(-2,-1),ar) - torch.matmul(bx.transpose(-2,-1),ax) - torch.matmul(by.transpose(-2,-1),ay) - torch.matmul(bz.transpose(-2,-1),az)
            #i = torch.matmul(bx.transpose(-2,-1),ar) + torch.matmul(br.transpose(-2,-1),ax) + torch.matmul(bz.transpose(-2,-1),ay) - torch.matmul(by.transpose(-2,-1),az)
            #j = torch.matmul(by.transpose(-2,-1),ar) - torch.matmul(bz.transpose(-2,-1),ax) + torch.matmul(br.transpose(-2,-1),ay) + torch.matmul(bx.transpose(-2,-1),az)
            #k = torch.matmul(bz.transpose(-2,-1),ar) + torch.matmul(by.transpose(-2,-1),ax) - torch.matmul(bx.transpose(-2,-1),ay) + torch.matmul(br.transpose(-2,-1),az)
        
            r = torch.matmul(ar,br.transpose(-2,-1)) - torch.matmul(ax,bx.transpose(-2,-1)) - torch.matmul(ay,by.transpose(-2,-1)) - torch.matmul(az,bz.transpose(-2,-1))
            i = torch.matmul(ar,bx.transpose(-2,-1)) + torch.matmul(ax,br.transpose(-2,-1)) + torch.matmul(ay,bz.transpose(-2,-1)) - torch.matmul(az,by.transpose(-2,-1))
            j = torch.matmul(ar,by.transpose(-2,-1)) - torch.matmul(ax,bz.transpose(-2,-1)) + torch.matmul(ay,br.transpose(-2,-1)) + torch.matmul(az,bx.transpose(-2,-1))
            k = torch.matmul(ar,bz.transpose(-2,-1)) + torch.matmul(ax,by.transpose(-2,-1)) - torch.matmul(ay,bx.transpose(-2,-1)) + torch.matmul(az,br.transpose(-2,-1))
        
            
        else:
            r = torch.matmul(ar, br.t()) - torch.matmul(ax, bx.t()) - torch.matmul(ay, by.t()) - torch.matmul(az, bz.t())
            i = torch.matmul(ar, bx.t()) + torch.matmul(ax, br.t()) + torch.matmul(ay, bz.t()) - torch.matmul(az, by.t())
            j = torch.matmul(ar, by.t()) - torch.matmul(ax, bz.t()) + torch.matmul(ay, br.t()) + torch.matmul(az, bx.t())
            k = torch.matmul(ar, bz.t()) + torch.matmul(ax, by.t()) - torch.matmul(ay, bx.t()) + torch.matmul(az, br.t())
    else:
        r = torch.matmul(ar,br) - torch.matmul(ax,bx) - torch.matmul(ay,by) - torch.matmul(az,bz)
        i = torch.matmul(ar,bx) + torch.matmul(ax,br) + torch.matmul(ay,bz) - torch.matmul(az,by)
        j = torch.matmul(ar,by) - torch.matmul(ax,bz) + torch.matmul(ay,br) + torch.matmul(az,bx)
        k = torch.matmul(ar,bz) + torch.matmul(ax,by) - torch.matmul(ay,bx) + torch.matmul(az,br)
        
    return [r, i, j, k]

In [8]:
def attention(q,k, v, d_k, mask=None, dropout=None):
    [scores_r,scores_i,scores_j,scores_k] = [x/math.sqrt(d_k) for x in quarternion_multiplication(q,k)]
    
    if mask is not None:
        #print("mask",mask)
        mask = mask.unsqueeze(1)
        
        #print(scores_r.shape)
        scores_r = scores_r.masked_fill(mask == 0, -1e9)
        scores_r = F.softmax(scores_r, dim=-1)
        scores_i = scores_i.masked_fill(mask == 0, -1e9)
        scores_i = F.softmax(scores_i, dim=-1)
        scores_j = scores_j.masked_fill(mask == 0, -1e9)
        scores_j = F.softmax(scores_j, dim=-1)
        scores_k = scores_k.masked_fill(mask == 0, -1e9)
        scores_k = F.softmax(scores_k, dim=-1)
    
    if dropout is not None:
        scores_r = dropout(scores_r)
        scores_i = dropout(scores_i)
        scores_j = dropout(scores_j)
        scores_k = dropout(scores_k)
    
    scores = torch.cat([scores_r,scores_i,scores_j,scores_k],dim=-1)
    #print(scores.shape)
    #print(v.shape)
    output = quarternion_multiplication(scores,v,transpose=False)
    output = torch.cat(output, dim=-1)
    return output

In [12]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__() 
        # We set d_ff as a default to 2048
        self.linear_1 = QuaternionLinearAutograd(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = QuaternionLinearAutograd(d_ff, d_model)
    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

In [13]:
def QNorm(x,eps):
    r,i,j,k = torch.chunk(x, chunks=4, dim=-1)
    qnorm = torch.sqrt(r*r + i*i + j*j + k*k + eps)
    r = r/qnorm
    i = i/qnorm
    j = j/qnorm
    k = k/qnorm
        
    return [r,i,j,k]

In [14]:
class Norm(nn.Module):
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()
    
        self.size = d_model//4
        # create two learnable parameters to calibrate normalisation
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps
    def forward(self, x):
        
        [r,i,j,k] = QNorm(x,self.eps)

        norm_r = self.alpha *r + self.bias
        norm_i = self.alpha *i + self.bias
        norm_j = self.alpha *j + self.bias
        norm_k = self.alpha *k + self.bias
        norm = torch.cat([norm_r,norm_i,norm_j,norm_k],dim=-1)

        return norm
    
    

In [15]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        x2 = self.norm_1(x)
        
        x = x + self.dropout_1(self.attn(x2,x2,x2,mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x
    
# build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)
        
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        
        self.attn_1 = MultiHeadAttention(heads, d_model)
        self.attn_2 = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model).cuda(1)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs,
        src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x
        # We can then build a convenient cloning function that can generate multiple layers:
def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

In [16]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(EncoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)
    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(N):
            x = self.layers[i](x, mask)
        return self.norm(x)
    
class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(DecoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)
    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)

In [17]:
class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads)
        self.decoder = Decoder(trg_vocab, d_model, N, heads)
        self.out = QuaternionLinearAutograd(d_model, trg_vocab)
    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        output = self.out(d_output)
        return output

In [None]:
"""Prepare the data"""

In [18]:
import pandas as pd 

#test_data=open("/media/data6TB/deepak/NalinIdea/deu-eng/deu.txt")
#daat=[]
#for l in test_data.readlines():
#    daat.append(l.strip().split("\t")[:2])

#test_data.close()

#rest_df = pd.DataFrame(daat[:100000], columns = ['English', 'German'])
#rest_df.to_csv("/media/data6TB/deepak/NalinIdea/deu-eng/new_train_1.csv", index=False)
#test_df = pd.DataFrame(daat[100000:], columns = ['English', 'German'])

import spacy
import torchtext
from torchtext.legacy.data import Field, BucketIterator, TabularDataset

en = spacy.load('en')
de = spacy.load('de')
def tokenize_en(sentence):
    return [tok.text for tok in en.tokenizer(sentence)]
def tokenize_de(sentence):
    return [tok.text for tok in de.tokenizer(sentence)]
EN_TEXT = Field(tokenize=tokenize_en, init_token = "<sos>", eos_token = "<eos>")
DE_TEXT = Field(tokenize=tokenize_de, init_token = "<sos>", eos_token = "<eos>")

from sklearn.model_selection import train_test_split

# create train and validation set 
#train, val = train_test_split(test_df, test_size=0.01)
#train.to_csv("/media/data6TB/deepak/NalinIdea/deu-eng/new_train_2.csv", index=False)
#val.to_csv("/media/data6TB/deepak/NalinIdea/deu-eng/new_val.csv", index=False)

from torchtext.legacy import data
data_fields = [('English', EN_TEXT), ('German', DE_TEXT)]
train,val = data.TabularDataset.splits(path='/media/data6TB/deepak/NalinIdea/deu-eng/', train='new_train_2.csv', validation='new_val.csv', format='csv', fields=data_fields)

DE_TEXT.build_vocab(train, val)
EN_TEXT.build_vocab(train, val)

seq_len=0
for x in train.German:
    if len(x)>seq_len:
        seq_len = len(x)
print(seq_len)


global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch,  len(new.English))
    max_tgt_in_batch = max(max_tgt_in_batch,  len(new.German))
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)
class MyIterator(data.Iterator):
    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
            
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                          self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))

train_iter = MyIterator(train, batch_size=64, device=1,
                        repeat=False, sort_key= lambda x:
                        (len(x.English), len(x.German)),
                        batch_size_fn=batch_size_fn, train=True,
                        shuffle=True)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


85


In [19]:
batch = next(iter(train_iter))
input_seq = batch.English.transpose(0,1)
input_pad = EN_TEXT.vocab.stoi['<pad>']
# creates mask with 0s wherever there is padding in the input
input_msk = (input_seq != input_pad).unsqueeze(1)

from torch.autograd import Variable
import torch
import numpy as np
# create mask as before
target_seq = batch.German.transpose(0,1)
target_pad = DE_TEXT.vocab.stoi['<pad>']
target_msk = (target_seq != target_pad).unsqueeze(1)
size = target_seq.size(1) 
# get seq_len for matrix
nopeak_mask = np.triu(np.ones((1, size, size)),k=1).astype('uint8')
nopeak_mask = Variable(torch.from_numpy(nopeak_mask) == 0)
target_msk = target_msk & nopeak_mask

input_msk = input_msk.cuda(1)
target_msk = target_msk.cuda(1)

In [20]:

def nopeak_mask(size,cuda_enabled):
    np_mask = np.triu(np.ones((1, size, size)),
    k=1).astype('uint8')
    np_mask =  torch.autograd.Variable(torch.from_numpy(np_mask) == 0)

    if cuda_enabled:
        np_mask = np_mask.cuda(1)
    return np_mask

def create_masks(src, trg):
    src_mask = (src != 0).unsqueeze(-2)
    if trg is not None:
        trg_mask = (trg != 0).unsqueeze(-2)
        size = trg.size(1) # get seq_len for matrix
        # print("Sequence lenght in mask ",size)
        np_mask = nopeak_mask(size,True)
        # print(np_mask.shape,trg_mask.shape)
        if trg.is_cuda:
            np_mask.cuda(1)
        trg_mask = trg_mask & np_mask
    else:
        trg_mask = None
    return src_mask, trg_mask

In [21]:
d_model = 80
heads = 4
N = 3
src_vocab = len(EN_TEXT.vocab)
trg_vocab = len(DE_TEXT.vocab)

model = Transformer(src_vocab, trg_vocab, d_model, N, heads)
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
        
optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [22]:
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
embed_parm = 2*sum([np.prod(p.size()) for p in model.encoder.embed.parameters()])
params,params-embed_parm

(5228104, 2661064)

In [23]:
def train_model(epochs, print_every=100):
    model.cuda(1)
    model.train()

    start = time.time()
    temp = start
    
    total_loss = 0
    
    for epoch in range(epochs):
       
        for i, batch in enumerate(train_iter):            
            src = batch.English.transpose(0,1)
            trg = batch.German.transpose(0,1)            
            
            
            trg_input = trg[:, :-1]
            
            # the words we are trying to predict
            
            targets = trg[:, 1:].contiguous().view(-1)
            
            # create function to make masks using mask code above
            
            if torch.cuda.is_available():
                src = src.cuda(1)
                trg_input = trg_input.cuda(1)
            
            #print(src.is_cuda)    
            #print(trg_input.is_cuda)
            #print(input_msk.is_cuda)
            #print(target_msk.is_cuda)
            
            src_mask, trg_mask = create_masks(src, trg_input)
            #print(src_mask)
            preds = model(src, trg_input, src_mask, trg_mask)
            
            optim.zero_grad()
            
            loss = F.cross_entropy(preds.view(-1, preds.size(-1)),targets.cuda(1), ignore_index=target_pad)            
            loss.backward()
            optim.step()
            
            print(loss.data)
            total_loss += loss.data
            if (i + 1) % print_every == 0:
                loss_avg = total_loss / print_every
                print("time = %dm, epoch %d, iter = %d, loss = %.3f, %ds per %d iters" % ((time.time() - start) // 60, epoch + 1, i + 1, loss_avg, time.time() - temp,print_every))
                total_loss = 0
                temp = time.time()

In [None]:
train_model(50, print_every=1)

tensor(10.4338, device='cuda:1')
time = 0m, epoch 1, iter = 1, loss = 10.434, 0s per 1 iters
tensor(10.4326, device='cuda:1')
time = 0m, epoch 1, iter = 2, loss = 10.433, 0s per 1 iters
tensor(10.4259, device='cuda:1')
time = 0m, epoch 1, iter = 3, loss = 10.426, 0s per 1 iters
tensor(10.4266, device='cuda:1')
time = 0m, epoch 1, iter = 4, loss = 10.427, 0s per 1 iters
tensor(10.4104, device='cuda:1')
time = 0m, epoch 1, iter = 5, loss = 10.410, 0s per 1 iters
tensor(10.4075, device='cuda:1')
time = 0m, epoch 1, iter = 6, loss = 10.407, 0s per 1 iters
tensor(10.4018, device='cuda:1')
time = 0m, epoch 1, iter = 7, loss = 10.402, 0s per 1 iters
tensor(10.4035, device='cuda:1')
time = 0m, epoch 1, iter = 8, loss = 10.403, 0s per 1 iters
tensor(10.4092, device='cuda:1')
time = 0m, epoch 1, iter = 9, loss = 10.409, 0s per 1 iters
tensor(10.4027, device='cuda:1')
time = 0m, epoch 1, iter = 10, loss = 10.403, 0s per 1 iters
tensor(10.4016, device='cuda:1')
time = 0m, epoch 1, iter = 11, loss 

tensor(10.1999, device='cuda:1')
time = 0m, epoch 1, iter = 89, loss = 10.200, 0s per 1 iters
tensor(10.1421, device='cuda:1')
time = 0m, epoch 1, iter = 90, loss = 10.142, 0s per 1 iters
tensor(10.1541, device='cuda:1')
time = 0m, epoch 1, iter = 91, loss = 10.154, 0s per 1 iters
tensor(10.1490, device='cuda:1')
time = 0m, epoch 1, iter = 92, loss = 10.149, 0s per 1 iters
tensor(10.1377, device='cuda:1')
time = 0m, epoch 1, iter = 93, loss = 10.138, 0s per 1 iters
tensor(10.1039, device='cuda:1')
time = 0m, epoch 1, iter = 94, loss = 10.104, 0s per 1 iters
tensor(10.1520, device='cuda:1')
time = 0m, epoch 1, iter = 95, loss = 10.152, 0s per 1 iters
tensor(10.1638, device='cuda:1')
time = 0m, epoch 1, iter = 96, loss = 10.164, 0s per 1 iters
tensor(10.1346, device='cuda:1')
time = 0m, epoch 1, iter = 97, loss = 10.135, 0s per 1 iters
tensor(10.1056, device='cuda:1')
time = 0m, epoch 1, iter = 98, loss = 10.106, 0s per 1 iters
tensor(10.1241, device='cuda:1')
time = 0m, epoch 1, iter = 

tensor(9.9346, device='cuda:1')
time = 0m, epoch 1, iter = 178, loss = 9.935, 0s per 1 iters
tensor(9.8449, device='cuda:1')
time = 0m, epoch 1, iter = 179, loss = 9.845, 0s per 1 iters
tensor(9.9315, device='cuda:1')
time = 0m, epoch 1, iter = 180, loss = 9.931, 0s per 1 iters
tensor(9.8266, device='cuda:1')
time = 0m, epoch 1, iter = 181, loss = 9.827, 0s per 1 iters
tensor(9.8589, device='cuda:1')
time = 0m, epoch 1, iter = 182, loss = 9.859, 0s per 1 iters
tensor(9.8896, device='cuda:1')
time = 0m, epoch 1, iter = 183, loss = 9.890, 0s per 1 iters
tensor(9.9056, device='cuda:1')
time = 0m, epoch 1, iter = 184, loss = 9.906, 0s per 1 iters
tensor(9.8519, device='cuda:1')
time = 0m, epoch 1, iter = 185, loss = 9.852, 0s per 1 iters
tensor(9.9170, device='cuda:1')
time = 0m, epoch 1, iter = 186, loss = 9.917, 0s per 1 iters
tensor(9.8996, device='cuda:1')
time = 0m, epoch 1, iter = 187, loss = 9.900, 0s per 1 iters
tensor(9.9206, device='cuda:1')
time = 0m, epoch 1, iter = 188, loss =

tensor(9.6517, device='cuda:1')
time = 0m, epoch 1, iter = 267, loss = 9.652, 0s per 1 iters
tensor(9.6747, device='cuda:1')
time = 0m, epoch 1, iter = 268, loss = 9.675, 0s per 1 iters
tensor(9.6562, device='cuda:1')
time = 0m, epoch 1, iter = 269, loss = 9.656, 0s per 1 iters
tensor(9.6001, device='cuda:1')
time = 0m, epoch 1, iter = 270, loss = 9.600, 0s per 1 iters
tensor(9.6002, device='cuda:1')
time = 0m, epoch 1, iter = 271, loss = 9.600, 0s per 1 iters
tensor(9.6012, device='cuda:1')
time = 0m, epoch 1, iter = 272, loss = 9.601, 0s per 1 iters
tensor(9.7122, device='cuda:1')
time = 0m, epoch 1, iter = 273, loss = 9.712, 0s per 1 iters
tensor(9.6629, device='cuda:1')
time = 0m, epoch 1, iter = 274, loss = 9.663, 0s per 1 iters
tensor(9.6236, device='cuda:1')
time = 0m, epoch 1, iter = 275, loss = 9.624, 0s per 1 iters
tensor(9.6710, device='cuda:1')
time = 0m, epoch 1, iter = 276, loss = 9.671, 0s per 1 iters
tensor(9.5788, device='cuda:1')
time = 0m, epoch 1, iter = 277, loss =

tensor(9.3174, device='cuda:1')
time = 1m, epoch 1, iter = 356, loss = 9.317, 0s per 1 iters
tensor(9.3677, device='cuda:1')
time = 1m, epoch 1, iter = 357, loss = 9.368, 0s per 1 iters
tensor(9.3172, device='cuda:1')
time = 1m, epoch 1, iter = 358, loss = 9.317, 0s per 1 iters
tensor(9.4269, device='cuda:1')
time = 1m, epoch 1, iter = 359, loss = 9.427, 0s per 1 iters
tensor(9.4751, device='cuda:1')
time = 1m, epoch 1, iter = 360, loss = 9.475, 0s per 1 iters
tensor(9.3782, device='cuda:1')
time = 1m, epoch 1, iter = 361, loss = 9.378, 0s per 1 iters
tensor(9.3337, device='cuda:1')
time = 1m, epoch 1, iter = 362, loss = 9.334, 0s per 1 iters
tensor(9.4225, device='cuda:1')
time = 1m, epoch 1, iter = 363, loss = 9.423, 0s per 1 iters
tensor(9.6592, device='cuda:1')
time = 1m, epoch 1, iter = 364, loss = 9.659, 0s per 1 iters
tensor(9.4232, device='cuda:1')
time = 1m, epoch 1, iter = 365, loss = 9.423, 0s per 1 iters
tensor(9.3122, device='cuda:1')
time = 1m, epoch 1, iter = 366, loss =

time = 1m, epoch 1, iter = 445, loss = 9.073, 0s per 1 iters
tensor(9.0848, device='cuda:1')
time = 1m, epoch 1, iter = 446, loss = 9.085, 0s per 1 iters
tensor(9.2021, device='cuda:1')
time = 1m, epoch 1, iter = 447, loss = 9.202, 0s per 1 iters
tensor(9.0357, device='cuda:1')
time = 1m, epoch 1, iter = 448, loss = 9.036, 0s per 1 iters
tensor(9.0027, device='cuda:1')
time = 1m, epoch 1, iter = 449, loss = 9.003, 0s per 1 iters
tensor(9.2205, device='cuda:1')
time = 1m, epoch 1, iter = 450, loss = 9.221, 0s per 1 iters
tensor(9.0126, device='cuda:1')
time = 1m, epoch 1, iter = 451, loss = 9.013, 0s per 1 iters
tensor(9.2028, device='cuda:1')
time = 1m, epoch 1, iter = 452, loss = 9.203, 0s per 1 iters
tensor(9.0541, device='cuda:1')
time = 1m, epoch 1, iter = 453, loss = 9.054, 0s per 1 iters
tensor(9.0563, device='cuda:1')
time = 1m, epoch 1, iter = 454, loss = 9.056, 0s per 1 iters
tensor(9.1006, device='cuda:1')
time = 1m, epoch 1, iter = 455, loss = 9.101, 0s per 1 iters
tensor(9.

tensor(8.7591, device='cuda:1')
time = 1m, epoch 1, iter = 534, loss = 8.759, 0s per 1 iters
tensor(8.8856, device='cuda:1')
time = 1m, epoch 1, iter = 535, loss = 8.886, 0s per 1 iters
tensor(9.0303, device='cuda:1')
time = 1m, epoch 1, iter = 536, loss = 9.030, 0s per 1 iters
tensor(8.9425, device='cuda:1')
time = 1m, epoch 1, iter = 537, loss = 8.943, 0s per 1 iters
tensor(8.6213, device='cuda:1')
time = 1m, epoch 1, iter = 538, loss = 8.621, 0s per 1 iters
tensor(8.8981, device='cuda:1')
time = 1m, epoch 1, iter = 539, loss = 8.898, 0s per 1 iters
tensor(8.9135, device='cuda:1')
time = 1m, epoch 1, iter = 540, loss = 8.913, 0s per 1 iters
tensor(8.7666, device='cuda:1')
time = 1m, epoch 1, iter = 541, loss = 8.767, 0s per 1 iters
tensor(8.6930, device='cuda:1')
time = 1m, epoch 1, iter = 542, loss = 8.693, 0s per 1 iters
tensor(8.7844, device='cuda:1')
time = 1m, epoch 1, iter = 543, loss = 8.784, 0s per 1 iters
tensor(8.9452, device='cuda:1')
time = 1m, epoch 1, iter = 544, loss =

tensor(8.4664, device='cuda:1')
time = 2m, epoch 1, iter = 624, loss = 8.466, 0s per 1 iters
tensor(8.6938, device='cuda:1')
time = 2m, epoch 1, iter = 625, loss = 8.694, 0s per 1 iters
tensor(8.3605, device='cuda:1')
time = 2m, epoch 1, iter = 626, loss = 8.360, 0s per 1 iters
tensor(8.5293, device='cuda:1')
time = 2m, epoch 1, iter = 627, loss = 8.529, 0s per 1 iters
tensor(8.5354, device='cuda:1')
time = 2m, epoch 1, iter = 628, loss = 8.535, 0s per 1 iters
tensor(8.5753, device='cuda:1')
time = 2m, epoch 1, iter = 629, loss = 8.575, 0s per 1 iters
tensor(8.5876, device='cuda:1')
time = 2m, epoch 1, iter = 630, loss = 8.588, 0s per 1 iters
tensor(8.3921, device='cuda:1')
time = 2m, epoch 1, iter = 631, loss = 8.392, 0s per 1 iters
tensor(8.4348, device='cuda:1')
time = 2m, epoch 1, iter = 632, loss = 8.435, 0s per 1 iters
tensor(8.5531, device='cuda:1')
time = 2m, epoch 1, iter = 633, loss = 8.553, 0s per 1 iters
tensor(8.3866, device='cuda:1')
time = 2m, epoch 1, iter = 634, loss =

tensor(8.2605, device='cuda:1')
time = 2m, epoch 1, iter = 714, loss = 8.261, 0s per 1 iters
tensor(8.4942, device='cuda:1')
time = 2m, epoch 1, iter = 715, loss = 8.494, 0s per 1 iters
tensor(8.2408, device='cuda:1')
time = 2m, epoch 1, iter = 716, loss = 8.241, 0s per 1 iters
tensor(8.1709, device='cuda:1')
time = 2m, epoch 1, iter = 717, loss = 8.171, 0s per 1 iters
tensor(8.2056, device='cuda:1')
time = 2m, epoch 1, iter = 718, loss = 8.206, 0s per 1 iters
tensor(8.0570, device='cuda:1')
time = 2m, epoch 1, iter = 719, loss = 8.057, 0s per 1 iters
tensor(8.6145, device='cuda:1')
time = 2m, epoch 1, iter = 720, loss = 8.615, 0s per 1 iters
tensor(8.2870, device='cuda:1')
time = 2m, epoch 1, iter = 721, loss = 8.287, 0s per 1 iters
tensor(8.0851, device='cuda:1')
time = 2m, epoch 1, iter = 722, loss = 8.085, 0s per 1 iters
tensor(8.1969, device='cuda:1')
time = 2m, epoch 1, iter = 723, loss = 8.197, 0s per 1 iters
tensor(8.1801, device='cuda:1')
time = 2m, epoch 1, iter = 724, loss =

tensor(7.9142, device='cuda:1')
time = 2m, epoch 1, iter = 803, loss = 7.914, 0s per 1 iters
tensor(7.9669, device='cuda:1')
time = 2m, epoch 1, iter = 804, loss = 7.967, 0s per 1 iters
tensor(8.1873, device='cuda:1')
time = 2m, epoch 1, iter = 805, loss = 8.187, 0s per 1 iters
tensor(7.9938, device='cuda:1')
time = 2m, epoch 1, iter = 806, loss = 7.994, 0s per 1 iters
tensor(7.8214, device='cuda:1')
time = 2m, epoch 1, iter = 807, loss = 7.821, 0s per 1 iters
tensor(8.0572, device='cuda:1')
time = 2m, epoch 1, iter = 808, loss = 8.057, 0s per 1 iters
tensor(8.0375, device='cuda:1')
time = 2m, epoch 1, iter = 809, loss = 8.038, 0s per 1 iters
tensor(8.3804, device='cuda:1')
time = 2m, epoch 1, iter = 810, loss = 8.380, 0s per 1 iters
tensor(7.8917, device='cuda:1')
time = 2m, epoch 1, iter = 811, loss = 7.892, 0s per 1 iters
tensor(8.1473, device='cuda:1')
time = 2m, epoch 1, iter = 812, loss = 8.147, 0s per 1 iters
tensor(8.0065, device='cuda:1')
time = 2m, epoch 1, iter = 813, loss =

tensor(8.0482, device='cuda:1')
time = 2m, epoch 1, iter = 893, loss = 8.048, 0s per 1 iters
tensor(7.7512, device='cuda:1')
time = 2m, epoch 1, iter = 894, loss = 7.751, 0s per 1 iters
tensor(7.7715, device='cuda:1')
time = 2m, epoch 1, iter = 895, loss = 7.771, 0s per 1 iters
tensor(7.5290, device='cuda:1')
time = 2m, epoch 1, iter = 896, loss = 7.529, 0s per 1 iters
tensor(7.6583, device='cuda:1')
time = 2m, epoch 1, iter = 897, loss = 7.658, 0s per 1 iters
tensor(7.4362, device='cuda:1')
time = 2m, epoch 1, iter = 898, loss = 7.436, 0s per 1 iters
tensor(8.1764, device='cuda:1')
time = 2m, epoch 1, iter = 899, loss = 8.176, 0s per 1 iters
tensor(7.7628, device='cuda:1')
time = 2m, epoch 1, iter = 900, loss = 7.763, 0s per 1 iters
tensor(7.8467, device='cuda:1')
time = 2m, epoch 1, iter = 901, loss = 7.847, 0s per 1 iters
tensor(7.4717, device='cuda:1')
time = 2m, epoch 1, iter = 902, loss = 7.472, 0s per 1 iters
tensor(7.9611, device='cuda:1')
time = 2m, epoch 1, iter = 903, loss =

tensor(7.5324, device='cuda:1')
time = 3m, epoch 1, iter = 983, loss = 7.532, 0s per 1 iters
tensor(7.3660, device='cuda:1')
time = 3m, epoch 1, iter = 984, loss = 7.366, 0s per 1 iters
tensor(7.3768, device='cuda:1')
time = 3m, epoch 1, iter = 985, loss = 7.377, 0s per 1 iters
tensor(7.2768, device='cuda:1')
time = 3m, epoch 1, iter = 986, loss = 7.277, 0s per 1 iters
tensor(7.2678, device='cuda:1')
time = 3m, epoch 1, iter = 987, loss = 7.268, 0s per 1 iters
tensor(7.5020, device='cuda:1')
time = 3m, epoch 1, iter = 988, loss = 7.502, 0s per 1 iters
tensor(8.1325, device='cuda:1')
time = 3m, epoch 1, iter = 989, loss = 8.132, 0s per 1 iters
tensor(7.7158, device='cuda:1')
time = 3m, epoch 1, iter = 990, loss = 7.716, 0s per 1 iters
tensor(7.5267, device='cuda:1')
time = 3m, epoch 1, iter = 991, loss = 7.527, 0s per 1 iters
tensor(7.2853, device='cuda:1')
time = 3m, epoch 1, iter = 992, loss = 7.285, 0s per 1 iters
tensor(7.5997, device='cuda:1')
time = 3m, epoch 1, iter = 993, loss =

tensor(7.1950, device='cuda:1')
time = 3m, epoch 1, iter = 1071, loss = 7.195, 0s per 1 iters
tensor(6.9944, device='cuda:1')
time = 3m, epoch 1, iter = 1072, loss = 6.994, 0s per 1 iters
tensor(7.5236, device='cuda:1')
time = 3m, epoch 1, iter = 1073, loss = 7.524, 0s per 1 iters
tensor(7.4987, device='cuda:1')
time = 3m, epoch 1, iter = 1074, loss = 7.499, 0s per 1 iters
tensor(7.0031, device='cuda:1')
time = 3m, epoch 1, iter = 1075, loss = 7.003, 0s per 1 iters
tensor(7.3474, device='cuda:1')
time = 3m, epoch 1, iter = 1076, loss = 7.347, 0s per 1 iters
tensor(7.3260, device='cuda:1')
time = 3m, epoch 1, iter = 1077, loss = 7.326, 0s per 1 iters
tensor(7.4774, device='cuda:1')
time = 3m, epoch 1, iter = 1078, loss = 7.477, 0s per 1 iters
tensor(8.0716, device='cuda:1')
time = 3m, epoch 1, iter = 1079, loss = 8.072, 0s per 1 iters
tensor(7.1617, device='cuda:1')
time = 3m, epoch 1, iter = 1080, loss = 7.162, 0s per 1 iters
tensor(7.4961, device='cuda:1')
time = 3m, epoch 1, iter = 1

tensor(7.0205, device='cuda:1')
time = 3m, epoch 1, iter = 1160, loss = 7.020, 0s per 1 iters
tensor(7.1248, device='cuda:1')
time = 3m, epoch 1, iter = 1161, loss = 7.125, 0s per 1 iters
tensor(7.1348, device='cuda:1')
time = 3m, epoch 1, iter = 1162, loss = 7.135, 0s per 1 iters
tensor(6.9430, device='cuda:1')
time = 3m, epoch 1, iter = 1163, loss = 6.943, 0s per 1 iters
tensor(7.0863, device='cuda:1')
time = 3m, epoch 1, iter = 1164, loss = 7.086, 0s per 1 iters
tensor(7.1709, device='cuda:1')
time = 3m, epoch 1, iter = 1165, loss = 7.171, 0s per 1 iters
tensor(6.8083, device='cuda:1')
time = 3m, epoch 1, iter = 1166, loss = 6.808, 0s per 1 iters
tensor(6.7420, device='cuda:1')
time = 3m, epoch 1, iter = 1167, loss = 6.742, 0s per 1 iters
tensor(7.0447, device='cuda:1')
time = 3m, epoch 1, iter = 1168, loss = 7.045, 0s per 1 iters
tensor(6.9405, device='cuda:1')
time = 3m, epoch 1, iter = 1169, loss = 6.940, 0s per 1 iters
tensor(7.0381, device='cuda:1')
time = 3m, epoch 1, iter = 1

tensor(6.2463, device='cuda:1')
time = 3m, epoch 1, iter = 1248, loss = 6.246, 0s per 1 iters
tensor(7.3958, device='cuda:1')
time = 3m, epoch 1, iter = 1249, loss = 7.396, 0s per 1 iters
tensor(6.5554, device='cuda:1')
time = 3m, epoch 1, iter = 1250, loss = 6.555, 0s per 1 iters
tensor(6.9151, device='cuda:1')
time = 3m, epoch 1, iter = 1251, loss = 6.915, 0s per 1 iters
tensor(7.1857, device='cuda:1')
time = 3m, epoch 1, iter = 1252, loss = 7.186, 0s per 1 iters
tensor(6.6153, device='cuda:1')
time = 3m, epoch 1, iter = 1253, loss = 6.615, 0s per 1 iters
tensor(6.8604, device='cuda:1')
time = 3m, epoch 1, iter = 1254, loss = 6.860, 0s per 1 iters
tensor(6.6305, device='cuda:1')
time = 3m, epoch 1, iter = 1255, loss = 6.631, 0s per 1 iters
tensor(6.0287, device='cuda:1')
time = 3m, epoch 1, iter = 1256, loss = 6.029, 0s per 1 iters
tensor(6.6890, device='cuda:1')
time = 3m, epoch 1, iter = 1257, loss = 6.689, 0s per 1 iters
tensor(6.9156, device='cuda:1')
time = 3m, epoch 1, iter = 1

tensor(6.2227, device='cuda:1')
time = 4m, epoch 1, iter = 1336, loss = 6.223, 0s per 1 iters
tensor(6.8738, device='cuda:1')
time = 4m, epoch 1, iter = 1337, loss = 6.874, 0s per 1 iters
tensor(6.6562, device='cuda:1')
time = 4m, epoch 1, iter = 1338, loss = 6.656, 0s per 1 iters
tensor(6.8762, device='cuda:1')
time = 4m, epoch 1, iter = 1339, loss = 6.876, 0s per 1 iters
tensor(7.0337, device='cuda:1')
time = 4m, epoch 1, iter = 1340, loss = 7.034, 0s per 1 iters
tensor(6.6954, device='cuda:1')
time = 4m, epoch 1, iter = 1341, loss = 6.695, 0s per 1 iters
tensor(6.5036, device='cuda:1')
time = 4m, epoch 1, iter = 1342, loss = 6.504, 0s per 1 iters
tensor(6.4440, device='cuda:1')
time = 4m, epoch 1, iter = 1343, loss = 6.444, 0s per 1 iters
tensor(6.8065, device='cuda:1')
time = 4m, epoch 1, iter = 1344, loss = 6.807, 0s per 1 iters
tensor(6.9198, device='cuda:1')
time = 4m, epoch 1, iter = 1345, loss = 6.920, 0s per 1 iters
tensor(6.6243, device='cuda:1')
time = 4m, epoch 1, iter = 1

tensor(6.4337, device='cuda:1')
time = 4m, epoch 1, iter = 1425, loss = 6.434, 0s per 1 iters
tensor(6.4896, device='cuda:1')
time = 4m, epoch 1, iter = 1426, loss = 6.490, 0s per 1 iters
tensor(6.3459, device='cuda:1')
time = 4m, epoch 1, iter = 1427, loss = 6.346, 0s per 1 iters
tensor(6.1180, device='cuda:1')
time = 4m, epoch 1, iter = 1428, loss = 6.118, 0s per 1 iters
tensor(6.2239, device='cuda:1')
time = 4m, epoch 1, iter = 1429, loss = 6.224, 0s per 1 iters
tensor(7.0567, device='cuda:1')
time = 4m, epoch 1, iter = 1430, loss = 7.057, 0s per 1 iters
tensor(6.8338, device='cuda:1')
time = 4m, epoch 1, iter = 1431, loss = 6.834, 0s per 1 iters
tensor(7.2220, device='cuda:1')
time = 4m, epoch 1, iter = 1432, loss = 7.222, 0s per 1 iters
tensor(6.7119, device='cuda:1')
time = 4m, epoch 1, iter = 1433, loss = 6.712, 0s per 1 iters
tensor(6.2403, device='cuda:1')
time = 4m, epoch 1, iter = 1434, loss = 6.240, 0s per 1 iters
tensor(6.3419, device='cuda:1')
time = 4m, epoch 1, iter = 1

tensor(6.6418, device='cuda:1')
time = 4m, epoch 1, iter = 1513, loss = 6.642, 0s per 1 iters
tensor(7.6402, device='cuda:1')
time = 4m, epoch 1, iter = 1514, loss = 7.640, 0s per 1 iters
tensor(6.7637, device='cuda:1')
time = 4m, epoch 1, iter = 1515, loss = 6.764, 0s per 1 iters
tensor(6.8329, device='cuda:1')
time = 4m, epoch 1, iter = 1516, loss = 6.833, 0s per 1 iters
tensor(6.7046, device='cuda:1')
time = 4m, epoch 1, iter = 1517, loss = 6.705, 0s per 1 iters
tensor(6.5359, device='cuda:1')
time = 4m, epoch 1, iter = 1518, loss = 6.536, 0s per 1 iters
tensor(6.1169, device='cuda:1')
time = 4m, epoch 1, iter = 1519, loss = 6.117, 0s per 1 iters
tensor(6.2275, device='cuda:1')
time = 4m, epoch 1, iter = 1520, loss = 6.228, 0s per 1 iters
tensor(6.0991, device='cuda:1')
time = 4m, epoch 1, iter = 1521, loss = 6.099, 0s per 1 iters
tensor(6.6944, device='cuda:1')
time = 4m, epoch 1, iter = 1522, loss = 6.694, 0s per 1 iters
tensor(6.5932, device='cuda:1')
time = 4m, epoch 1, iter = 1

tensor(6.4600, device='cuda:1')
time = 4m, epoch 1, iter = 1601, loss = 6.460, 0s per 1 iters
tensor(6.4108, device='cuda:1')
time = 4m, epoch 1, iter = 1602, loss = 6.411, 0s per 1 iters
tensor(6.1083, device='cuda:1')
time = 4m, epoch 1, iter = 1603, loss = 6.108, 0s per 1 iters
tensor(6.2349, device='cuda:1')
time = 4m, epoch 1, iter = 1604, loss = 6.235, 0s per 1 iters
tensor(6.1451, device='cuda:1')
time = 4m, epoch 1, iter = 1605, loss = 6.145, 0s per 1 iters
tensor(6.5830, device='cuda:1')
time = 4m, epoch 1, iter = 1606, loss = 6.583, 0s per 1 iters
tensor(6.5686, device='cuda:1')
time = 4m, epoch 1, iter = 1607, loss = 6.569, 0s per 1 iters
tensor(6.3436, device='cuda:1')
time = 4m, epoch 1, iter = 1608, loss = 6.344, 0s per 1 iters
tensor(6.7221, device='cuda:1')
time = 4m, epoch 1, iter = 1609, loss = 6.722, 0s per 1 iters
tensor(6.1556, device='cuda:1')
time = 4m, epoch 1, iter = 1610, loss = 6.156, 0s per 1 iters
tensor(6.5167, device='cuda:1')
time = 4m, epoch 1, iter = 1

tensor(6.0303, device='cuda:1')
time = 4m, epoch 1, iter = 1689, loss = 6.030, 0s per 1 iters
tensor(6.6467, device='cuda:1')
time = 4m, epoch 1, iter = 1690, loss = 6.647, 0s per 1 iters
tensor(5.9049, device='cuda:1')
time = 4m, epoch 1, iter = 1691, loss = 5.905, 0s per 1 iters
tensor(6.6999, device='cuda:1')
time = 4m, epoch 1, iter = 1692, loss = 6.700, 0s per 1 iters
tensor(5.9051, device='cuda:1')
time = 4m, epoch 1, iter = 1693, loss = 5.905, 0s per 1 iters
tensor(6.3996, device='cuda:1')
time = 4m, epoch 1, iter = 1694, loss = 6.400, 0s per 1 iters
tensor(6.5927, device='cuda:1')
time = 5m, epoch 1, iter = 1695, loss = 6.593, 0s per 1 iters
tensor(6.1783, device='cuda:1')
time = 5m, epoch 1, iter = 1696, loss = 6.178, 0s per 1 iters
tensor(6.0839, device='cuda:1')
time = 5m, epoch 1, iter = 1697, loss = 6.084, 0s per 1 iters
tensor(6.5252, device='cuda:1')
time = 5m, epoch 1, iter = 1698, loss = 6.525, 0s per 1 iters
tensor(6.4840, device='cuda:1')
time = 5m, epoch 1, iter = 1

tensor(6.6031, device='cuda:1')
time = 5m, epoch 1, iter = 1778, loss = 6.603, 0s per 1 iters
tensor(6.1294, device='cuda:1')
time = 5m, epoch 1, iter = 1779, loss = 6.129, 0s per 1 iters
tensor(6.2359, device='cuda:1')
time = 5m, epoch 1, iter = 1780, loss = 6.236, 0s per 1 iters
tensor(5.8765, device='cuda:1')
time = 5m, epoch 1, iter = 1781, loss = 5.876, 0s per 1 iters
tensor(5.8861, device='cuda:1')
time = 5m, epoch 1, iter = 1782, loss = 5.886, 0s per 1 iters
tensor(5.9966, device='cuda:1')
time = 5m, epoch 1, iter = 1783, loss = 5.997, 0s per 1 iters
tensor(6.6525, device='cuda:1')
time = 5m, epoch 1, iter = 1784, loss = 6.652, 0s per 1 iters
tensor(5.9190, device='cuda:1')
time = 5m, epoch 1, iter = 1785, loss = 5.919, 0s per 1 iters
tensor(6.8766, device='cuda:1')
time = 5m, epoch 1, iter = 1786, loss = 6.877, 0s per 1 iters
tensor(6.6573, device='cuda:1')
time = 5m, epoch 1, iter = 1787, loss = 6.657, 0s per 1 iters
tensor(6.1598, device='cuda:1')
time = 5m, epoch 1, iter = 1

tensor(6.0529, device='cuda:1')
time = 5m, epoch 1, iter = 1867, loss = 6.053, 0s per 1 iters
tensor(6.1851, device='cuda:1')
time = 5m, epoch 1, iter = 1868, loss = 6.185, 0s per 1 iters
tensor(5.9122, device='cuda:1')
time = 5m, epoch 1, iter = 1869, loss = 5.912, 0s per 1 iters
tensor(6.0712, device='cuda:1')
time = 5m, epoch 1, iter = 1870, loss = 6.071, 0s per 1 iters
tensor(6.1153, device='cuda:1')
time = 5m, epoch 1, iter = 1871, loss = 6.115, 0s per 1 iters
tensor(5.4095, device='cuda:1')
time = 5m, epoch 1, iter = 1872, loss = 5.409, 0s per 1 iters
tensor(6.2413, device='cuda:1')
time = 5m, epoch 1, iter = 1873, loss = 6.241, 0s per 1 iters
tensor(6.4358, device='cuda:1')
time = 5m, epoch 1, iter = 1874, loss = 6.436, 0s per 1 iters
tensor(5.9628, device='cuda:1')
time = 5m, epoch 1, iter = 1875, loss = 5.963, 0s per 1 iters
tensor(6.2604, device='cuda:1')
time = 5m, epoch 1, iter = 1876, loss = 6.260, 0s per 1 iters
tensor(6.4197, device='cuda:1')
time = 5m, epoch 1, iter = 1

tensor(6.8127, device='cuda:1')
time = 5m, epoch 1, iter = 1956, loss = 6.813, 0s per 1 iters
tensor(6.2461, device='cuda:1')
time = 5m, epoch 1, iter = 1957, loss = 6.246, 0s per 1 iters
tensor(5.8816, device='cuda:1')
time = 5m, epoch 1, iter = 1958, loss = 5.882, 0s per 1 iters
tensor(6.2288, device='cuda:1')
time = 5m, epoch 1, iter = 1959, loss = 6.229, 0s per 1 iters
tensor(5.9182, device='cuda:1')
time = 5m, epoch 1, iter = 1960, loss = 5.918, 0s per 1 iters
tensor(6.1889, device='cuda:1')
time = 5m, epoch 1, iter = 1961, loss = 6.189, 0s per 1 iters
tensor(6.1506, device='cuda:1')
time = 5m, epoch 1, iter = 1962, loss = 6.151, 0s per 1 iters
tensor(5.9256, device='cuda:1')
time = 5m, epoch 1, iter = 1963, loss = 5.926, 0s per 1 iters
tensor(6.1709, device='cuda:1')
time = 5m, epoch 1, iter = 1964, loss = 6.171, 0s per 1 iters
tensor(6.8647, device='cuda:1')
time = 5m, epoch 1, iter = 1965, loss = 6.865, 0s per 1 iters
tensor(6.0679, device='cuda:1')
time = 5m, epoch 1, iter = 1

tensor(6.7251, device='cuda:1')
time = 5m, epoch 1, iter = 2045, loss = 6.725, 0s per 1 iters
tensor(6.4175, device='cuda:1')
time = 5m, epoch 1, iter = 2046, loss = 6.418, 0s per 1 iters
tensor(6.1965, device='cuda:1')
time = 5m, epoch 1, iter = 2047, loss = 6.197, 0s per 1 iters
tensor(5.6094, device='cuda:1')
time = 5m, epoch 1, iter = 2048, loss = 5.609, 0s per 1 iters
tensor(5.4613, device='cuda:1')
time = 5m, epoch 1, iter = 2049, loss = 5.461, 0s per 1 iters
tensor(6.4706, device='cuda:1')
time = 5m, epoch 1, iter = 2050, loss = 6.471, 0s per 1 iters
tensor(5.8846, device='cuda:1')
time = 5m, epoch 1, iter = 2051, loss = 5.885, 0s per 1 iters
tensor(6.2568, device='cuda:1')
time = 5m, epoch 1, iter = 2052, loss = 6.257, 0s per 1 iters
tensor(6.4070, device='cuda:1')
time = 5m, epoch 1, iter = 2053, loss = 6.407, 0s per 1 iters
tensor(6.3042, device='cuda:1')
time = 5m, epoch 1, iter = 2054, loss = 6.304, 0s per 1 iters
tensor(6.5229, device='cuda:1')
time = 5m, epoch 1, iter = 2

tensor(6.7334, device='cuda:1')
time = 6m, epoch 1, iter = 2133, loss = 6.733, 0s per 1 iters
tensor(6.9149, device='cuda:1')
time = 6m, epoch 1, iter = 2134, loss = 6.915, 0s per 1 iters
tensor(5.8826, device='cuda:1')
time = 6m, epoch 1, iter = 2135, loss = 5.883, 0s per 1 iters
tensor(5.7896, device='cuda:1')
time = 6m, epoch 1, iter = 2136, loss = 5.790, 0s per 1 iters
tensor(6.8797, device='cuda:1')
time = 6m, epoch 1, iter = 2137, loss = 6.880, 0s per 1 iters
tensor(6.4842, device='cuda:1')
time = 6m, epoch 1, iter = 2138, loss = 6.484, 0s per 1 iters
tensor(6.1939, device='cuda:1')
time = 6m, epoch 1, iter = 2139, loss = 6.194, 0s per 1 iters
tensor(5.7565, device='cuda:1')
time = 6m, epoch 1, iter = 2140, loss = 5.756, 0s per 1 iters
tensor(6.2720, device='cuda:1')
time = 6m, epoch 1, iter = 2141, loss = 6.272, 0s per 1 iters
tensor(5.8299, device='cuda:1')
time = 6m, epoch 1, iter = 2142, loss = 5.830, 0s per 1 iters
tensor(6.7673, device='cuda:1')
time = 6m, epoch 1, iter = 2

tensor(5.6648, device='cuda:1')
time = 6m, epoch 1, iter = 2222, loss = 5.665, 0s per 1 iters
tensor(6.4928, device='cuda:1')
time = 6m, epoch 1, iter = 2223, loss = 6.493, 0s per 1 iters
tensor(6.2088, device='cuda:1')
time = 6m, epoch 1, iter = 2224, loss = 6.209, 0s per 1 iters
tensor(6.0235, device='cuda:1')
time = 6m, epoch 1, iter = 2225, loss = 6.023, 0s per 1 iters
tensor(6.0136, device='cuda:1')
time = 6m, epoch 1, iter = 2226, loss = 6.014, 0s per 1 iters
tensor(6.4496, device='cuda:1')
time = 6m, epoch 1, iter = 2227, loss = 6.450, 0s per 1 iters
tensor(6.0523, device='cuda:1')
time = 6m, epoch 1, iter = 2228, loss = 6.052, 0s per 1 iters
tensor(5.8906, device='cuda:1')
time = 6m, epoch 1, iter = 2229, loss = 5.891, 0s per 1 iters
tensor(6.2027, device='cuda:1')
time = 6m, epoch 1, iter = 2230, loss = 6.203, 0s per 1 iters
tensor(5.9126, device='cuda:1')
time = 6m, epoch 1, iter = 2231, loss = 5.913, 0s per 1 iters
tensor(6.3399, device='cuda:1')
time = 6m, epoch 1, iter = 2

tensor(6.4163, device='cuda:1')
time = 6m, epoch 1, iter = 2311, loss = 6.416, 0s per 1 iters
tensor(5.9846, device='cuda:1')
time = 6m, epoch 1, iter = 2312, loss = 5.985, 0s per 1 iters
tensor(5.9519, device='cuda:1')
time = 6m, epoch 1, iter = 2313, loss = 5.952, 0s per 1 iters
tensor(6.0328, device='cuda:1')
time = 6m, epoch 1, iter = 2314, loss = 6.033, 0s per 1 iters
tensor(6.2029, device='cuda:1')
time = 6m, epoch 1, iter = 2315, loss = 6.203, 0s per 1 iters
tensor(5.7560, device='cuda:1')
time = 6m, epoch 1, iter = 2316, loss = 5.756, 0s per 1 iters
tensor(6.0223, device='cuda:1')
time = 6m, epoch 1, iter = 2317, loss = 6.022, 0s per 1 iters
tensor(5.9501, device='cuda:1')
time = 6m, epoch 1, iter = 2318, loss = 5.950, 0s per 1 iters
tensor(6.1960, device='cuda:1')
time = 6m, epoch 1, iter = 2319, loss = 6.196, 0s per 1 iters
tensor(6.8748, device='cuda:1')
time = 6m, epoch 1, iter = 2320, loss = 6.875, 0s per 1 iters
tensor(6.9567, device='cuda:1')
time = 6m, epoch 1, iter = 2

tensor(6.1136, device='cuda:1')
time = 6m, epoch 1, iter = 2399, loss = 6.114, 0s per 1 iters
tensor(6.1370, device='cuda:1')
time = 6m, epoch 1, iter = 2400, loss = 6.137, 0s per 1 iters
tensor(5.6425, device='cuda:1')
time = 6m, epoch 1, iter = 2401, loss = 5.642, 0s per 1 iters
tensor(5.7311, device='cuda:1')
time = 6m, epoch 1, iter = 2402, loss = 5.731, 0s per 1 iters
tensor(6.4071, device='cuda:1')
time = 6m, epoch 1, iter = 2403, loss = 6.407, 0s per 1 iters
tensor(5.8575, device='cuda:1')
time = 6m, epoch 1, iter = 2404, loss = 5.857, 0s per 1 iters
tensor(6.2810, device='cuda:1')
time = 6m, epoch 1, iter = 2405, loss = 6.281, 0s per 1 iters
tensor(5.5829, device='cuda:1')
time = 6m, epoch 1, iter = 2406, loss = 5.583, 0s per 1 iters
tensor(6.0179, device='cuda:1')
time = 6m, epoch 1, iter = 2407, loss = 6.018, 0s per 1 iters
tensor(5.7605, device='cuda:1')
time = 6m, epoch 1, iter = 2408, loss = 5.761, 0s per 1 iters
tensor(6.2604, device='cuda:1')
time = 6m, epoch 1, iter = 2

tensor(5.9913, device='cuda:1')
time = 7m, epoch 1, iter = 2487, loss = 5.991, 0s per 1 iters
tensor(5.4834, device='cuda:1')
time = 7m, epoch 1, iter = 2488, loss = 5.483, 0s per 1 iters
tensor(6.0348, device='cuda:1')
time = 7m, epoch 1, iter = 2489, loss = 6.035, 0s per 1 iters
tensor(6.4862, device='cuda:1')
time = 7m, epoch 1, iter = 2490, loss = 6.486, 0s per 1 iters
tensor(5.5680, device='cuda:1')
time = 7m, epoch 1, iter = 2491, loss = 5.568, 0s per 1 iters
tensor(5.4467, device='cuda:1')
time = 7m, epoch 1, iter = 2492, loss = 5.447, 0s per 1 iters
tensor(5.9062, device='cuda:1')
time = 7m, epoch 1, iter = 2493, loss = 5.906, 0s per 1 iters
tensor(7.0666, device='cuda:1')
time = 7m, epoch 1, iter = 2494, loss = 7.067, 0s per 1 iters
tensor(5.3744, device='cuda:1')
time = 7m, epoch 1, iter = 2495, loss = 5.374, 0s per 1 iters
tensor(5.5629, device='cuda:1')
time = 7m, epoch 1, iter = 2496, loss = 5.563, 0s per 1 iters
tensor(5.2224, device='cuda:1')
time = 7m, epoch 1, iter = 2

tensor(5.9583, device='cuda:1')
time = 7m, epoch 1, iter = 2575, loss = 5.958, 0s per 1 iters
tensor(6.2327, device='cuda:1')
time = 7m, epoch 1, iter = 2576, loss = 6.233, 0s per 1 iters
tensor(5.5615, device='cuda:1')
time = 7m, epoch 1, iter = 2577, loss = 5.561, 0s per 1 iters
tensor(5.6538, device='cuda:1')
time = 7m, epoch 1, iter = 2578, loss = 5.654, 0s per 1 iters
tensor(5.6712, device='cuda:1')
time = 7m, epoch 1, iter = 2579, loss = 5.671, 0s per 1 iters
tensor(5.8430, device='cuda:1')
time = 7m, epoch 1, iter = 2580, loss = 5.843, 0s per 1 iters
tensor(5.5016, device='cuda:1')
time = 7m, epoch 1, iter = 2581, loss = 5.502, 0s per 1 iters
tensor(5.9109, device='cuda:1')
time = 7m, epoch 1, iter = 2582, loss = 5.911, 0s per 1 iters
tensor(5.8974, device='cuda:1')
time = 7m, epoch 1, iter = 2583, loss = 5.897, 0s per 1 iters
tensor(5.3308, device='cuda:1')
time = 7m, epoch 1, iter = 2584, loss = 5.331, 0s per 1 iters
tensor(6.5042, device='cuda:1')
time = 7m, epoch 1, iter = 2

tensor(5.4260, device='cuda:1')
time = 7m, epoch 1, iter = 2664, loss = 5.426, 0s per 1 iters
tensor(5.8054, device='cuda:1')
time = 7m, epoch 1, iter = 2665, loss = 5.805, 0s per 1 iters
tensor(5.8732, device='cuda:1')
time = 7m, epoch 1, iter = 2666, loss = 5.873, 0s per 1 iters
tensor(5.2671, device='cuda:1')
time = 7m, epoch 1, iter = 2667, loss = 5.267, 0s per 1 iters
tensor(5.7329, device='cuda:1')
time = 7m, epoch 1, iter = 2668, loss = 5.733, 0s per 1 iters
tensor(5.8942, device='cuda:1')
time = 7m, epoch 1, iter = 2669, loss = 5.894, 0s per 1 iters
tensor(5.6980, device='cuda:1')
time = 7m, epoch 1, iter = 2670, loss = 5.698, 0s per 1 iters
tensor(5.8839, device='cuda:1')
time = 7m, epoch 1, iter = 2671, loss = 5.884, 0s per 1 iters
tensor(5.4696, device='cuda:1')
time = 7m, epoch 1, iter = 2672, loss = 5.470, 0s per 1 iters
tensor(6.5071, device='cuda:1')
time = 7m, epoch 1, iter = 2673, loss = 6.507, 0s per 1 iters
tensor(5.8844, device='cuda:1')
time = 7m, epoch 1, iter = 2

tensor(5.3240, device='cuda:1')
time = 7m, epoch 1, iter = 2752, loss = 5.324, 0s per 1 iters
tensor(5.9084, device='cuda:1')
time = 7m, epoch 1, iter = 2753, loss = 5.908, 0s per 1 iters
tensor(5.3647, device='cuda:1')
time = 7m, epoch 1, iter = 2754, loss = 5.365, 0s per 1 iters
tensor(5.5933, device='cuda:1')
time = 7m, epoch 1, iter = 2755, loss = 5.593, 0s per 1 iters
tensor(6.0633, device='cuda:1')
time = 7m, epoch 1, iter = 2756, loss = 6.063, 0s per 1 iters
tensor(6.3826, device='cuda:1')
time = 7m, epoch 1, iter = 2757, loss = 6.383, 0s per 1 iters
tensor(5.5267, device='cuda:1')
time = 7m, epoch 1, iter = 2758, loss = 5.527, 0s per 1 iters
tensor(5.8970, device='cuda:1')
time = 7m, epoch 1, iter = 2759, loss = 5.897, 0s per 1 iters
tensor(6.2587, device='cuda:1')
time = 7m, epoch 1, iter = 2760, loss = 6.259, 0s per 1 iters
tensor(6.4053, device='cuda:1')
time = 7m, epoch 1, iter = 2761, loss = 6.405, 0s per 1 iters
tensor(6.2262, device='cuda:1')
time = 7m, epoch 1, iter = 2

tensor(5.6461, device='cuda:1')
time = 8m, epoch 1, iter = 2841, loss = 5.646, 0s per 1 iters
tensor(5.6899, device='cuda:1')
time = 8m, epoch 1, iter = 2842, loss = 5.690, 0s per 1 iters
tensor(6.0044, device='cuda:1')
time = 8m, epoch 1, iter = 2843, loss = 6.004, 0s per 1 iters
tensor(5.9966, device='cuda:1')
time = 8m, epoch 1, iter = 2844, loss = 5.997, 0s per 1 iters
tensor(5.8778, device='cuda:1')
time = 8m, epoch 1, iter = 2845, loss = 5.878, 0s per 1 iters
tensor(5.8867, device='cuda:1')
time = 8m, epoch 1, iter = 2846, loss = 5.887, 0s per 1 iters
tensor(5.8816, device='cuda:1')
time = 8m, epoch 1, iter = 2847, loss = 5.882, 0s per 1 iters
tensor(6.0793, device='cuda:1')
time = 8m, epoch 1, iter = 2848, loss = 6.079, 0s per 1 iters
tensor(5.6806, device='cuda:1')
time = 8m, epoch 1, iter = 2849, loss = 5.681, 0s per 1 iters
tensor(6.3196, device='cuda:1')
time = 8m, epoch 1, iter = 2850, loss = 6.320, 0s per 1 iters
tensor(5.7823, device='cuda:1')
time = 8m, epoch 1, iter = 2

tensor(5.9158, device='cuda:1')
time = 8m, epoch 1, iter = 2929, loss = 5.916, 0s per 1 iters
tensor(5.8059, device='cuda:1')
time = 8m, epoch 1, iter = 2930, loss = 5.806, 0s per 1 iters
tensor(6.1008, device='cuda:1')
time = 8m, epoch 1, iter = 2931, loss = 6.101, 0s per 1 iters
tensor(6.8837, device='cuda:1')
time = 8m, epoch 1, iter = 2932, loss = 6.884, 0s per 1 iters
tensor(5.6820, device='cuda:1')
time = 8m, epoch 1, iter = 2933, loss = 5.682, 0s per 1 iters
tensor(6.0253, device='cuda:1')
time = 8m, epoch 1, iter = 2934, loss = 6.025, 0s per 1 iters
tensor(6.4578, device='cuda:1')
time = 8m, epoch 1, iter = 2935, loss = 6.458, 0s per 1 iters
tensor(6.3104, device='cuda:1')
time = 8m, epoch 1, iter = 2936, loss = 6.310, 0s per 1 iters
tensor(5.4921, device='cuda:1')
time = 8m, epoch 1, iter = 2937, loss = 5.492, 0s per 1 iters
tensor(6.1991, device='cuda:1')
time = 8m, epoch 1, iter = 2938, loss = 6.199, 0s per 1 iters
tensor(5.9050, device='cuda:1')
time = 8m, epoch 1, iter = 2

tensor(6.0517, device='cuda:1')
time = 8m, epoch 1, iter = 3018, loss = 6.052, 0s per 1 iters
tensor(5.9032, device='cuda:1')
time = 8m, epoch 1, iter = 3019, loss = 5.903, 0s per 1 iters
tensor(5.2690, device='cuda:1')
time = 8m, epoch 1, iter = 3020, loss = 5.269, 0s per 1 iters
tensor(5.5687, device='cuda:1')
time = 8m, epoch 1, iter = 3021, loss = 5.569, 0s per 1 iters
tensor(6.2064, device='cuda:1')
time = 8m, epoch 1, iter = 3022, loss = 6.206, 0s per 1 iters
tensor(5.6266, device='cuda:1')
time = 8m, epoch 1, iter = 3023, loss = 5.627, 0s per 1 iters
tensor(5.8774, device='cuda:1')
time = 8m, epoch 1, iter = 3024, loss = 5.877, 0s per 1 iters
tensor(6.2929, device='cuda:1')
time = 8m, epoch 1, iter = 3025, loss = 6.293, 0s per 1 iters
tensor(6.2555, device='cuda:1')
time = 8m, epoch 1, iter = 3026, loss = 6.256, 0s per 1 iters
tensor(6.0165, device='cuda:1')
time = 8m, epoch 1, iter = 3027, loss = 6.017, 0s per 1 iters
tensor(6.4508, device='cuda:1')
time = 8m, epoch 1, iter = 3

tensor(5.8255, device='cuda:1')
time = 8m, epoch 1, iter = 3107, loss = 5.826, 0s per 1 iters
tensor(5.6341, device='cuda:1')
time = 8m, epoch 1, iter = 3108, loss = 5.634, 0s per 1 iters
tensor(6.7563, device='cuda:1')
time = 8m, epoch 1, iter = 3109, loss = 6.756, 0s per 1 iters
tensor(5.7235, device='cuda:1')
time = 8m, epoch 1, iter = 3110, loss = 5.724, 0s per 1 iters
tensor(5.2042, device='cuda:1')
time = 8m, epoch 1, iter = 3111, loss = 5.204, 0s per 1 iters
tensor(5.2732, device='cuda:1')
time = 8m, epoch 1, iter = 3112, loss = 5.273, 0s per 1 iters
tensor(5.2984, device='cuda:1')
time = 8m, epoch 1, iter = 3113, loss = 5.298, 0s per 1 iters
tensor(5.7634, device='cuda:1')
time = 8m, epoch 1, iter = 3114, loss = 5.763, 0s per 1 iters
tensor(5.1548, device='cuda:1')
time = 8m, epoch 1, iter = 3115, loss = 5.155, 0s per 1 iters
tensor(5.9835, device='cuda:1')
time = 8m, epoch 1, iter = 3116, loss = 5.984, 0s per 1 iters
tensor(5.2887, device='cuda:1')
time = 8m, epoch 1, iter = 3

tensor(5.5149, device='cuda:1')
time = 8m, epoch 1, iter = 3195, loss = 5.515, 0s per 1 iters
tensor(5.8202, device='cuda:1')
time = 8m, epoch 1, iter = 3196, loss = 5.820, 0s per 1 iters
tensor(6.3322, device='cuda:1')
time = 8m, epoch 1, iter = 3197, loss = 6.332, 0s per 1 iters
tensor(6.4444, device='cuda:1')
time = 8m, epoch 1, iter = 3198, loss = 6.444, 0s per 1 iters
tensor(5.9687, device='cuda:1')
time = 8m, epoch 1, iter = 3199, loss = 5.969, 0s per 1 iters
tensor(5.5871, device='cuda:1')
time = 8m, epoch 1, iter = 3200, loss = 5.587, 0s per 1 iters
tensor(5.7011, device='cuda:1')
time = 8m, epoch 1, iter = 3201, loss = 5.701, 0s per 1 iters
tensor(5.6956, device='cuda:1')
time = 8m, epoch 1, iter = 3202, loss = 5.696, 0s per 1 iters
tensor(6.5908, device='cuda:1')
time = 8m, epoch 1, iter = 3203, loss = 6.591, 0s per 1 iters
tensor(5.5972, device='cuda:1')
time = 8m, epoch 1, iter = 3204, loss = 5.597, 0s per 1 iters
tensor(5.6924, device='cuda:1')
time = 8m, epoch 1, iter = 3