In [1]:
import torch
import os
from os.path import exists
import torch.nn as nn
# from torch.nn.functional import log_softmax, pad, one_hot
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
from torch.utils.data import DataLoader
import random
import json
import csv
from pathlib import Path
import shutil
import re
import threading

### utils.py ###

class Dotdict(dict):
    """Dictionary whose entries can also be read, set and deleted as attributes.

    Reading a missing attribute returns ``None`` (it maps to ``dict.get``)
    instead of raising ``AttributeError``.
    """
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

    def __iadd__(self, other):
        """Accumulate ``other`` into ``self`` key by key (``self += other``).

        Only keys already present in ``self`` are updated; a key is skipped
        when ``other`` lacks it or holds a falsy value for it.
        """
        for key in self:
            increment = other[key] if key in other else None
            if increment:
                self[key] += increment

        return self


# Takes the file paths as arguments
def parse_csv_file_to_json(path_file_csv):
    """Read a tab-separated file with a header row into a list of dicts.

    Args:
        path_file_csv: Path of a UTF-8 encoded, tab-delimited file whose
            first row holds the column names.

    Returns:
        A list with one ``{column_name: cell_text}`` dict per data row, in
        file order. Cell values are returned as the raw strings read from the
        file; values that start with ``[`` or ``{`` are NOT JSON-decoded
        (the previous implementation had a branch for them that did exactly
        the same as the default branch).
    """
    # newline='' hands newline handling to the csv module, as its
    # documentation requires, so quoted fields with embedded line breaks
    # are read back unchanged.
    with open(path_file_csv, encoding='utf-8', newline='') as file_csv:
        reader_csv = csv.DictReader(file_csv, delimiter="\t")
        return [dict(row) for row in reader_csv]

### utils.py ###



### core.py ###

def clones(module, N):
    """Produce N identical layers.

    Each entry is an independent deep copy of ``module``; the copies are
    returned in an ``nn.ModuleList``.
    """
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas


class MultiHeadedAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Holds four ``d_model -> d_model`` linear maps: the first three project
    query, key and value, the last one projects the concatenated heads.
    The attention weights of the most recent call are kept in ``self.attn``.
    """

    def __init__(self, h, d_model, dropout=0.1):
        """Take in model size and number of heads."""
        super().__init__()
        assert d_model % h == 0
        # d_v is assumed to always equal d_k.
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def attention(self, query, key, value, mask=None, dropout=None):
        """Compute 'Scaled Dot Product Attention'.

        Positions where ``mask == 0`` receive a score of ``-1e9`` before the
        softmax. Returns ``(weighted values, attention weights)``.
        """
        scale = math.sqrt(query.size(-1))
        scores = torch.matmul(query, key.transpose(-2, -1)) / scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        p_attn = scores.softmax(dim=-1)
        if dropout is not None:
            p_attn = dropout(p_attn)
        return torch.matmul(p_attn, value), p_attn

    def forward(self, query, key, value, mask=None):
        """Project the inputs, attend per head, then merge the heads."""
        if mask is not None:
            # Insert a head axis so the same mask applies to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        def split_heads(linear, tensor):
            # d_model -> (h, d_k), with the head axis moved in front of the sequence axis.
            return linear(tensor).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

        query = split_heads(self.linears[0], query)
        key = split_heads(self.linears[1], key)
        value = split_heads(self.linears[2], value)

        context, self.attn = self.attention(
            query, key, value, mask=mask, dropout=self.dropout
        )

        # "Concat" the heads with a view, then apply the final linear map.
        merged = context.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](merged)


class ResidualLayer(nn.Module):
    """
    A residual connection combined with a layer norm.
    Note for code simplicity the norm is applied first (to the sublayer
    input) as opposed to last.
    """

    def __init__(self, size, dropout=0.1, eps=1e-6):
        super().__init__()
        self.norm = torch.nn.LayerNorm(size, eps)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer):
        """Apply residual connection to any sublayer with the same size."""
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update


class PositionwiseFeedForward(nn.Module):
    """Implements the FFN equation: linear -> ReLU -> dropout -> linear."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = torch.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)


class SimpleIDEmbeddings(nn.Module):
    """Token-id lookup table whose output is scaled by ``sqrt(dim_hidden)``."""

    def __init__(self, size_vocab, dim_hidden, id_pad):
        super().__init__()
        self.lut = nn.Embedding(size_vocab, dim_hidden, padding_idx=id_pad)
        self.dim_hidden = dim_hidden

    def forward(self, x):
        scale = math.sqrt(self.dim_hidden)
        return self.lut(x) * scale

    def get_shape(self):
        """Return ``(vocabulary size, embedding width)`` of the lookup table."""
        table = self.lut
        return (table.num_embeddings, table.embedding_dim)


class PositionalEncoding(nn.Module):
    """Implement the PE function (fixed sinusoidal position encodings).

    The table of shape ``(1, max_len, dim_positional)`` is registered as the
    buffer ``pe``. It is built on the CPU and follows the module through
    ``.to(...)`` / ``.cuda()`` like any other buffer, so the class no longer
    requires a CUDA device at construction time.

    Args:
        dim_positional: Width of the encoding (must match the embedding width).
        max_len: Number of positions precomputed; inputs passed to
            ``forward`` must not be longer than this.
    """

    def __init__(self, dim_positional, max_len=512):
        super(PositionalEncoding, self).__init__()

        # Compute the positional encodings once in log space.
        self.dim_positional = dim_positional
        pe = torch.zeros(max_len, dim_positional)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, dim_positional, 2) * -(math.log(10000.0) / dim_positional)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For an odd width there is one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(angles)[:, : dim_positional // 2]
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Add the encodings of the first ``x.size(1)`` positions to ``x``."""
        # Buffers never require grad, so no explicit requires_grad_(False) is needed.
        return x + self.pe[:, : x.size(1)]


class SimpleEmbedder(nn.Module):    # no segment embedder as we do not need that
    """Token embedding pipeline: id lookup -> positional encoding -> dropout."""

    def __init__(self, size_vocab=None, dim_hidden=128, dropout=0.1, id_pad=0):
        super().__init__()
        self.size_vocab = size_vocab
        self.dim_hidden = dim_hidden
        self.id_pad = id_pad

        stages = [
            SimpleIDEmbeddings(size_vocab, dim_hidden, id_pad),
            PositionalEncoding(dim_hidden),
            nn.Dropout(p=dropout),
        ]
        self.embedder = nn.Sequential(*stages)

    def forward(self, ids_input):   # (batch, seqs_with_padding)
        embedded = self.embedder(ids_input)
        return embedded

    def get_vocab_size(self):
        """Return the vocabulary size given at construction."""
        return self.size_vocab

### core.py ###



class SimpleEncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward block.

    Each of the two sub-layers is wrapped in its own ``ResidualLayer``.

    Args:
        dim_hidden: Model width.
        dim_feedforward: Inner width of the feed-forward block.
        n_head: Number of attention heads.
        dropout: Dropout probability used by every sub-layer, including the
            attention weights (previously the attention sub-layer ignored
            this argument and always used its own default).
    """

    def __init__(self, dim_hidden, dim_feedforward, n_head, dropout=0.1):
        super(SimpleEncoderLayer, self).__init__()

        self.n_head = n_head
        self.dim_hidden = dim_hidden
        self.dim_feedforward = dim_feedforward

        self.layer_attention = MultiHeadedAttention(n_head, dim_hidden, dropout)
        self.layer_feedforward = PositionwiseFeedForward(dim_hidden, dim_feedforward, dropout)
        self.layers_residual = clones(ResidualLayer(dim_hidden, dropout), 2)
    # end

    def forward(self, embeddings, masks, *args):
        """Run self-attention (restricted by ``masks``) then the feed-forward block.

        Extra positional arguments are accepted and ignored so encoder and
        decoder layers can be called with the same argument list.
        """
        def self_attention(normed):
            # Query, key and value are all the (normalised) layer input.
            return self.layer_attention(normed, normed, normed, masks)

        embeddings = self.layers_residual[0](embeddings, self_attention)
        return self.layers_residual[1](embeddings, self.layer_feedforward)
    # end
# end



class SimpleDecoderLayer(nn.Module):
    """One decoder layer: self-attention, encoder attention, feed-forward.

    Each of the three sub-layers is wrapped in its own ``ResidualLayer``.

    Args:
        dim_hidden: Model width.
        dim_feedforward: Inner width of the feed-forward block.
        n_head: Number of attention heads.
        dropout: Dropout probability used by every sub-layer, including both
            attention blocks (previously the attention blocks ignored this
            argument and always used their own default).
    """

    def __init__(self, dim_hidden, dim_feedforward, n_head, dropout=0.1):
        super(SimpleDecoderLayer, self).__init__()

        self.n_head = n_head
        self.dim_hidden = dim_hidden
        self.dim_feedforward = dim_feedforward

        self.layer_attention_decoder = MultiHeadedAttention(n_head, dim_hidden, dropout)
        self.layer_attention_encoder = MultiHeadedAttention(n_head, dim_hidden, dropout)
        self.layer_feedforward = PositionwiseFeedForward(dim_hidden, dim_feedforward, dropout)
        self.layers_residual = clones(ResidualLayer(dim_hidden, dropout), 3)

    def forward(self, embeddings, masks_encoder, output_encoder, masks_decoder, *args):
        """Decode one layer.

        Args:
            embeddings: Decoder-side hidden states.
            masks_encoder: Mask applied when attending over ``output_encoder``.
            output_encoder: Encoder output used as key and value of the
                second attention block.
            masks_decoder: Mask applied to the decoder self-attention.
        """
        def self_attention(normed):
            return self.layer_attention_decoder(normed, normed, normed, masks_decoder)

        def encoder_attention(normed):
            # Queries come from the decoder, keys/values from the encoder output.
            return self.layer_attention_encoder(normed, output_encoder, output_encoder, masks_encoder)

        embeddings = self.layers_residual[0](embeddings, self_attention)
        embeddings = self.layers_residual[1](embeddings, encoder_attention)
        return self.layers_residual[2](embeddings, self.layer_feedforward)
    # end
# end


class SimpleTransformerStack(nn.Module):
    """A stack of ``n_layers`` deep copies of one layer, followed by a LayerNorm.

    The same class serves as encoder and decoder stack; ``forward`` picks the
    decoder embeddings only when all decoder-side inputs are provided.
    """

    def __init__(self, obj_layer, n_layers):
        super().__init__()
        self.layers = clones(obj_layer, n_layers)

        self.norm = torch.nn.LayerNorm(obj_layer.dim_hidden)

    def forward(self, embedding_encoder=None, masks_encoder=None, output_encoder=None, embedding_decoder=None, masks_decoder=None, noncache=False, **kwargs):  # input -> (batch, len_seq, vocab)
        # Decoder mode needs the encoder output, the decoder embeddings and the decoder mask.
        decoding = not (
            output_encoder is None
            or embedding_decoder is None
            or masks_decoder is None
        )
        hidden = embedding_decoder if decoding else embedding_encoder

        # Every layer receives the same four positional arguments.
        for layer in self.layers:
            hidden = layer(hidden, masks_encoder, output_encoder, masks_decoder)

        return self.norm(hidden)


class SimpleEncoderDecoder(nn.Module):
    """Encoder with an optional decoder, each preceded by its own embedder.

    Args:
        encoder: Stack applied to the embedded encoder ids.
        decoder: Stack applied to the embedded decoder ids, or ``None`` for an
            encoder-only model.
        embedder_encoder: Embedder for ``ids_encoder``.
        embedder_decoder: Embedder for ``ids_decoder``, or ``None``.
        pooling: When true, the decoder does not see the per-token encoder
            output but its mean over the sequence axis, repeated for every
            position.
    """

    def __init__(self, encoder, decoder, embedder_encoder, embedder_decoder, pooling=False):
        super(SimpleEncoderDecoder, self).__init__()

        self.pooling = pooling

        self.embedder_encoder = embedder_encoder
        self.encoder = encoder

        self.embedder_decoder = embedder_decoder
        self.decoder = decoder
    # end

    def forward(self, ids_encoder=None, masks_encoder=None, ids_decoder=None, masks_decoder=None, **kwargs):
        """Encode, optionally pool, optionally decode.

        Returns:
            A dict with the keys ``output_encoder``, ``output_encoder_pooled``
            (``None`` unless pooling is enabled) and ``output_decoder``
            (``None`` for an encoder-only model).
        """
        output_encoder_pooled = None
        output_decoder = None

        output_encoder = self.embed_and_encode(ids_encoder=ids_encoder, masks_encoder=masks_encoder)
        output = output_encoder

        if self.pooling:
            # Zero the positions the mask excludes, then average over the sequence axis.
            # NOTE(review): the mean divides by the full sequence length, not by the
            # number of unmasked positions - confirm this is intended.
            output_encoder_refilled = output_encoder.masked_fill(masks_encoder.transpose(-1, -2) == False, 0)
            output_encoder_pooled = torch.mean(output_encoder_refilled, dim=-2)

            output_encoder_pooled_expanded = output_encoder_pooled.unsqueeze(-2).expand(output_encoder.shape)
            output = output_encoder_pooled_expanded
        # end

        # Explicit None checks: module truthiness is not a reliable "is configured" test.
        if self.embedder_decoder is not None and self.decoder is not None:
            output_decoder = self.embed_and_decode(ids_decoder=ids_decoder, masks_encoder=masks_encoder, output_encoder=output, masks_decoder=masks_decoder)
        # end if

        # Debug trace. torch.cuda.current_device() raises on hosts without CUDA,
        # so only query it when a CUDA runtime is available.
        device_current = torch.cuda.current_device() if torch.cuda.is_available() else None
        print('[{}] in SimpleEncoderDecoder.forward: output {}, current {}'.format(threading.get_native_id(), output.device, device_current))

        return {'output_encoder': output_encoder, 'output_encoder_pooled': output_encoder_pooled, 'output_decoder': output_decoder}
    # end

    def embed_and_encode(self, ids_encoder=None, masks_encoder=None, **kwargs):
        """Embed ``ids_encoder`` and run the encoder stack over the result."""
        embedding_encoder = self.embedder_encoder(ids_encoder)
        output_encoder = self.encoder(
            embedding_encoder=embedding_encoder,
            masks_encoder=masks_encoder,
        )

        return output_encoder
    # end

    def embed_and_decode(self, ids_decoder=None, masks_encoder=None, output_encoder=None, masks_decoder=None, **kwargs):
        """Embed ``ids_decoder`` and run the decoder stack against ``output_encoder``."""
        embedding_decoder = self.embedder_decoder(ids_decoder)
        output_decoder = self.decoder(
            masks_encoder=masks_encoder,
            output_encoder=output_encoder,    #(len_seq, dim_hidden) -> (1, dim_hidden)
            embedding_decoder=embedding_decoder,
            masks_decoder=masks_decoder,
        )

        return output_decoder
    # end

    def get_vocab_size(self, name_embedder):
        """Return the vocabulary size of ``embedder_<name_embedder>``."""
        embedder = getattr(self, f'embedder_{name_embedder}')
        return embedder.get_vocab_size()
    # end

# end

class LinearAndNorm(nn.Module):
    """Linear projection -> ReLU -> LayerNorm -> dropout."""

    def __init__(self, dim_in = None, dim_out = None, dropout=0.1, eps_norm=1e-12):
        super().__init__()

        self.linear = torch.nn.Linear(dim_in, dim_out)
        self.norm = torch.nn.LayerNorm(dim_out, eps_norm)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, seqs_in):
        activated = torch.relu(self.linear(seqs_in))
        normalized = self.norm(activated)
        return self.dropout(normalized)




class Batch:
    """Container that moves the tensors of one batch to the compute device.

    Keyword arguments whose value is ``None`` or a ``bool`` are dropped; every
    other value is expected to be a tensor and is moved to the current CUDA
    device when CUDA is available, and left on the CPU otherwise (the
    previous unconditional ``.cuda()`` failed on hosts without CUDA).

    Calling the instance returns the dict of moved tensors, ready to be
    splatted into a model call (``model(**batch())``).
    """

    def __init__(self, **kwargs):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.kwargs = {}
        for k, v in kwargs.items():
            if v is not None and type(v) is not bool:
                self.kwargs[k] = v.to(device)
            # end
        # end
    # end

    def __call__(self):
        return self.kwargs
    # end
# end



class Collator_Base:
    """Shared padding / masking helpers for the collators.

    Reads the ids of the ``[PAD]``, ``[MASK]``, ``[CLS]``, ``[SEP]`` and
    ``[UNK]`` tokens from the tokenizer's special-token lists.
    """

    def __init__(self, tokenizer, size_seq_max, need_masked=0.3):
        self.tokenizer = tokenizer
        self.size_seq_max = size_seq_max
        self.need_masked = need_masked

        special_ids = dict(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))

        self.id_pad = special_ids['[PAD]']
        self.id_mask = special_ids['[MASK]']
        self.id_cls = special_ids['[CLS]']
        self.id_sep = special_ids['[SEP]']
        self.id_unk = special_ids['[UNK]']

        self.regex_special_token = re.compile(r'\[(PAD|MASK|CLS|SEP|EOL|UNK)\]')

    def _preprocess(self, line):
        """Neutralise special-token literals and tidy quotes, dots and spaces."""
        # "[CLS]" -> "<CLS>" so raw text cannot inject special tokens.
        line = self.regex_special_token.sub(r'<\1>', line)
        # Drop doubled quote characters, 2-3 dot runs, and collapse space runs.
        line = re.sub(r'''('|"|`){2}''', '', line)
        line = re.sub(r'\.{2,3}', '', line)
        line = re.sub(r' {2,}', ' ', line)
        return line.strip()

    # return masks_attention?, return masks_segment?
    def pad_sequences(self, sequences, size_seq_max, need_diagonal=False,
                      need_masked=0):  # need_diagonal and need_masked cannot both set, one for bert seq one for s2s seq
        """Pad id sequences to ``size_seq_max`` and build their masks.

        Returns ``(inputs, masks_attention, masks_segment, labels)``. With
        ``need_masked`` set, ``inputs`` has a random share of the inner tokens
        (first and last position excluded) replaced by the mask id, the
        attention mask additionally hides those positions, and ``labels`` are
        the unmasked padded ids; otherwise ``labels`` is ``None``.
        """
        id_pad, id_mask = self.id_pad, self.id_mask

        rows_plain = []
        rows_masked = []

        for tokens in sequences:
            n_tokens = len(tokens)
            ids = torch.LongTensor(tokens)
            padding = torch.LongTensor([id_pad] * (size_seq_max - n_tokens))
            rows_plain.append(torch.cat((ids, padding)))

            if not need_masked:
                continue

            # Pick random inner positions; at least one index is requested.
            candidates = list(range(1, n_tokens - 1))
            random.shuffle(candidates)
            n_picked = int(need_masked * (n_tokens - 2)) or 1
            picked = torch.LongTensor(candidates[:n_picked])

            ids_masked = ids.detach().clone()
            ids_masked.index_fill_(0, picked, id_mask)
            rows_masked.append(torch.cat((ids_masked, padding)))

        inputs = torch.stack(rows_plain)  # (batch, size_seq_max)

        masks_segment = (inputs != self.id_pad).unsqueeze(-2)  # (nbatch, 1, seq)
        if need_diagonal:
            masks_attention = self.make_std_mask(inputs, self.id_pad)
        else:
            masks_attention = masks_segment

        if not need_masked:
            return inputs, masks_attention, masks_segment, None

        inputs_masked_padded = torch.stack(rows_masked)
        masks_masked = (inputs_masked_padded != id_mask).unsqueeze(-2)
        # (inputs, masks_attention, masks_segment, labels)
        return inputs_masked_padded, masks_attention & masks_masked, masks_segment, inputs

    def subsequent_mask(self, size):
        """Mask out subsequent positions."""
        upper = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.uint8)
        return upper == 0

    def make_std_mask(self, tgt, pad):
        """Create a mask to hide padding and future words."""
        not_pad = (tgt != pad).unsqueeze(-2)
        causal = self.subsequent_mask(tgt.size(-1)).type_as(not_pad.data)
        return not_pad & causal



class Collator_BERT(Collator_Base):
    """Collate raw text lines into one ``Batch`` of encoder/decoder tensors.

    Consecutive lines are packed greedily into samples of at most
    ``size_seq_max`` tokens (including [CLS] and [SEP]), so the number of
    samples in the batch can be smaller than the number of input lines.
    """

    def __call__(self, list_sequence_batch):
        cleaned = [self._preprocess(sequence) for sequence in list_sequence_batch]  # remove special tokens

        encoded = self.tokenizer.batch_encode_plus(cleaned, add_special_tokens=False)['input_ids']

        # Process I. Greedily group consecutive lines while they still fit.
        groups = []
        group_current = []
        len_accumulated = 2  # room for cls and sep

        for tokenized in encoded:
            len_current = len(tokenized)
            overflows = len_accumulated + len_current > self.size_seq_max

            # Close the running group only if it already holds something;
            # a single over-long line stays alone and is truncated below.
            if overflows and group_current:
                groups.append(group_current)
                group_current = []
                len_accumulated = 2

            group_current.append(tokenized)
            len_accumulated += len_current

        groups.append(group_current)

        # Process II. Flatten each group and cut it to the token budget.
        budget = self.size_seq_max - 2
        list_tokenized_merged = []
        for group in groups:
            flat = []
            for tokenized in group:
                flat.extend(tokenized)
            list_tokenized_merged.append(flat[:budget])

        # Process III. Add begin and stop special token, same as jinyuj_transformers_quora.ipynb
        tokens_input_encoder = [[self.id_cls] + merged + [self.id_sep] for merged in list_tokenized_merged]
        tokens_input_decoder = [[self.id_cls] + merged for merged in list_tokenized_merged]
        tokens_label_decoder = [merged + [self.id_sep] for merged in list_tokenized_merged]

        inputs_encoder, masks_encoder, segments_encoder, labels_encoder = self.pad_sequences(
            tokens_input_encoder, self.size_seq_max, need_masked=self.need_masked)
        inputs_decoder, masks_decoder, segments_decoder, _ = self.pad_sequences(
            tokens_input_decoder, self.size_seq_max, need_diagonal=True)
        labels_decoder, masks_label, segments_label, _ = self.pad_sequences(
            tokens_label_decoder, self.size_seq_max)

        return Batch(
            ids_encoder=inputs_encoder,  # contains [mask]s
            masks_encoder=masks_encoder,
            labels_encoder=labels_encoder,  # doesn't contain [mask]
            segments_encoder=segments_encoder,
            ids_decoder=inputs_decoder,
            masks_decoder=masks_decoder,
            labels_decoder=labels_decoder,
            segments_label=segments_label
        )


In [2]:
import spacy


def BookCorpus2000(split=0.1):
    """Load ``bookcorpus_2000.json`` and split it randomly into train / eval.

    The file is read from the current working directory. ``split`` is the
    fraction of entries that goes to the evaluation list.

    Returns:
        ``(train_entries, eval_entries, None)``.
    """
    with open('bookcorpus_2000.json', 'r') as file:
        list_corpus = json.load(file)

    n_total = len(list_corpus)
    order = list(range(n_total))
    random.shuffle(order)

    # The first `n_eval` shuffled indices form the evaluation set.
    n_eval = int(split * n_total)
    list_corpus_eval = [list_corpus[i] for i in order[:n_eval]]
    list_corpus_train = [list_corpus[i] for i in order[n_eval:]]

    return list_corpus_train, list_corpus_eval, None



def BookCorpus(split=0.0001, used=-1):
    """Load the ``bookcorpus`` dataset and split it randomly into train / eval.

    Args:
        split: Fraction of the used lines that goes to the evaluation list.
        used: Number of leading lines of the corpus to use. A negative value
            (the default) or ``None`` means "use every line"; previously the
            default ``-1`` was applied as the slice ``[:-1]`` and silently
            dropped the last line.

    Returns:
        ``(train_lines, eval_lines, None)``.
    """
    import datasets

    texts = datasets.load_dataset('bookcorpus')['train']['text']   # 70,000,000, 70 Million
    list_corpus = texts if used is None or used < 0 else texts[:used]

    n_total = len(list_corpus)
    indexs_all = list(range(n_total))
    random.shuffle(indexs_all)

    # The first `index_split` shuffled indices form the evaluation set.
    index_split = int(split * n_total)

    indexs_eval = indexs_all[:index_split]
    indexs_train = indexs_all[index_split:]

    list_corpus_eval = [list_corpus[i_e] for i_e in indexs_eval]
    list_corpus_train = [list_corpus[i_t] for i_t in indexs_train]

    return list_corpus_train, list_corpus_eval, None
# end

In [3]:
class SimpleEncoderHead_MLM(nn.Module):
    """Masked-language-model head on top of the encoder output.

    ``forward`` produces vocabulary logits; ``compute_loss`` turns them into
    a loss plus accuracy counters. The debug traces print the current CUDA
    device only when CUDA is available, so the head also runs on CPU hosts.
    """

    @classmethod
    def get_info_accuracy_template(cls):
        """Return a zeroed accuracy record for this head."""
        return Dotdict({
            'corrects_segmented': 0,
            'corrects_masked': 0,
            'num_segmented': 0,
            'num_masked': 0
        })
    # end

    def __init__(self, model, size_vocab, dim_hidden=128, dropout=0.1):
        super(SimpleEncoderHead_MLM, self).__init__()

        self.ffn = LinearAndNorm(dim_in=dim_hidden, dim_out=dim_hidden, dropout=dropout)
        self.extractor = torch.nn.Linear(dim_hidden, size_vocab, bias=False)
        # NOTE(review): this wraps the encoder embedding table in a *new*
        # Parameter, so the optimizer sees two parameter objects for it -
        # confirm this is the intended form of weight tying.
        self.extractor.weight = nn.Parameter(model.embedder_encoder.embedder[0].lut.weight)

        self.func_loss = torch.nn.CrossEntropyLoss().cuda()
    # end

    def forward(self, output_encoder=None, labels_encoder=None, segments_encoder=None, masks_encoder=None, **kwargs):   # labels_input -> (batch, seq, labels)
        """Project the encoder output to vocabulary logits.

        Returns a dict holding the logits under ``output`` together with the
        labels and masks passed through unchanged, i.e. exactly the keyword
        arguments ``compute_loss`` expects.
        """
        # torch.cuda.current_device() raises without CUDA; query it only when available.
        device_current = torch.cuda.current_device() if torch.cuda.is_available() else None
        print('[{}] in SimpleEncoderHead_MLM.forward: {}, {}, current {}'.format(threading.get_native_id(), output_encoder.device, output_encoder.shape, device_current))
        output_ffn = self.ffn(output_encoder)
        output_mlm = self.extractor(output_ffn) # output_mlm = prediction_logits

        return {'output': output_mlm, 'labels_encoder': labels_encoder, 'segments_encoder': segments_encoder, 'masks_encoder': masks_encoder}

    def compute_loss(self, output=None, labels_encoder=None, segments_encoder=None, masks_encoder=None):
        """Compute the MLM loss and accuracy counters.

        The loss is the cross entropy over all positions selected by
        ``segments_encoder`` plus three times the cross entropy over the
        masked positions only (positions selected by ``segments_encoder`` but
        not by ``masks_encoder``).

        Returns:
            ``(loss, info_acc)`` where ``info_acc`` follows
            ``get_info_accuracy_template``.
        """
        output_mlm = output
        labels_mlm = labels_encoder

        info_acc = SimpleEncoderHead_MLM.get_info_accuracy_template()

        device_current = torch.cuda.current_device() if torch.cuda.is_available() else None
        print('[{}] SimpleEncoderHead_MLM get loss: {}, {}, current {}, labels_mlm[0]: {}, labels_mlm.shape: {}, output_mlm.shape: {}'.format(
            threading.get_native_id(), output_mlm.device, self.ffn.linear.weight.device, device_current, labels_mlm[0][0], labels_mlm.shape, output_mlm.shape))

        # Drop the broadcast axis of the segment mask so it indexes (batch, seq).
        segments_encoder_2d = segments_encoder.transpose(-1,-2)[:,:,0]
        hidden_mlm_segmented = output_mlm.masked_select(segments_encoder_2d.unsqueeze(-1)).reshape(-1, output_mlm.shape[-1]) # should be (segmented_all_batchs, size_vocab)
        labels_segmented = labels_mlm.masked_select(segments_encoder_2d)

        loss_segments = self.func_loss(hidden_mlm_segmented, labels_segmented)
        info_acc.corrects_segmented = torch.sum(hidden_mlm_segmented.argmax(-1) == labels_segmented).cpu().item()
        info_acc.num_segmented = hidden_mlm_segmented.shape[0]

        masks_masked = torch.logical_xor(masks_encoder, segments_encoder) & segments_encoder # True is masked
        masks_masked_perbatch = masks_masked[:,0,:]
        hidden_mlm_masked = output_mlm.masked_select(masks_masked_perbatch.unsqueeze(-1)).reshape(-1, output_mlm.shape[-1])

        if hidden_mlm_masked.shape[0] != 0:
            labels_masked = labels_mlm.masked_select(masks_masked_perbatch)
            loss_masked = self.func_loss(hidden_mlm_masked, labels_masked)
            info_acc.corrects_masked = torch.sum(hidden_mlm_masked.argmax(-1) == labels_masked).cpu().item()
            info_acc.num_masked = hidden_mlm_masked.shape[0]
        else:
            # No masked position in this batch: contribute no loss and report a
            # count of 1 (kept from the original code).
            loss_masked = 0
            info_acc.corrects_masked = 0
            info_acc.num_masked = 1
        # end

        loss_mlm = loss_segments + loss_masked * 3

        return loss_mlm, info_acc
    # end
# end

In [4]:
class SimpleDecoderHead_S2S(nn.Module):
    """Sequence-to-sequence head on top of the decoder output.

    ``forward`` produces vocabulary logits; ``compute_loss`` turns them into
    a loss plus accuracy counters. The debug traces print the current CUDA
    device only when CUDA is available, so the head also runs on CPU hosts.
    """

    @classmethod
    def get_info_accuracy_template(cls):
        """Return a zeroed accuracy record for this head."""
        return Dotdict({
            'corrects_segmented': 0,
            'num_segmented': 0
        })
    # end

    def __init__(self, model, size_vocab, dim_hidden=128, dropout=0.1):
        super(SimpleDecoderHead_S2S, self).__init__()

        self.ffn = LinearAndNorm(dim_in=dim_hidden, dim_out=dim_hidden, dropout=dropout)
        self.extractor = torch.nn.Linear(dim_hidden, size_vocab, bias=False)
        # NOTE(review): this wraps the decoder embedding table in a *new*
        # Parameter, so the optimizer sees two parameter objects for it -
        # confirm this is the intended form of weight tying.
        self.extractor.weight = nn.Parameter(model.embedder_decoder.embedder[0].lut.weight)

        self.func_loss = torch.nn.CrossEntropyLoss().cuda()
    # end

    def forward(self, output_decoder=None, labels_decoder=None, segments_label=None, **kwargs):   # labels_input -> (batch, seq, labels)
        """Project the decoder output to vocabulary logits.

        Returns a dict holding the logits under ``output`` together with the
        labels and label mask passed through unchanged, i.e. exactly the
        keyword arguments ``compute_loss`` expects.
        """
        # torch.cuda.current_device() raises without CUDA; query it only when available.
        device_current = torch.cuda.current_device() if torch.cuda.is_available() else None
        print('[{}] in SimpleDecoderHead_S2S.forward: {}, {}, current {}, labels_s2s[0]: {}, labels_s2s.shape: {}'.format(
            threading.get_native_id(), output_decoder.device, self.ffn.linear.weight.device, device_current, labels_decoder[0][0], labels_decoder.shape))

        output_ffn = self.ffn(output_decoder)
        output_s2s = self.extractor(output_ffn)   # output_mlm = prediction_logits

        return {'output': output_s2s, 'labels_decoder': labels_decoder, 'segments_label': segments_label}
    # end

    def compute_loss(self, output=None, labels_decoder=None, segments_label=None):
        """Compute the S2S loss and accuracy counters.

        The loss is four times the cross entropy over the positions selected
        by ``segments_label``.

        Returns:
            ``(loss, info_acc)`` where ``info_acc`` follows
            ``get_info_accuracy_template``.
        """
        output_s2s = output
        labels_s2s = labels_decoder

        info_acc = SimpleDecoderHead_S2S.get_info_accuracy_template()

        # Drop the broadcast axis of the label mask so it indexes (batch, seq).
        segments_label_2d = segments_label.transpose(-1,-2)[:,:,0]
        hidden_s2s_segmented = output_s2s.masked_select(segments_label_2d.unsqueeze(-1)).reshape(-1, output_s2s.shape[-1])
        labels_segmented = labels_s2s.masked_select(segments_label_2d)

        loss_segments = self.func_loss(hidden_s2s_segmented, labels_segmented)
        device_current = torch.cuda.current_device() if torch.cuda.is_available() else None
        print('[{}] SimpleDecoderHead_S2S get loss, cuda {}, loss {}, labels_s2s[0]: {}, labels.shape: {}, output.shape: {}'.format(
            threading.get_native_id(), device_current, loss_segments.device, labels_s2s[0][0], labels_s2s.shape, output_s2s.shape))
        info_acc.corrects_segmented = torch.sum(hidden_s2s_segmented.argmax(-1) == labels_segmented).cpu().item()
        info_acc.num_segmented = hidden_s2s_segmented.shape[0]

        return loss_segments * 4, info_acc
    # end

# end

In [5]:
class Trainer(nn.Module):
    """Bundle a backbone model with any number of task heads.

    Heads are stored as sub-modules named after their class, so at most one
    head per class can be registered.
    """

    def __init__(self, model):
        super(Trainer, self).__init__()
        self.index_name_head = set()
        self.model = model
    # end

    def register(self, head):
        """Attach ``head`` under its class name and return ``self`` for chaining."""
        name_head = head.__class__.__name__
        setattr(self, name_head, head)
        self.index_name_head.add(name_head)
        return self
    # end

    def forward(self, **kwargs):
        """Run the model, then every registered head.

        Each head receives the model outputs merged with the original keyword
        arguments (the original arguments win on a key clash).

        Returns:
            ``{head_class_name: head_output}``.
        """
        output_model = self.model(**kwargs)
        dict_head_output = {}

        for name in self.index_name_head:
            head = getattr(self, name)
            dict_head_output[name] = head.forward(**{**output_model, **kwargs})
        # end

        return dict_head_output
    # end

    def get_head(self, name_klass):
        """Return a registered head, looked up by class name or by class.

        Passing a class used to raise ``NameError`` because of a reference to
        an undefined variable; the class name is now taken from the argument.
        """
        if isinstance(name_klass, type):
            name_klass = name_klass.__name__
        # end

        return getattr(self, name_klass)
    # end
# end

In [6]:
class Builder:
    """Factory methods that assemble a ``Trainer`` (model plus heads)."""

    @classmethod
    def build_model_with_mlm_v2(cls, size_vocab, dim_hidden, dim_feedforward, n_head, n_layer):
        """Build an encoder-only model with the MLM head registered."""
        embedder_encoder = SimpleEmbedder(size_vocab=size_vocab, dim_hidden=dim_hidden)
        sample_encoder = SimpleEncoderLayer(dim_hidden, dim_feedforward, n_head)
        encoderstack = SimpleTransformerStack(sample_encoder, n_layer)

        model = SimpleEncoderDecoder(encoderstack, None, embedder_encoder, None)
        head_mlm = SimpleEncoderHead_MLM(model, size_vocab, dim_hidden)

        trainer = Trainer(model).register(head_mlm)

        return trainer
    # end

    @classmethod
    def build_model_with_s2s_v2(cls, size_vocab, dim_hidden, dim_feedforward, n_head, n_layer):
        """Build a pooled encoder-decoder model with the S2S head registered.

        The head is registered directly on the ``Trainer``, like in the other
        builders; the previous code went through a ``HeadManager`` class and a
        ``manager=`` argument that ``Trainer`` does not accept.
        """
        embedder_encoder = SimpleEmbedder(size_vocab=size_vocab, dim_hidden=dim_hidden)
        sample_encoder = SimpleEncoderLayer(dim_hidden, dim_feedforward, n_head)
        encoderstack = SimpleTransformerStack(sample_encoder, n_layer)

        embedder_decoder = SimpleEmbedder(size_vocab=size_vocab, dim_hidden=dim_hidden)
        sample_decoder = SimpleDecoderLayer(dim_hidden, dim_feedforward, n_head)
        decoderstack = SimpleTransformerStack(sample_decoder, n_layer)

        model = SimpleEncoderDecoder(encoderstack, decoderstack, embedder_encoder, embedder_decoder, pooling=True)
        head_s2s = SimpleDecoderHead_S2S(model, size_vocab, dim_hidden)

        trainer = Trainer(model).register(head_s2s)

        return trainer
    # end

    @classmethod
    def build_model_with_2heads(cls, size_vocab, dim_hidden, dim_feedforward, n_head, n_layer):
        """Build a pooled encoder-decoder model with both MLM and S2S heads."""
        embedder_encoder = SimpleEmbedder(size_vocab=size_vocab, dim_hidden=dim_hidden)
        sample_encoder = SimpleEncoderLayer(dim_hidden, dim_feedforward, n_head)
        encoderstack = SimpleTransformerStack(sample_encoder, n_layer)

        embedder_decoder = SimpleEmbedder(size_vocab=size_vocab, dim_hidden=dim_hidden)
        sample_decoder = SimpleDecoderLayer(dim_hidden, dim_feedforward, n_head)
        decoderstack = SimpleTransformerStack(sample_decoder, n_layer)

        model = SimpleEncoderDecoder(encoderstack, decoderstack, embedder_encoder, embedder_decoder, pooling=True)
        head_s2s = SimpleDecoderHead_S2S(model, size_vocab, dim_hidden)
        head_mlm = SimpleEncoderHead_MLM(model, size_vocab, dim_hidden)

        trainer = Trainer(model).register(head_mlm).register(head_s2s)
        return trainer
    # end

# end

In [7]:
import re
import json
import transformers
from torch.utils.data import DataLoader, Dataset
from torchtext.data.functional import to_map_style_dataset
from transformers import AutoTokenizer


# ---- Run configuration and one-time setup for the training cells below ----

# CUDA device ids handed to DataParallel; the per-step batch size is
# multiplied by their number.
# GPUS = [0]
GPUS = [0,1]

epochs = 10

# source
seq_max = 128
batch_size = 64


# model & head
dim_hidden = 512
dim_feedforward = 512
n_head = 8
n_layer = 8

# optimizer
# NOTE(review): these three values are not read by the optimizer built below,
# which uses literals (and a different eps) - confirm which is intended.
lr_base_optimizer = 1e-4
betas_optimizer = (0.9, 0.999)
eps_optimizer = 1e-9

# scheduler
# NOTE(review): not referenced by the scheduler created in this cell.
warmup = 200

# Data: random 90/10 train/eval split, BERT tokenizer, packing collator.
train_source, valid_source, _ = BookCorpus2000(split=0.1)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") 
collator = Collator_BERT(tokenizer, seq_max)

# The collator packs several lines into one sample, so the number of samples
# per step can be smaller than the DataLoader batch size.
dataloader_train = DataLoader(train_source, batch_size*len(GPUS), shuffle=False, collate_fn=collator)
dataloader_eval = DataLoader(valid_source, 1, shuffle=False, collate_fn=collator)

trainer = Builder.build_model_with_2heads(tokenizer.vocab_size, dim_hidden, dim_feedforward, n_head, n_layer)

# Re-initialise every parameter with more than one dimension (weight
# matrices and embedding tables) with Xavier-uniform values.
for p in trainer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
    # end
# end

# Move to the GPU first, then wrap for multi-GPU data parallelism.
trainer = trainer.to('cuda')
trainer = torch.nn.DataParallel(trainer, device_ids=GPUS)

optimizer = torch.optim.Adam(trainer.parameters(), lr=1e-4, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False)
# Each scheduler step multiplies the learning rate by decayRate.
decayRate = 0.96
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)


print()




In [8]:
def train_a_batch(batch, trainer, optimizer=None, scheduler=None):
    """Run one training step over a single batch for both heads.

    Args:
        batch: callable returning a mapping of keyword arguments that is
            unpacked into the trainer's forward pass.
        trainer: the model, either wrapped in ``torch.nn.DataParallel`` or
            used directly; it must expose ``get_head(name)`` (on ``.module``
            when wrapped).
        optimizer: optional optimizer; when given it is stepped and its
            gradients are cleared after the backward pass. When omitted,
            gradients are left accumulated on the parameters.
        scheduler: optional scheduler stepped once per batch when given.

    Returns:
        Tuple ``(loss_value, info)`` where ``loss_value`` is the Python number
        of the summed MLM + S2S loss and ``info`` is a ``Dotdict`` with the
        per-head accuracy info under the keys ``'mlm'`` and ``'s2s'``.
    """
    # Call the module itself (not .forward) so registered hooks are honoured.
    dict_head_output = trainer(**batch())

    print('[{}], cuda {}, mlm output: {}'.format(threading.get_native_id(), torch.cuda.current_device(), dict_head_output['SimpleEncoderHead_MLM']['output'].shape))

    # DataParallel keeps the wrapped model under `.module`; fall back to the
    # trainer itself so the function also works without the wrapper.
    core = getattr(trainer, 'module', trainer)

    loss_mlm, info_acc_mlm = core.get_head('SimpleEncoderHead_MLM').compute_loss(**dict_head_output['SimpleEncoderHead_MLM'])
    loss_s2s, info_acc_s2s = core.get_head('SimpleDecoderHead_S2S').compute_loss(**dict_head_output['SimpleDecoderHead_S2S'])

    # Joint objective: plain sum of the two head losses.
    loss_all = loss_mlm + loss_s2s
    loss_all_value = loss_all.item()

    loss_all.backward()

    if optimizer is not None:
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
    # end

    if scheduler is not None:
        scheduler.step()
    # end

    return loss_all_value, Dotdict({'mlm': info_acc_mlm, 's2s': info_acc_s2s})
# end


def evaluate_a_batch(batch, trainer, *args, **kwargs):
    """Evaluate a single batch for both heads without tracking gradients.

    Args:
        batch: callable returning a mapping of keyword arguments that is
            unpacked into the trainer's forward pass.
        trainer: the model, either wrapped in ``torch.nn.DataParallel`` or
            used directly; it must expose ``get_head(name)`` (on ``.module``
            when wrapped).
        *args, **kwargs: accepted and ignored, so the function can be called
            with the same arguments as ``train_a_batch``.

    Returns:
        Tuple ``(loss_value, info)`` where ``loss_value`` is the Python number
        of the mean of the S2S and MLM losses and ``info`` is a ``Dotdict``
        with the per-head accuracy info under the keys ``'mlm'`` and ``'s2s'``.
    """
    # DataParallel keeps the wrapped model under `.module`; fall back to the
    # trainer itself so the function also works without the wrapper.
    core = getattr(trainer, 'module', trainer)

    # Forward pass *and* loss computation run without autograd bookkeeping;
    # nothing here is ever back-propagated.
    with torch.no_grad():
        # Call the module itself (not .forward) so registered hooks are honoured.
        dict_head_output = trainer(**batch())

        loss_mlm, info_acc_mlm = core.get_head('SimpleEncoderHead_MLM').compute_loss(**dict_head_output['SimpleEncoderHead_MLM'])
        loss_s2s, info_acc_s2s = core.get_head('SimpleDecoderHead_S2S').compute_loss(**dict_head_output['SimpleDecoderHead_S2S'])

        # NOTE(review): this is the mean of the two head losses, whereas
        # train_a_batch reports their sum -- the two reported losses differ
        # by a factor of two; confirm which scale is intended.
        loss_all = (loss_s2s + loss_mlm) / 2
        loss_all_value = loss_all.item()
    # end

    return loss_all_value, Dotdict({'mlm': info_acc_mlm, 's2s': info_acc_s2s})
# end

In [9]:
from datetime import datetime
from tqdm import tqdm
import time

name_checkpoint_current = None
name_checkpoint_last = None


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero.

    Keeps the epoch summary printable when a dataloader yields no batches or
    a head counted no masked / segmented positions.
    """
    return numerator / denominator if denominator else float('nan')
# end


# Epoch driver: one full training pass, an LR-scheduler tick every second
# epoch, then one full evaluation pass. Each phase prints its average loss
# and the accumulated per-head accuracies.
for e in range(epochs):

    # Fresh accuracy accumulators for this epoch's training phase.
    info_acc_heads_train = Dotdict({
        'mlm': SimpleEncoderHead_MLM.get_info_accuracy_template(),
        's2s': SimpleDecoderHead_S2S.get_info_accuracy_template()
    })

    # train phase
    trainer.train()
    losss_per_e = []
    for batch in tqdm(dataloader_train):
        # No per-batch scheduler is passed; the LR is scheduled per epoch below.
        loss_current, info_acc_heads_batch = train_a_batch(batch, trainer, optimizer, None)
        info_acc_heads_train += info_acc_heads_batch
        losss_per_e.append(loss_current)
    # end

    loss_average_per_e = _safe_ratio(sum(losss_per_e), len(losss_per_e))
    print('[{}] Epoch: {} training ends. Status: Average loss: {}, Average MLM accuracy: {}, Average S2S accuracy: {}'.format(
        datetime.utcnow(), e, loss_average_per_e,
        _safe_ratio(info_acc_heads_train.mlm.corrects_masked, info_acc_heads_train.mlm.num_masked),
        _safe_ratio(info_acc_heads_train.s2s.corrects_segmented, info_acc_heads_train.s2s.num_segmented),
    ))

    # Decay the learning rate after epochs 0, 2, 4, ...
    if e % 2 == 0:
        lr_scheduler.step() # schedule per 2 epoch
    # end

    # eval phase
    info_acc_heads_eval = Dotdict({
        'mlm': SimpleEncoderHead_MLM.get_info_accuracy_template(),
        's2s': SimpleDecoderHead_S2S.get_info_accuracy_template()
    })

    trainer.eval()
    losss_per_e = []
    for batch in tqdm(dataloader_eval):
        loss_current, info_acc_heads_batch = evaluate_a_batch(batch, trainer)
        info_acc_heads_eval += info_acc_heads_batch

        losss_per_e.append(loss_current)
    # end

    loss_average_per_e = _safe_ratio(sum(losss_per_e), len(losss_per_e))
    print('[{}] Epoch: {} Evalutation ends. Status: Average loss: {}, Average MLM accuracy: {}, Average S2S accuracy: {}'.format(
        datetime.utcnow(), e, loss_average_per_e,
        _safe_ratio(info_acc_heads_eval.mlm.corrects_masked, info_acc_heads_eval.mlm.num_masked),
        _safe_ratio(info_acc_heads_eval.s2s.corrects_segmented, info_acc_heads_eval.s2s.num_segmented),
    ))
# end


  0%|          | 0/15 [00:00<?, ?it/s]

[188261] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188261] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])
[188261] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188262] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188262] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2045, labels_s2s.shape: torch.Size([8, 128])
[188262] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


  7%|▋         | 1/15 [00:03<00:47,  3.43s/it]

[188273] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188273] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([6, 128])
[188273] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188272] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188272] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 20364, labels_s2s.shape: torch.Size([7, 128])
[188272] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([13, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([13, 128]), output_mlm.shape: torch.Size([13, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 20364, labels.shape: torch.Size([13, 128]), output.shape: torch.Size([13, 128, 30522])


 13%|█▎        | 2/15 [00:03<00:22,  1.72s/it]

[188274] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188275] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188275] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10166, labels_s2s.shape: torch.Size([7, 128])
[188274] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2469, labels_s2s.shape: torch.Size([7, 128])
[188274] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188275] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2469, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 20%|██        | 3/15 [00:04<00:13,  1.09s/it]

[188277] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188276] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188277] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([6, 128])[188276] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([6, 128])

[188277] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188276] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2044, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 27%|██▋       | 4/15 [00:04<00:08,  1.28it/s]

[188278] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188279] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188278] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2057, labels_s2s.shape: torch.Size([6, 128])[188279] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1997, labels_s2s.shape: torch.Size([6, 128])

[188279] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1[188278] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2057, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 33%|███▎      | 5/15 [00:04<00:06,  1.62it/s]

[188280] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188281] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188281] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2007, labels_s2s.shape: torch.Size([7, 128])
[188280] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([7, 128])
[188281] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188280] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1037, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 40%|████      | 6/15 [00:05<00:04,  1.91it/s]

[188289] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188288] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188289] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2941, labels_s2s.shape: torch.Size([7, 128])
[188289] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188288] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([8, 128])
[188288] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4067, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


 47%|████▋     | 7/15 [00:05<00:03,  2.17it/s]

[188291] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188290] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188291] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10167, labels_s2s.shape: torch.Size([7, 128])[188290] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])

[188291] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188290] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 53%|█████▎    | 8/15 [00:05<00:02,  2.39it/s]

[188292] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188293] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188293] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])[188292] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2821, labels_s2s.shape: torch.Size([7, 128])

[188292] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188293] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2821, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 60%|██████    | 9/15 [00:06<00:02,  2.56it/s]

[188301] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188300] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188301] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])[188300] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2014, labels_s2s.shape: torch.Size([7, 128])

[188300] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188301] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2014, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 67%|██████▋   | 10/15 [00:06<00:01,  2.69it/s]

[188303] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188302] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188303] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([8, 128])
[188302] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])
[188302] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0[188303] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


 73%|███████▎  | 11/15 [00:06<00:01,  2.73it/s]

[188305] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188304] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188304] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([6, 128])[188305] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([6, 128])

[188304] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0[188305] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 80%|████████  | 12/15 [00:07<00:01,  2.84it/s]

[188307] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188306] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188307] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])
[188306] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([7, 128])
[188307] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188306] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 87%|████████▋ | 13/15 [00:07<00:00,  2.89it/s]

[188314] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188315] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188315] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([7, 128])
[188315] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188314] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([8, 128])
[188314] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


100%|██████████| 15/15 [00:08<00:00,  1.84it/s]


[188317] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188316] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188317] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([1, 128])[188316] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2106, labels_s2s.shape: torch.Size([1, 128])

[188316] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0[188317] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([1, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([2, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([2, 128]), output_mlm.shape: torch.Size([2, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2106, labels.shape: torch.Size([2, 128]), output.shape: torch.Size([2, 128, 30522])
[2023-12-07 12:03:00.245437] Epoch: 0

  3%|▎         | 6/200 [00:00<00:07, 26.45it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2028, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2028, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2467, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  6%|▌         | 12/200 [00:00<00:06, 27.39it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

  8%|▊         | 15/200 [00:00<00:06, 27.77it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2986, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2986, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4441, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 10%|█         | 21/200 [00:00<00:06, 25.93it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12643, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12643, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 14%|█▎        | 27/200 [00:01<00:06, 26.79it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2129, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2129, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2975, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 16%|█▋        | 33/200 [00:01<00:06, 26.84it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 24781, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 20%|█▉        | 39/200 [00:01<00:05, 26.96it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 23%|██▎       | 46/200 [00:01<00:05, 28.08it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2204, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2204, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 26%|██▌       | 52/200 [00:01<00:05, 28.58it/s]

[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2057, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2004, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: out

 29%|██▉       | 58/200 [00:02<00:05, 28.26it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21986, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 32%|███▏      | 64/200 [00:02<00:04, 28.89it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2673, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2673, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 36%|███▌      | 71/200 [00:02<00:04, 29.31it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12756, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 38%|███▊      | 77/200 [00:02<00:04, 28.35it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 42%|████▏     | 83/200 [00:03<00:04, 27.62it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2053, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 44%|████▍     | 89/200 [00:03<00:04, 26.96it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1996, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 48%|████▊     | 95/200 [00:03<00:04, 26.21it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9775, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 50%|█████     | 101/200 [00:03<00:03, 26.37it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1998, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1998, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2024, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 54%|█████▎    | 107/200 [00:03<00:03, 27.57it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5513, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5513, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 56%|█████▋    | 113/200 [00:04<00:03, 27.84it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 10792, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 10792, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 60%|█████▉    | 119/200 [00:04<00:02, 27.79it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 62%|██████▎   | 125/200 [00:04<00:02, 27.73it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3127, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 3127, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2024, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 66%|██████▌   | 131/200 [00:04<00:02, 28.14it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4445, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4445, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5026, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 69%|██████▉   | 138/200 [00:04<00:02, 29.40it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2025, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2025, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 72%|███████▏  | 144/200 [00:05<00:01, 28.55it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3524, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 75%|███████▌  | 150/200 [00:05<00:01, 27.48it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 78%|███████▊  | 156/200 [00:05<00:01, 27.33it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21468, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21468, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3201, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 81%|████████  | 162/200 [00:05<00:01, 26.73it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2292, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 84%|████████▍ | 168/200 [00:06<00:01, 26.49it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 87%|████████▋ | 174/200 [00:06<00:00, 27.39it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12756, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 90%|█████████ | 180/200 [00:06<00:00, 27.19it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9152, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 9152, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2348, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 93%|█████████▎| 186/200 [00:06<00:00, 26.87it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2182, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2182, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 94%|█████████▍| 189/200 [00:06<00:00, 26.30it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2498, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2498, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1998, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 99%|█████████▉| 198/200 [00:07<00:00, 26.89it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2572, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2572, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2198, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

100%|██████████| 200/200 [00:07<00:00, 27.37it/s]


[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2053, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  0%|          | 0/15 [00:00<?, ?it/s]

[188330] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188331] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188331] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2045, labels_s2s.shape: torch.Size([8, 128])
[188330] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])
[188330] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0[188331] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


  7%|▋         | 1/15 [00:00<00:05,  2.75it/s]

[188332] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188333] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188333] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([6, 128])
[188332] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 20364, labels_s2s.shape: torch.Size([7, 128])
[188333] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188332] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([13, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([13, 128]), output_mlm.shape: torch.Size([13, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 20364, labels.shape: torch.Size([13, 128]), output.shape: torch.Size([13, 128, 30522])


 13%|█▎        | 2/15 [00:00<00:04,  2.88it/s]

[188335] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188334] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188335] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10166, labels_s2s.shape: torch.Size([7, 128])
[188334] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2469, labels_s2s.shape: torch.Size([7, 128])
[188335] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188334] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2469, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 20%|██        | 3/15 [00:01<00:04,  2.89it/s]

[188337] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188337] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([6, 128])
[188337] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188336] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188336] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([6, 128])
[188336] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2044, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 27%|██▋       | 4/15 [00:01<00:04,  2.50it/s]

[188338] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188339] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188339] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1997, labels_s2s.shape: torch.Size([6, 128])[188338] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2057, labels_s2s.shape: torch.Size([6, 128])

[188339] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188338] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2057, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 33%|███▎      | 5/15 [00:01<00:03,  2.68it/s]

[188340] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188341] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188341] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2007, labels_s2s.shape: torch.Size([7, 128])[188340] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([7, 128])

[188341] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188340] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1037, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 40%|████      | 6/15 [00:02<00:03,  2.76it/s]

[188343] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188342] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188343] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2941, labels_s2s.shape: torch.Size([7, 128])
[188343] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188342] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([8, 128])
[188342] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4067, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


 47%|████▋     | 7/15 [00:02<00:02,  2.77it/s]

[188345] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188344] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188344] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])[188345] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10167, labels_s2s.shape: torch.Size([7, 128])

[188344] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188345] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 53%|█████▎    | 8/15 [00:02<00:02,  2.80it/s]

[188346] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188347] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188347] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])
[188346] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2821, labels_s2s.shape: torch.Size([7, 128])
[188346] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188347] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2821, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 60%|██████    | 9/15 [00:03<00:02,  2.80it/s]

[188348] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188349] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188349] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])
[188348] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2014, labels_s2s.shape: torch.Size([7, 128])
[188349] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188348] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2014, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 67%|██████▋   | 10/15 [00:03<00:01,  2.83it/s]

[188351] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188350] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188350] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])
[188351] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([8, 128])
[188350] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188351] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


 73%|███████▎  | 11/15 [00:03<00:01,  2.78it/s]

[188358] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188359] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188358] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([6, 128])[188359] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([6, 128])

[188358] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0
[188359] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 80%|████████  | 12/15 [00:04<00:01,  2.85it/s]

[188360] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188361] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188361] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])
[188360] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([7, 128])
[188360] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188361] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 87%|████████▋ | 13/15 [00:04<00:00,  2.82it/s]

[188362] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188363] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188363] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([7, 128])
[188363] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188362] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([8, 128])
[188362] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


100%|██████████| 15/15 [00:05<00:00,  2.84it/s]


[188365] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188364] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188365] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([1, 128])
[188364] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2106, labels_s2s.shape: torch.Size([1, 128])
[188365] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([1, 128, 512]), current 1[188364] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([2, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([2, 128]), output_mlm.shape: torch.Size([2, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2106, labels.shape: torch.Size([2, 128]), output.shape: torch.Size([2, 128, 30522])
[2023-12-07 12:03:12.835251] Epoch: 1

  3%|▎         | 6/200 [00:00<00:07, 27.22it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2028, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2028, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2467, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  6%|▌         | 12/200 [00:00<00:07, 26.69it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

  8%|▊         | 15/200 [00:00<00:06, 26.65it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2986, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2986, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4441, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 12%|█▏        | 24/200 [00:00<00:06, 27.11it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5616, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2823, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 15%|█▌        | 30/200 [00:01<00:06, 27.00it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2975, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2975, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2017, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 16%|█▋        | 33/200 [00:01<00:06, 27.07it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 24781, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 24781, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labe

 20%|██        | 40/200 [00:01<00:05, 27.98it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4067, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 23%|██▎       | 46/200 [00:01<00:05, 27.39it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2057, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 26%|██▌       | 52/200 [00:01<00:05, 28.39it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2004, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2079, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 30%|██▉       | 59/200 [00:02<00:05, 27.89it/s]

[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21986, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEn

 32%|███▎      | 65/200 [00:02<00:04, 27.69it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 36%|███▌      | 72/200 [00:02<00:04, 28.48it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 39%|███▉      | 78/200 [00:02<00:04, 27.84it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2021, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1998, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 42%|████▏     | 84/200 [00:03<00:04, 27.72it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2053, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 45%|████▌     | 90/200 [00:03<00:03, 28.10it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2059, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 48%|████▊     | 96/200 [00:03<00:03, 26.96it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9775, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 9775, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12643, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 50%|████▉     | 99/200 [00:03<00:03, 26.63it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2024, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2024, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1999, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 53%|█████▎    | 106/200 [00:03<00:03, 28.18it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 56%|█████▌    | 112/200 [00:04<00:03, 27.20it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 59%|█████▉    | 118/200 [00:04<00:03, 27.13it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2004, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2425, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 62%|██████▏   | 124/200 [00:04<00:02, 25.67it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2024, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2024, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2428, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 65%|██████▌   | 130/200 [00:04<00:02, 26.38it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4445, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4445, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5026, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 68%|██████▊   | 136/200 [00:04<00:02, 27.22it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2025, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2025, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 71%|███████   | 142/200 [00:05<00:02, 26.42it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2111, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2111, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 72%|███████▎  | 145/200 [00:05<00:03, 15.95it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2087, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 76%|███████▌  | 151/200 [00:05<00:02, 19.69it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2644, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2644, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 78%|███████▊  | 157/200 [00:06<00:01, 22.56it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12643, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12643, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 82%|████████▏ | 163/200 [00:06<00:01, 24.43it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 7240, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 7240, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2062, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 84%|████████▍ | 169/200 [00:06<00:01, 26.08it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 88%|████████▊ | 175/200 [00:06<00:00, 26.40it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2348, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 90%|█████████ | 181/200 [00:06<00:00, 26.66it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12756, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2010, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 94%|█████████▎| 187/200 [00:07<00:00, 26.89it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2935, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 96%|█████████▋| 193/200 [00:07<00:00, 27.39it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3100, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 3100, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

100%|██████████| 200/200 [00:07<00:00, 26.24it/s]


[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2025, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2025, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  0%|          | 0/15 [00:00<?, ?it/s]

[188366] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188367] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188366] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])[188367] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2045, labels_s2s.shape: torch.Size([8, 128])

[188366] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0[188367] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


  7%|▋         | 1/15 [00:00<00:05,  2.76it/s]

[188369] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188368] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188369] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([6, 128])
[188368] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 20364, labels_s2s.shape: torch.Size([7, 128])
[188369] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1[188368] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([13, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([13, 128]), output_mlm.shape: torch.Size([13, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 20364, labels.shape: torch.Size([13, 128]), output.shape: torch.Size([13, 128, 30522])


 13%|█▎        | 2/15 [00:00<00:04,  2.88it/s]

[188370] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188371] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188371] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10166, labels_s2s.shape: torch.Size([7, 128])[188370] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2469, labels_s2s.shape: torch.Size([7, 128])

[188371] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188370] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2469, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 20%|██        | 3/15 [00:01<00:04,  2.88it/s]

[188373] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188372] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188373] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([6, 128])[188372] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([6, 128])

[188372] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0
[188373] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2044, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 27%|██▋       | 4/15 [00:01<00:03,  2.95it/s]

[188374] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188375] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188375] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1997, labels_s2s.shape: torch.Size([6, 128])[188374] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2057, labels_s2s.shape: torch.Size([6, 128])

[188375] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1[188374] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2057, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 33%|███▎      | 5/15 [00:01<00:03,  3.00it/s]

[188376] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188377] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188377] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2007, labels_s2s.shape: torch.Size([7, 128])
[188376] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([7, 128])
[188377] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188376] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1037, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 40%|████      | 6/15 [00:02<00:03,  2.96it/s]

[188378] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188379] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188379] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2941, labels_s2s.shape: torch.Size([7, 128])
[188379] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188378] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([8, 128])
[188378] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4067, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


 47%|████▋     | 7/15 [00:02<00:02,  2.92it/s]

[188381] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188380] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188380] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])[188381] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10167, labels_s2s.shape: torch.Size([7, 128])

[188381] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188380] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 53%|█████▎    | 8/15 [00:02<00:02,  2.90it/s]

[188382] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188383] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188383] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])
[188382] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2821, labels_s2s.shape: torch.Size([7, 128])
[188383] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188382] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2821, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 60%|██████    | 9/15 [00:03<00:02,  2.88it/s]

[188384] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188385] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188384] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2014, labels_s2s.shape: torch.Size([7, 128])[188385] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])

[188385] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188384] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2014, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 67%|██████▋   | 10/15 [00:03<00:01,  2.89it/s]

[188386] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188387] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188387] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([8, 128])[188386] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])

[188386] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0[188387] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


 73%|███████▎  | 11/15 [00:03<00:01,  2.83it/s]

[188388] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188389] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188388] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([6, 128])
[188389] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([6, 128])
[188388] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0[188389] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 80%|████████  | 12/15 [00:04<00:01,  2.89it/s]

[188391] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188390] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188390] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([7, 128])[188391] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])

[188390] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188391] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 87%|████████▋ | 13/15 [00:04<00:00,  2.89it/s]

[188392] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188393] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188393] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([7, 128])
[188393] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188392] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([8, 128])
[188392] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


100%|██████████| 15/15 [00:05<00:00,  2.96it/s]


[188394] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188395] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188395] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([1, 128])[188394] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2106, labels_s2s.shape: torch.Size([1, 128])

[188395] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([1, 128, 512]), current 1
[188394] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([2, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([2, 128]), output_mlm.shape: torch.Size([2, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2106, labels.shape: torch.Size([2, 128]), output.shape: torch.Size([2, 128, 30522])
[2023-12-07 12:03:25.541122] Epoch: 2

  2%|▏         | 3/200 [00:00<00:07, 26.08it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2028, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2028, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2467, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  4%|▍         | 9/200 [00:00<00:08, 23.39it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4376, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4376, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  8%|▊         | 15/200 [00:00<00:07, 24.19it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 10%|█         | 21/200 [00:00<00:07, 24.89it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 8529, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 8529, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 14%|█▎        | 27/200 [00:01<00:06, 25.82it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5616, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 16%|█▋        | 33/200 [00:01<00:06, 26.72it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2612, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2612, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2087, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 20%|█▉        | 39/200 [00:01<00:05, 27.28it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5616, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 7539, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 22%|██▎       | 45/200 [00:01<00:05, 26.54it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5616, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2612, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 26%|██▌       | 51/200 [00:01<00:05, 25.97it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5293, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5293, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 8430, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 27%|██▋       | 54/200 [00:02<00:05, 26.00it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2004, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 30%|███       | 60/200 [00:02<00:05, 25.96it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9592, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 33%|███▎      | 66/200 [00:02<00:05, 26.63it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2215, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2215, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2054, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 36%|███▌      | 72/200 [00:02<00:04, 26.90it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2748, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2748, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 39%|███▉      | 78/200 [00:03<00:04, 26.88it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 42%|████▏     | 84/200 [00:03<00:04, 26.22it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1996, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2079, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 45%|████▌     | 90/200 [00:03<00:04, 26.17it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2059, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2059, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2054, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 48%|████▊     | 97/200 [00:03<00:03, 27.61it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12643, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12643, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 52%|█████▏    | 103/200 [00:03<00:03, 28.11it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1999, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1999, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2014, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 55%|█████▍    | 109/200 [00:04<00:03, 28.61it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5616, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2074, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 57%|█████▊    | 115/200 [00:04<00:03, 28.03it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 60%|██████    | 121/200 [00:04<00:02, 27.94it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2425, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2425, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 64%|██████▎   | 127/200 [00:04<00:02, 27.17it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2428, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2428, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4228, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 66%|██████▋   | 133/200 [00:05<00:02, 27.03it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2633, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2633, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 70%|██████▉   | 139/200 [00:05<00:02, 26.98it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 10167, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 10167, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 6701, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 71%|███████   | 142/200 [00:05<00:02, 26.68it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3524, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 3524, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2821, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 74%|███████▍  | 148/200 [00:05<00:01, 26.33it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2007, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 77%|███████▋  | 154/200 [00:06<00:02, 18.90it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3127, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 80%|████████  | 160/200 [00:06<00:01, 23.01it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2085, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 82%|████████▏ | 163/200 [00:06<00:01, 24.69it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2062, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2062, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2339, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 85%|████████▌ | 170/200 [00:06<00:01, 26.92it/s]

[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2021, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 6854, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 6854, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEnc

 88%|████████▊ | 176/200 [00:06<00:00, 27.57it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2348, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2348, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 91%|█████████ | 182/200 [00:07<00:00, 27.83it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2010, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2010, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2507, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 94%|█████████▍| 188/200 [00:07<00:00, 27.82it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2935, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2935, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2033, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 97%|█████████▋| 194/200 [00:07<00:00, 27.68it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

100%|██████████| 200/200 [00:07<00:00, 26.08it/s]


[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2085, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  0%|          | 0/15 [00:00<?, ?it/s]

[188403] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188402] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188403] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2045, labels_s2s.shape: torch.Size([8, 128])
[188402] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])
[188403] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1
[188402] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


  7%|▋         | 1/15 [00:00<00:05,  2.75it/s]

[188404] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188405] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188405] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([6, 128])
[188404] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 20364, labels_s2s.shape: torch.Size([7, 128])
[188405] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1[188404] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([13, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([13, 128]), output_mlm.shape: torch.Size([13, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 20364, labels.shape: torch.Size([13, 128]), output.shape: torch.Size([13, 128, 30522])


 13%|█▎        | 2/15 [00:00<00:04,  2.93it/s]

[188407] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188406] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188407] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10166, labels_s2s.shape: torch.Size([7, 128])
[188406] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2469, labels_s2s.shape: torch.Size([7, 128])
[188406] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188407] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2469, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 20%|██        | 3/15 [00:01<00:04,  2.92it/s]

[188409] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188408] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188409] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([6, 128])[188408] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([6, 128])

[188408] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0[188409] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2044, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 27%|██▋       | 4/15 [00:01<00:03,  3.00it/s]

[188411] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188410] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188410] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2057, labels_s2s.shape: torch.Size([6, 128])[188411] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1997, labels_s2s.shape: torch.Size([6, 128])

[188410] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0
[188411] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2057, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 33%|███▎      | 5/15 [00:01<00:03,  3.04it/s]

[188412] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188413] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188412] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([7, 128])[188413] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2007, labels_s2s.shape: torch.Size([7, 128])

[188413] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188412] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1037, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 40%|████      | 6/15 [00:02<00:02,  3.01it/s]

[188415] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188414] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188415] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2941, labels_s2s.shape: torch.Size([7, 128])
[188415] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188414] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([8, 128])
[188414] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4067, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


 47%|████▋     | 7/15 [00:02<00:02,  2.95it/s]

[188416] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188417] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188416] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])[188417] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10167, labels_s2s.shape: torch.Size([7, 128])

[188417] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188416] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 53%|█████▎    | 8/15 [00:02<00:02,  2.93it/s]

[188419] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188418] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188419] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])[188418] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2821, labels_s2s.shape: torch.Size([7, 128])

[188418] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188419] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2821, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 60%|██████    | 9/15 [00:03<00:02,  2.92it/s]

[188421] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188420] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188421] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])
[188420] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2014, labels_s2s.shape: torch.Size([7, 128])
[188421] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188420] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2014, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 67%|██████▋   | 10/15 [00:03<00:01,  2.91it/s]

[188423] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188422] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188422] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])
[188423] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([8, 128])
[188423] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1
[188422] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


 73%|███████▎  | 11/15 [00:03<00:01,  2.85it/s]

[188425] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188424] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188425] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([6, 128])
[188424] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([6, 128])
[188425] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188424] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 80%|████████  | 12/15 [00:04<00:01,  2.91it/s]

[188427] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188426] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188427] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])
[188426] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([7, 128])
[188426] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188427] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 87%|████████▋ | 13/15 [00:04<00:00,  2.91it/s]

[188429] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188428] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188429] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([7, 128])
[188429] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188428] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([8, 128])
[188428] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


100%|██████████| 15/15 [00:05<00:00,  2.98it/s]


[188430] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188431] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188431] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([1, 128])[188430] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2106, labels_s2s.shape: torch.Size([1, 128])

[188431] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([1, 128, 512]), current 1
[188430] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([2, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([2, 128]), output_mlm.shape: torch.Size([2, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2106, labels.shape: torch.Size([2, 128]), output.shape: torch.Size([2, 128, 30522])
[2023-12-07 12:03:38.243087] Epoch: 3

  0%|          | 0/200 [00:00<?, ?it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0


  3%|▎         | 6/200 [00:00<00:06, 27.82it/s]

[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2028, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2028, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2467, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 1

  4%|▍         | 9/200 [00:00<00:06, 27.63it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

  8%|▊         | 15/200 [00:00<00:07, 25.43it/s]

[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2986, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 1

 10%|█         | 21/200 [00:00<00:06, 26.31it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12643, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12643, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 14%|█▎        | 27/200 [00:01<00:06, 26.95it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2129, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2129, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2975, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 16%|█▋        | 33/200 [00:01<00:06, 26.78it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 24781, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 20%|█▉        | 39/200 [00:01<00:05, 27.19it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 22%|██▎       | 45/200 [00:01<00:05, 26.87it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2204, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2204, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 26%|██▌       | 51/200 [00:01<00:05, 27.21it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2057, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2057, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 28%|██▊       | 57/200 [00:02<00:05, 27.30it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21986, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 32%|███▏      | 63/200 [00:02<00:05, 26.48it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2673, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2673, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 34%|███▍      | 69/200 [00:02<00:05, 25.98it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12756, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 38%|███▊      | 75/200 [00:02<00:04, 25.90it/s]

[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2021, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 1

 40%|████      | 81/200 [00:03<00:04, 26.87it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 44%|████▎     | 87/200 [00:03<00:04, 27.19it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1996, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 46%|████▋     | 93/200 [00:03<00:04, 26.74it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2074, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2074, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 50%|█████     | 100/200 [00:03<00:03, 27.69it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1998, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 53%|█████▎    | 106/200 [00:03<00:03, 28.11it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2043, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2043, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5513, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 56%|█████▌    | 112/200 [00:04<00:03, 28.34it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12643, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12643, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 10792, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labe

 59%|█████▉    | 118/200 [00:04<00:02, 27.82it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3531, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 3531, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 62%|██████▏   | 124/200 [00:04<00:02, 27.70it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9036, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 9036, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3127, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 65%|██████▌   | 130/200 [00:04<00:02, 27.93it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2077, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2077, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4445, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 68%|██████▊   | 136/200 [00:05<00:02, 27.05it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2025, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 71%|███████   | 142/200 [00:05<00:02, 26.84it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2111, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 74%|███████▍  | 148/200 [00:05<00:01, 27.22it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2087, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 77%|███████▋  | 154/200 [00:05<00:01, 27.59it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3127, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 80%|████████  | 160/200 [00:05<00:01, 28.30it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2085, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 83%|████████▎ | 166/200 [00:06<00:01, 28.91it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2062, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2062, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2339, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 86%|████████▌ | 172/200 [00:06<00:01, 27.61it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2021, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 6854, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 89%|████████▉ | 178/200 [00:06<00:00, 27.76it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2348, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2348, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 92%|█████████▏| 184/200 [00:06<00:00, 28.06it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2010, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2010, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2507, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 95%|█████████▌| 190/200 [00:06<00:00, 28.01it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2935, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2935, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2033, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 96%|█████████▋| 193/200 [00:07<00:00, 27.38it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

100%|██████████| 200/200 [00:07<00:00, 27.11it/s]


[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2025, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2025, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  0%|          | 0/15 [00:00<?, ?it/s]

[188433] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188432] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188432] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])
[188433] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2045, labels_s2s.shape: torch.Size([8, 128])
[188432] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0[188433] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


  7%|▋         | 1/15 [00:00<00:05,  2.73it/s]

[188441] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188440] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188441] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([6, 128])
[188440] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 20364, labels_s2s.shape: torch.Size([7, 128])
[188441] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1[188440] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([13, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([13, 128]), output_mlm.shape: torch.Size([13, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 20364, labels.shape: torch.Size([13, 128]), output.shape: torch.Size([13, 128, 30522])


 13%|█▎        | 2/15 [00:00<00:04,  2.90it/s]

[188443] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188442] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188443] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10166, labels_s2s.shape: torch.Size([7, 128])[188442] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2469, labels_s2s.shape: torch.Size([7, 128])

[188442] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188443] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2469, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 20%|██        | 3/15 [00:01<00:04,  2.91it/s]

[188445] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188445] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([6, 128])
[188445] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188444] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188444] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([6, 128])
[188444] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2044, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 27%|██▋       | 4/15 [00:01<00:04,  2.41it/s]

[188446] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188447] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188446] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2057, labels_s2s.shape: torch.Size([6, 128])[188447] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1997, labels_s2s.shape: torch.Size([6, 128])

[188447] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1[188446] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2057, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 33%|███▎      | 5/15 [00:01<00:03,  2.63it/s]

[188449] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188448] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188449] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2007, labels_s2s.shape: torch.Size([7, 128])[188448] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([7, 128])

[188449] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188448] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1037, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 40%|████      | 6/15 [00:02<00:03,  2.71it/s]

[188451] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188450] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188451] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2941, labels_s2s.shape: torch.Size([7, 128])
[188451] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188450] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([8, 128])
[188450] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4067, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


 47%|████▋     | 7/15 [00:02<00:02,  2.75it/s]

[188452] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188453] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188453] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10167, labels_s2s.shape: torch.Size([7, 128])[188452] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])

[188453] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188452] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 53%|█████▎    | 8/15 [00:02<00:02,  2.79it/s]

[188455] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188454] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188455] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])[188454] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2821, labels_s2s.shape: torch.Size([7, 128])

[188454] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188455] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2821, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 60%|██████    | 9/15 [00:03<00:02,  2.82it/s]

[188457] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188456] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188457] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])
[188456] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2014, labels_s2s.shape: torch.Size([7, 128])
[188456] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188457] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2014, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 67%|██████▋   | 10/15 [00:03<00:01,  2.85it/s]

[188459] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188458] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188458] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])[188459] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([8, 128])

[188458] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188459] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


 73%|███████▎  | 11/15 [00:03<00:01,  2.83it/s]

[188460] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188461] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188460] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([6, 128])
[188461] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([6, 128])
[188460] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0[188461] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 80%|████████  | 12/15 [00:04<00:01,  2.91it/s]

[188462] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188463] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188463] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])[188462] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([7, 128])

[188463] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188462] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 87%|████████▋ | 13/15 [00:04<00:00,  2.92it/s]

[188464] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188465] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188465] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([7, 128])
[188465] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188464] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([8, 128])
[188464] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


100%|██████████| 15/15 [00:05<00:00,  2.87it/s]


[188466] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188467] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188467] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([1, 128])[188466] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2106, labels_s2s.shape: torch.Size([1, 128])

[188466] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188467] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([1, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([2, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([2, 128]), output_mlm.shape: torch.Size([2, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2106, labels.shape: torch.Size([2, 128]), output.shape: torch.Size([2, 128, 30522])
[2023-12-07 12:03:50.847293] Epoch: 4

  3%|▎         | 6/200 [00:00<00:07, 26.00it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2028, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2028, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2467, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  6%|▌         | 12/200 [00:00<00:07, 26.19it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

  9%|▉         | 18/200 [00:00<00:06, 26.38it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2986, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2986, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4441, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 12%|█▏        | 24/200 [00:00<00:06, 26.54it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5616, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2823, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 15%|█▌        | 30/200 [00:01<00:06, 26.70it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2975, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2975, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2017, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 18%|█▊        | 36/200 [00:01<00:06, 27.03it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 24781, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 24781, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labe

 21%|██        | 42/200 [00:01<00:05, 27.03it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4067, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 24%|██▍       | 48/200 [00:01<00:05, 27.16it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2057, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 27%|██▋       | 54/200 [00:02<00:05, 27.58it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2004, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2079, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 30%|███       | 60/200 [00:02<00:05, 26.99it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21986, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21986, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 33%|███▎      | 66/200 [00:02<00:05, 26.53it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 36%|███▌      | 72/200 [00:02<00:04, 26.39it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 39%|███▉      | 78/200 [00:02<00:04, 26.86it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2021, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1998, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 40%|████      | 81/200 [00:03<00:04, 26.13it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2053, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 44%|████▎     | 87/200 [00:03<00:04, 25.39it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1996, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 46%|████▋     | 93/200 [00:03<00:04, 26.17it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9775, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 50%|████▉     | 99/200 [00:03<00:03, 26.39it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1998, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1998, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2024, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 52%|█████▎    | 105/200 [00:03<00:03, 25.56it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5513, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5513, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 56%|█████▌    | 111/200 [00:04<00:03, 26.46it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 10792, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 10792, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 58%|█████▊    | 117/200 [00:04<00:03, 26.67it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 62%|██████▏   | 123/200 [00:04<00:02, 27.13it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3127, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 3127, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2024, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 64%|██████▍   | 129/200 [00:04<00:02, 25.97it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4445, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4445, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5026, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 68%|██████▊   | 135/200 [00:05<00:02, 26.04it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2025, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2025, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 70%|███████   | 141/200 [00:05<00:02, 25.42it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2111, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2111, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 74%|███████▎  | 147/200 [00:05<00:02, 26.27it/s]

[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2087, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 1

 76%|███████▋  | 153/200 [00:05<00:01, 27.09it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3127, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 80%|███████▉  | 159/200 [00:06<00:01, 27.59it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2085, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 82%|████████▎ | 165/200 [00:06<00:01, 26.93it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2062, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2062, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2339, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 86%|████████▌ | 171/200 [00:06<00:01, 26.86it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2021, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 6854, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 88%|████████▊ | 177/200 [00:06<00:00, 25.90it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2348, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2348, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 92%|█████████▏| 183/200 [00:06<00:00, 26.40it/s]

[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12756, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2010, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1,

 94%|█████████▍| 189/200 [00:07<00:00, 26.01it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2935, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 98%|█████████▊| 195/200 [00:07<00:00, 26.34it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3100, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 3100, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

100%|██████████| 200/200 [00:07<00:00, 26.46it/s]


[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2025, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2025, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  0%|          | 0/15 [00:00<?, ?it/s]

[188468] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188469] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188468] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])[188469] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2045, labels_s2s.shape: torch.Size([8, 128])

[188469] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1
[188468] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


  7%|▋         | 1/15 [00:00<00:05,  2.73it/s]

[188470] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188471] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188471] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([6, 128])
[188470] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 20364, labels_s2s.shape: torch.Size([7, 128])
[188470] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188471] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([13, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([13, 128]), output_mlm.shape: torch.Size([13, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 20364, labels.shape: torch.Size([13, 128]), output.shape: torch.Size([13, 128, 30522])


 13%|█▎        | 2/15 [00:00<00:04,  2.88it/s]

[188472] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188473] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188473] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10166, labels_s2s.shape: torch.Size([7, 128])
[188472] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2469, labels_s2s.shape: torch.Size([7, 128])
[188473] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188472] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2469, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 20%|██        | 3/15 [00:01<00:05,  2.31it/s]

[188474] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188475] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188474] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([6, 128])[188475] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([6, 128])

[188474] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0[188475] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2044, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 27%|██▋       | 4/15 [00:01<00:04,  2.55it/s]

[188477] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188476] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188477] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1997, labels_s2s.shape: torch.Size([6, 128])[188476] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2057, labels_s2s.shape: torch.Size([6, 128])

[188477] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1[188476] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2057, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 33%|███▎      | 5/15 [00:01<00:03,  2.72it/s]

[188479] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188478] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188479] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2007, labels_s2s.shape: torch.Size([7, 128])[188478] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([7, 128])

[188479] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188478] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1037, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 40%|████      | 6/15 [00:02<00:03,  2.79it/s]

[188481] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188480] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188481] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2941, labels_s2s.shape: torch.Size([7, 128])
[188481] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188480] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([8, 128])
[188480] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4067, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


 47%|████▋     | 7/15 [00:02<00:02,  2.81it/s]

[188482] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188483] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188483] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10167, labels_s2s.shape: torch.Size([7, 128])
[188482] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])
[188482] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188483] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 53%|█████▎    | 8/15 [00:02<00:02,  2.83it/s]

[188485] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188484] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188485] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])[188484] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2821, labels_s2s.shape: torch.Size([7, 128])

[188484] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188485] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2821, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 60%|██████    | 9/15 [00:03<00:02,  2.80it/s]

[188487] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188486] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188486] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2014, labels_s2s.shape: torch.Size([7, 128])[188487] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])

[188487] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188486] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2014, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 67%|██████▋   | 10/15 [00:03<00:01,  2.82it/s]

[188488] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188489] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188488] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])
[188489] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([8, 128])
[188488] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0[188489] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


 73%|███████▎  | 11/15 [00:04<00:01,  2.77it/s]

[188490] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188491] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188491] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([6, 128])
[188490] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([6, 128])
[188491] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188490] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 80%|████████  | 12/15 [00:04<00:01,  2.86it/s]

[188493] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188492] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188493] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])
[188492] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([7, 128])
[188493] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188492] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 87%|████████▋ | 13/15 [00:04<00:00,  2.87it/s]

[188494] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188495] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188495] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([7, 128])
[188495] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188494] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([8, 128])
[188494] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


100%|██████████| 15/15 [00:05<00:00,  2.85it/s]


[188497] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188496] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188497] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([1, 128])[188496] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2106, labels_s2s.shape: torch.Size([1, 128])

[188496] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188497] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([1, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([2, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([2, 128]), output_mlm.shape: torch.Size([2, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2106, labels.shape: torch.Size([2, 128]), output.shape: torch.Size([2, 128, 30522])
[2023-12-07 12:04:03.681295] Epoch: 5

  2%|▏         | 3/200 [00:00<00:08, 23.14it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2028, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2028, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2467, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  4%|▍         | 9/200 [00:00<00:07, 24.60it/s]

[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4376, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]),

  8%|▊         | 15/200 [00:00<00:06, 26.54it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2986, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 10%|█         | 21/200 [00:00<00:06, 26.97it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12643, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12643, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 14%|█▎        | 27/200 [00:01<00:06, 26.16it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2129, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2129, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2975, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 16%|█▋        | 33/200 [00:01<00:06, 25.76it/s]

[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2087, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 24781, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEnc

 20%|█▉        | 39/200 [00:01<00:06, 25.75it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 7539, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 7539, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 22%|██▎       | 45/200 [00:01<00:05, 26.56it/s]

[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5616, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2612, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2612, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2204, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEnco

 26%|██▌       | 51/200 [00:01<00:05, 27.50it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5293, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5293, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 8430, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 28%|██▊       | 57/200 [00:02<00:05, 27.15it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2004, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 32%|███▏      | 63/200 [00:02<00:05, 26.84it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9592, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 9592, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 33%|███▎      | 66/200 [00:02<00:05, 26.49it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2054, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2054, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3398, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 36%|███▌      | 72/200 [00:02<00:04, 26.21it/s]

[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2748, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEnco

 39%|███▉      | 78/200 [00:02<00:04, 26.55it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 42%|████▏     | 84/200 [00:03<00:04, 26.48it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1996, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2079, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 45%|████▌     | 90/200 [00:03<00:04, 26.81it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2054, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2054, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 48%|████▊     | 97/200 [00:03<00:03, 28.19it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2044, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2043, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 52%|█████▏    | 103/200 [00:03<00:03, 28.65it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2014, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2014, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2302, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 55%|█████▍    | 109/200 [00:04<00:03, 27.34it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2074, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2074, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 57%|█████▊    | 115/200 [00:04<00:03, 26.86it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2044, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 60%|██████    | 121/200 [00:04<00:02, 27.60it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2053, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 64%|██████▎   | 127/200 [00:04<00:02, 27.45it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4228, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4228, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 66%|██████▋   | 133/200 [00:04<00:02, 28.01it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 70%|██████▉   | 139/200 [00:05<00:02, 27.72it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 6701, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 6701, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 72%|███████▎  | 145/200 [00:05<00:02, 27.32it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2821, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2821, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 74%|███████▍  | 148/200 [00:05<00:01, 26.82it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2007, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2007, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2644, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 77%|███████▋  | 154/200 [00:05<00:02, 19.00it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3127, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 80%|████████  | 160/200 [00:06<00:01, 22.50it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2085, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 83%|████████▎ | 166/200 [00:06<00:01, 24.79it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2062, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2062, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2339, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 86%|████████▌ | 172/200 [00:06<00:01, 26.33it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2021, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 6854, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 89%|████████▉ | 178/200 [00:06<00:00, 27.21it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2348, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2348, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 92%|█████████▏| 184/200 [00:07<00:00, 27.27it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2010, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2010, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2507, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 95%|█████████▌| 190/200 [00:07<00:00, 27.43it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2935, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2935, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2033, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 96%|█████████▋| 193/200 [00:07<00:00, 27.01it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

100%|██████████| 200/200 [00:07<00:00, 26.15it/s]


[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2025, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2085, labels.shape: torch.Size([1, 128]),

  0%|          | 0/15 [00:00<?, ?it/s]

[188534] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188535] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188535] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2045, labels_s2s.shape: torch.Size([8, 128])[188534] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])

[188534] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0[188535] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


  7%|▋         | 1/15 [00:00<00:05,  2.79it/s]

[188537] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188536] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188537] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([6, 128])
[188536] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 20364, labels_s2s.shape: torch.Size([7, 128])
[188537] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1[188536] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([13, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([13, 128]), output_mlm.shape: torch.Size([13, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 20364, labels.shape: torch.Size([13, 128]), output.shape: torch.Size([13, 128, 30522])


 13%|█▎        | 2/15 [00:00<00:04,  2.93it/s]

[188539] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188538] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188539] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10166, labels_s2s.shape: torch.Size([7, 128])[188538] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2469, labels_s2s.shape: torch.Size([7, 128])

[188538] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188539] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2469, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 20%|██        | 3/15 [00:01<00:04,  2.94it/s]

[188541] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188540] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188540] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([6, 128])
[188541] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([6, 128])
[188540] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0[188541] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2044, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 27%|██▋       | 4/15 [00:01<00:03,  3.00it/s]

[188543] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188542] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188542] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2057, labels_s2s.shape: torch.Size([6, 128])
[188543] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1997, labels_s2s.shape: torch.Size([6, 128])
[188543] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1[188542] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2057, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 33%|███▎      | 5/15 [00:01<00:03,  3.07it/s]

[188544] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188545] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188545] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2007, labels_s2s.shape: torch.Size([7, 128])
[188544] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([7, 128])
[188544] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188545] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1037, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 40%|████      | 6/15 [00:02<00:02,  3.02it/s]

[188547] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188546] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188547] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2941, labels_s2s.shape: torch.Size([7, 128])
[188547] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188546] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([8, 128])
[188546] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4067, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


 47%|████▋     | 7/15 [00:02<00:02,  2.96it/s]

[188548] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188549] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188549] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10167, labels_s2s.shape: torch.Size([7, 128])
[188548] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])
[188549] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188548] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 53%|█████▎    | 8/15 [00:02<00:02,  2.95it/s]

[188550] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188551] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188551] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])[188550] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2821, labels_s2s.shape: torch.Size([7, 128])

[188550] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188551] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2821, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 60%|██████    | 9/15 [00:03<00:02,  2.94it/s]

[188552] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188553] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188553] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])
[188552] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2014, labels_s2s.shape: torch.Size([7, 128])
[188552] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188553] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2014, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 67%|██████▋   | 10/15 [00:03<00:01,  2.92it/s]

[188554] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188555] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188554] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])[188555] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([8, 128])

[188555] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1[188554] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


 73%|███████▎  | 11/15 [00:03<00:01,  2.86it/s]

[188557] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188556] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188557] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([6, 128])[188556] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([6, 128])

[188557] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1[188556] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 80%|████████  | 12/15 [00:04<00:01,  2.92it/s]

[188558] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188559] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188558] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([7, 128])
[188559] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])
[188558] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188559] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 87%|████████▋ | 13/15 [00:04<00:00,  2.91it/s]

[188560] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188561] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188561] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([7, 128])
[188561] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188560] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([8, 128])
[188560] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


100%|██████████| 15/15 [00:05<00:00,  2.99it/s]


[188562] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188563] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188563] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([1, 128])[188562] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2106, labels_s2s.shape: torch.Size([1, 128])

[188563] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([1, 128, 512]), current 1[188562] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([2, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([2, 128]), output_mlm.shape: torch.Size([2, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2106, labels.shape: torch.Size([2, 128]), output.shape: torch.Size([2, 128, 30522])
[2023-12-07 12:04:16.358379] Epoch: 6

  2%|▏         | 3/200 [00:00<00:07, 25.73it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2028, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2028, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2467, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  4%|▍         | 9/200 [00:00<00:07, 23.99it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4376, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4376, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  8%|▊         | 15/200 [00:00<00:07, 25.03it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 10%|█         | 21/200 [00:00<00:06, 25.84it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12643, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 14%|█▎        | 27/200 [00:01<00:06, 25.22it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12756, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2129, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 16%|█▋        | 33/200 [00:01<00:06, 25.35it/s]

[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2612, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2087, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2087, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: out

 18%|█▊        | 36/200 [00:01<00:06, 25.30it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5616, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 7539, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 21%|██        | 42/200 [00:01<00:06, 25.67it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1996, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 24%|██▍       | 48/200 [00:01<00:05, 25.99it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9462, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 9462, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5293, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 27%|██▋       | 54/200 [00:02<00:05, 26.57it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 30%|███       | 60/200 [00:02<00:05, 26.87it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9592, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 33%|███▎      | 66/200 [00:02<00:05, 26.09it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2215, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2215, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2054, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 36%|███▌      | 72/200 [00:02<00:05, 24.26it/s]

[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2748, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2748, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEnco

 39%|███▉      | 78/200 [00:03<00:04, 25.07it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2021, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1998, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 42%|████▏     | 84/200 [00:03<00:04, 25.79it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2053, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 45%|████▌     | 90/200 [00:03<00:04, 25.70it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2059, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 48%|████▊     | 96/200 [00:03<00:03, 27.16it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9775, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 9775, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12643, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 51%|█████     | 102/200 [00:03<00:03, 28.46it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2024, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2024, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1999, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 54%|█████▍    | 108/200 [00:04<00:03, 28.09it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 57%|█████▋    | 114/200 [00:04<00:02, 28.91it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 60%|██████    | 120/200 [00:04<00:02, 28.34it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2004, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2425, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 63%|██████▎   | 126/200 [00:04<00:02, 27.71it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2024, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2024, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2428, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 66%|██████▌   | 132/200 [00:05<00:02, 26.70it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5026, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5026, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2633, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 69%|██████▉   | 138/200 [00:05<00:02, 26.76it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 10167, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 72%|███████▏  | 144/200 [00:05<00:02, 27.05it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3524, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 75%|███████▌  | 150/200 [00:05<00:01, 26.45it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 78%|███████▊  | 156/200 [00:05<00:01, 27.47it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21468, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21468, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3201, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 80%|███████▉  | 159/200 [00:06<00:01, 28.08it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2292, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 83%|████████▎ | 166/200 [00:06<00:01, 21.22it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 87%|████████▋ | 174/200 [00:06<00:01, 25.15it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12756, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 90%|█████████ | 180/200 [00:06<00:00, 26.73it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9152, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 9152, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2348, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 93%|█████████▎| 186/200 [00:07<00:00, 26.76it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2182, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2182, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 95%|█████████▌| 190/200 [00:07<00:00, 28.16it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2498, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2498, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1998, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 98%|█████████▊| 196/200 [00:07<00:00, 27.78it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2572, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2572, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2198, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

100%|██████████| 200/200 [00:07<00:00, 26.19it/s]


[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2053, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  0%|          | 0/15 [00:00<?, ?it/s]

[188571] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188570] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188571] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2045, labels_s2s.shape: torch.Size([8, 128])
[188570] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])
[188571] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1[188570] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


  7%|▋         | 1/15 [00:00<00:05,  2.78it/s]

[188572] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188573] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188573] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([6, 128])
[188572] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 20364, labels_s2s.shape: torch.Size([7, 128])
[188573] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188572] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([13, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([13, 128]), output_mlm.shape: torch.Size([13, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 20364, labels.shape: torch.Size([13, 128]), output.shape: torch.Size([13, 128, 30522])


 13%|█▎        | 2/15 [00:00<00:04,  2.96it/s]

[188574] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188575] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188575] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10166, labels_s2s.shape: torch.Size([7, 128])[188574] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2469, labels_s2s.shape: torch.Size([7, 128])

[188574] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188575] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2469, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 20%|██        | 3/15 [00:01<00:04,  2.94it/s]

[188576] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188577] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188576] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([6, 128])[188577] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([6, 128])

[188577] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1[188576] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2044, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 27%|██▋       | 4/15 [00:01<00:03,  3.02it/s]

[188578] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188579] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188578] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2057, labels_s2s.shape: torch.Size([6, 128])
[188579] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1997, labels_s2s.shape: torch.Size([6, 128])
[188578] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0
[188579] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2057, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 33%|███▎      | 5/15 [00:01<00:03,  3.05it/s]

[188581] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188580] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188581] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2007, labels_s2s.shape: torch.Size([7, 128])[188580] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([7, 128])

[188580] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188581] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1037, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 40%|████      | 6/15 [00:02<00:02,  3.02it/s]

[188582] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188583] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188583] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2941, labels_s2s.shape: torch.Size([7, 128])
[188583] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188582] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([8, 128])
[188582] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4067, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


 47%|████▋     | 7/15 [00:02<00:02,  2.97it/s]

[188584] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188585] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188584] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])[188585] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10167, labels_s2s.shape: torch.Size([7, 128])

[188585] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188584] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 53%|█████▎    | 8/15 [00:02<00:02,  2.96it/s]

[188587] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188586] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188586] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2821, labels_s2s.shape: torch.Size([7, 128])[188587] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])

[188587] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188586] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2821, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 60%|██████    | 9/15 [00:03<00:02,  2.95it/s]

[188589] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188588] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188589] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])
[188588] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2014, labels_s2s.shape: torch.Size([7, 128])
[188588] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188589] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2014, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 67%|██████▋   | 10/15 [00:03<00:01,  2.94it/s]

[188590] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188591] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188591] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([8, 128])
[188590] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])
[188591] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1
[188590] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


 73%|███████▎  | 11/15 [00:03<00:01,  2.90it/s]

[188592] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188593] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188592] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([6, 128])[188593] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([6, 128])

[188592] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0[188593] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 80%|████████  | 12/15 [00:04<00:01,  2.94it/s]

[188594] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188595] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188595] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])[188594] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([7, 128])

[188595] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188594] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 87%|████████▋ | 13/15 [00:04<00:00,  2.93it/s]

[188597] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188596] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188597] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([7, 128])
[188597] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188596] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([8, 128])
[188596] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


100%|██████████| 15/15 [00:04<00:00,  3.01it/s]


[188598] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188599] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188598] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2106, labels_s2s.shape: torch.Size([1, 128])[188599] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([1, 128])

[188599] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([1, 128, 512]), current 1[188598] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([2, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([2, 128]), output_mlm.shape: torch.Size([2, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2106, labels.shape: torch.Size([2, 128]), output.shape: torch.Size([2, 128, 30522])
[2023-12-07 12:04:28.982349] Epoch: 7

  0%|          | 0/200 [00:00<?, ?it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2028, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2028, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])


  3%|▎         | 6/200 [00:00<00:07, 25.98it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2467, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2467, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1999, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  6%|▌         | 12/200 [00:00<00:06, 28.01it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labe

  9%|▉         | 18/200 [00:00<00:06, 27.30it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4441, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4441, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2074, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 12%|█▏        | 24/200 [00:00<00:06, 28.06it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2823, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2823, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3100, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 15%|█▌        | 30/200 [00:01<00:05, 28.39it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2017, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2017, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 18%|█▊        | 36/200 [00:01<00:05, 27.40it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12756, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 21%|██        | 42/200 [00:01<00:05, 26.97it/s]

[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4067, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]),

 24%|██▍       | 48/200 [00:01<00:05, 27.84it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2057, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 26%|██▌       | 51/200 [00:01<00:05, 27.05it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2004, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2079, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 28%|██▊       | 57/200 [00:02<00:05, 26.60it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21986, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 32%|███▏      | 63/200 [00:02<00:05, 26.88it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2673, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2673, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 34%|███▍      | 69/200 [00:02<00:04, 27.85it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12756, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 38%|███▊      | 75/200 [00:02<00:04, 27.71it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 40%|████      | 81/200 [00:02<00:04, 27.45it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2053, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 44%|████▎     | 87/200 [00:03<00:04, 27.48it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1996, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 46%|████▋     | 93/200 [00:03<00:03, 28.09it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9775, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 50%|████▉     | 99/200 [00:03<00:03, 27.74it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1998, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1998, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2024, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 52%|█████▎    | 105/200 [00:03<00:03, 27.47it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5513, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5513, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 56%|█████▌    | 112/200 [00:04<00:03, 28.49it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 10792, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 10792, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 60%|█████▉    | 119/200 [00:04<00:02, 29.06it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 62%|██████▎   | 125/200 [00:04<00:02, 27.63it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3127, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 3127, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2024, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 66%|██████▌   | 131/200 [00:04<00:02, 26.97it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4445, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4445, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5026, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 68%|██████▊   | 137/200 [00:04<00:02, 27.41it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2025, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2025, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 72%|███████▏  | 143/200 [00:05<00:02, 26.59it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2111, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2111, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 74%|███████▍  | 149/200 [00:05<00:01, 26.34it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2087, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2087, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 78%|███████▊  | 155/200 [00:05<00:01, 27.03it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3127, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 3127, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21468, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 80%|████████  | 161/200 [00:05<00:01, 27.41it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1996, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 84%|████████▎ | 167/200 [00:06<00:01, 27.62it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2339, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2339, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 85%|████████▌ | 170/200 [00:06<00:01, 27.34it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 6854, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 6854, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 90%|████████▉ | 179/200 [00:06<00:00, 26.82it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2021, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9152, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 91%|█████████ | 182/200 [00:06<00:00, 25.80it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2507, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2507, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2182, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 94%|█████████▍| 188/200 [00:06<00:00, 25.47it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2935, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2935, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2033, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 97%|█████████▋| 194/200 [00:07<00:00, 26.83it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

100%|██████████| 200/200 [00:07<00:00, 27.22it/s]


[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2085, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  0%|          | 0/15 [00:00<?, ?it/s]

[188601] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188600] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188601] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2045, labels_s2s.shape: torch.Size([8, 128])[188600] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])

[188601] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1[188600] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


  7%|▋         | 1/15 [00:00<00:05,  2.75it/s]

[188603] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188602] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188603] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([6, 128])
[188602] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 20364, labels_s2s.shape: torch.Size([7, 128])
[188603] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188602] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([13, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([13, 128]), output_mlm.shape: torch.Size([13, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 20364, labels.shape: torch.Size([13, 128]), output.shape: torch.Size([13, 128, 30522])


 13%|█▎        | 2/15 [00:00<00:04,  2.91it/s]

[188605] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188606] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188606] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10166, labels_s2s.shape: torch.Size([7, 128])
[188605] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2469, labels_s2s.shape: torch.Size([7, 128])
[188606] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188605] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2469, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 20%|██        | 3/15 [00:01<00:04,  2.93it/s]

[188607] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188608] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188607] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([6, 128])
[188608] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([6, 128])
[188607] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0[188608] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2044, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 27%|██▋       | 4/15 [00:01<00:03,  2.99it/s]

[188610] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188609] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188609] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2057, labels_s2s.shape: torch.Size([6, 128])[188610] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1997, labels_s2s.shape: torch.Size([6, 128])

[188609] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0
[188610] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2057, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 33%|███▎      | 5/15 [00:01<00:04,  2.48it/s]

[188611] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188612] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188612] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2007, labels_s2s.shape: torch.Size([7, 128])[188611] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([7, 128])

[188612] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188611] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1037, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 40%|████      | 6/15 [00:02<00:03,  2.63it/s]

[188620] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188619] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188620] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2941, labels_s2s.shape: torch.Size([7, 128])
[188620] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188619] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([8, 128])
[188619] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4067, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


 47%|████▋     | 7/15 [00:02<00:02,  2.69it/s]

[188621] in SimpleEncoderDecoder.forward: output cuda:0, current 0[188622] in SimpleEncoderDecoder.forward: output cuda:1, current 1

[188622] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10167, labels_s2s.shape: torch.Size([7, 128])
[188621] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])
[188622] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188621] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 53%|█████▎    | 8/15 [00:02<00:02,  2.75it/s]

[188623] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188624] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188624] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])[188623] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2821, labels_s2s.shape: torch.Size([7, 128])

[188624] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188623] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2821, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 60%|██████    | 9/15 [00:03<00:02,  2.80it/s]

[188626] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188625] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188626] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])[188625] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2014, labels_s2s.shape: torch.Size([7, 128])

[188626] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188625] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2014, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 67%|██████▋   | 10/15 [00:03<00:01,  2.83it/s]

[188627] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188628] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188627] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])[188628] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([8, 128])

[188628] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1[188627] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


 73%|███████▎  | 11/15 [00:03<00:01,  2.82it/s]

[188629] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188630] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188630] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([6, 128])
[188629] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([6, 128])
[188630] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188629] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 80%|████████  | 12/15 [00:04<00:01,  2.89it/s]

[188631] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188632] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188632] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])
[188631] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([7, 128])
[188632] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188631] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 87%|████████▋ | 13/15 [00:04<00:00,  2.88it/s]

[188633] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188634] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188634] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([7, 128])
[188634] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188633] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([8, 128])
[188633] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


100%|██████████| 15/15 [00:05<00:00,  2.87it/s]


[188635] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188636] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188636] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([1, 128])[188635] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2106, labels_s2s.shape: torch.Size([1, 128])

[188635] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0[188636] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([1, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([2, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([2, 128]), output_mlm.shape: torch.Size([2, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2106, labels.shape: torch.Size([2, 128]), output.shape: torch.Size([2, 128, 30522])
[2023-12-07 12:04:41.570964] Epoch: 8

  3%|▎         | 6/200 [00:00<00:07, 25.41it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2028, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2028, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2467, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  6%|▌         | 12/200 [00:00<00:07, 26.59it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

  9%|▉         | 18/200 [00:00<00:06, 26.47it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2986, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2986, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4441, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 10%|█         | 21/200 [00:00<00:06, 25.96it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5616, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2823, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 14%|█▎        | 27/200 [00:01<00:07, 24.03it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2129, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2129, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2975, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 16%|█▋        | 33/200 [00:01<00:07, 23.70it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2087, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2087, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 20%|█▉        | 39/200 [00:01<00:06, 25.24it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5616, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 7539, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 22%|██▎       | 45/200 [00:01<00:05, 26.36it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5616, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2612, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 26%|██▌       | 51/200 [00:01<00:05, 26.80it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5293, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5293, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 8430, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 28%|██▊       | 57/200 [00:02<00:05, 26.69it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2004, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 32%|███▏      | 63/200 [00:02<00:04, 27.64it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9592, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 9592, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 34%|███▍      | 69/200 [00:02<00:04, 27.92it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2054, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2054, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3398, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 38%|███▊      | 75/200 [00:02<00:04, 27.45it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 40%|████      | 81/200 [00:03<00:04, 27.80it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 44%|████▎     | 87/200 [00:03<00:04, 26.76it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2079, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2079, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 45%|████▌     | 90/200 [00:03<00:04, 24.95it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2074, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 48%|████▊     | 96/200 [00:03<00:04, 23.75it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2044, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2043, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 51%|█████     | 102/200 [00:03<00:04, 22.62it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1999, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1999, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2014, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 52%|█████▎    | 105/200 [00:04<00:04, 22.39it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 56%|█████▌    | 111/200 [00:04<00:03, 22.40it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 10792, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 10792, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 58%|█████▊    | 117/200 [00:04<00:03, 23.54it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3531, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 3531, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 62%|██████▏   | 123/200 [00:04<00:03, 25.10it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9036, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 9036, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3127, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 64%|██████▍   | 129/200 [00:05<00:02, 26.11it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2077, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2077, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4445, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 68%|██████▊   | 135/200 [00:05<00:02, 26.21it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2025, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 70%|███████   | 141/200 [00:05<00:02, 26.46it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2111, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 74%|███████▎  | 147/200 [00:05<00:02, 26.05it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2087, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 76%|███████▋  | 153/200 [00:06<00:01, 25.61it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3127, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 80%|███████▉  | 159/200 [00:06<00:01, 25.75it/s]

[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12643, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2085, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: ou

 81%|████████  | 162/200 [00:06<00:01, 25.53it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 7240, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 7240, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2062, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 84%|████████▍ | 168/200 [00:06<00:01, 25.21it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 87%|████████▋ | 174/200 [00:06<00:00, 26.36it/s]

[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2272, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2272, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 1

 90%|█████████ | 180/200 [00:07<00:00, 26.63it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 93%|█████████▎| 186/200 [00:07<00:00, 26.07it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2054, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2054, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 96%|█████████▌| 192/200 [00:07<00:00, 25.38it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1998, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1998, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 99%|█████████▉| 198/200 [00:07<00:00, 25.36it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2198, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2198, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2012, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

100%|██████████| 200/200 [00:07<00:00, 25.55it/s]


[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2053, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]),

  0%|          | 0/15 [00:00<?, ?it/s]

[188637] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188638] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188638] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2045, labels_s2s.shape: torch.Size([8, 128])[188637] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])

[188637] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188638] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1
[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


  7%|▋         | 1/15 [00:00<00:05,  2.77it/s]

[188639] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188640] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188640] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([6, 128])
[188639] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 20364, labels_s2s.shape: torch.Size([7, 128])
[188640] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1[188639] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([13, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([13, 128]), output_mlm.shape: torch.Size([13, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 20364, labels.shape: torch.Size([13, 128]), output.shape: torch.Size([13, 128, 30522])


 13%|█▎        | 2/15 [00:00<00:04,  2.92it/s]

[188641] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188642] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188642] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10166, labels_s2s.shape: torch.Size([7, 128])
[188641] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2469, labels_s2s.shape: torch.Size([7, 128])
[188641] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188642] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2469, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 20%|██        | 3/15 [00:01<00:04,  2.92it/s]

[188644] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188643] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188644] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([6, 128])
[188643] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2044, labels_s2s.shape: torch.Size([6, 128])
[188644] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1[188643] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2044, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 27%|██▋       | 4/15 [00:01<00:03,  2.99it/s]

[188646] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188646] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1997, labels_s2s.shape: torch.Size([6, 128])
[188646] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1
[188645] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188645] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2057, labels_s2s.shape: torch.Size([6, 128])
[188645] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2057, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 33%|███▎      | 5/15 [00:01<00:03,  2.62it/s]

[188647] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188648] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188648] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2007, labels_s2s.shape: torch.Size([7, 128])
[188647] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([7, 128])
[188648] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188647] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1037, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 40%|████      | 6/15 [00:02<00:03,  2.72it/s]

[188649] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188650] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188650] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2941, labels_s2s.shape: torch.Size([7, 128])
[188650] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188649] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([8, 128])
[188649] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 4067, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


 47%|████▋     | 7/15 [00:02<00:02,  2.76it/s]

[188651] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188652] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188652] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 10167, labels_s2s.shape: torch.Size([7, 128])
[188651] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])
[188651] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0[188652] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 53%|█████▎    | 8/15 [00:02<00:02,  2.81it/s]

[188653] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188654] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188654] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])
[188653] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2821, labels_s2s.shape: torch.Size([7, 128])
[188654] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188653] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2821, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 60%|██████    | 9/15 [00:03<00:02,  2.84it/s]

[188656] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188655] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188656] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([7, 128])[188655] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2014, labels_s2s.shape: torch.Size([7, 128])

[188656] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188655] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2014, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 67%|██████▋   | 10/15 [00:03<00:01,  2.85it/s]

[188658] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188657] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188657] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([8, 128])
[188658] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([8, 128])
[188658] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([8, 128, 512]), current 1
[188657] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([16, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([16, 128]), output_mlm.shape: torch.Size([16, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([16, 128]), output.shape: torch.Size([16, 128, 30522])


 73%|███████▎  | 11/15 [00:03<00:01,  2.81it/s]

[188659] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188660] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188660] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([6, 128])
[188659] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([6, 128])
[188659] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([6, 128, 512]), current 0[188660] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([6, 128, 512]), current 1

[188131], cuda 0, mlm output: torch.Size([12, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([12, 128]), output_mlm.shape: torch.Size([12, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([12, 128]), output.shape: torch.Size([12, 128, 30522])


 80%|████████  | 12/15 [00:04<00:01,  2.87it/s]

[188661] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188662] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188662] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([7, 128])[188661] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([7, 128])

[188662] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1[188661] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([7, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([14, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([14, 128]), output_mlm.shape: torch.Size([14, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([14, 128]), output.shape: torch.Size([14, 128, 30522])


 87%|████████▋ | 13/15 [00:04<00:00,  2.87it/s]

[188664] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188663] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188664] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1996, labels_s2s.shape: torch.Size([7, 128])
[188664] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([7, 128, 512]), current 1
[188663] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([8, 128])
[188663] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([8, 128, 512]), current 0
[188131], cuda 0, mlm output: torch.Size([15, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([15, 128]), output_mlm.shape: torch.Size([15, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([15, 128]), output.shape: torch.Size([15, 128, 30522])


100%|██████████| 15/15 [00:05<00:00,  2.89it/s]


[188666] in SimpleEncoderDecoder.forward: output cuda:1, current 1
[188665] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188666] in SimpleDecoderHead_S2S.forward: cuda:1, cuda:1, current 1, labels_s2s[0]: 1037, labels_s2s.shape: torch.Size([1, 128])[188665] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2106, labels_s2s.shape: torch.Size([1, 128])

[188666] in SimpleEncoderHead_MLM.forward: cuda:1, torch.Size([1, 128, 512]), current 1[188665] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0

[188131], cuda 0, mlm output: torch.Size([2, 128, 30522])
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([2, 128]), output_mlm.shape: torch.Size([2, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2106, labels.shape: torch.Size([2, 128]), output.shape: torch.Size([2, 128, 30522])
[2023-12-07 12:04:54.599514] Epoch: 9

  3%|▎         | 6/200 [00:00<00:07, 26.40it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2028, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2028, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2467, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

  6%|▌         | 12/200 [00:00<00:07, 26.69it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

  9%|▉         | 18/200 [00:00<00:06, 26.08it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2986, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2986, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4441, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 12%|█▏        | 24/200 [00:00<00:06, 26.96it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 5616, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2823, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 14%|█▎        | 27/200 [00:01<00:06, 26.40it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2975, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2975, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2017, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 16%|█▋        | 33/200 [00:01<00:06, 25.21it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 24781, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 20%|█▉        | 39/200 [00:01<00:06, 25.62it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21877, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21877, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4067, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 23%|██▎       | 46/200 [00:01<00:05, 27.57it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2204, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2204, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 26%|██▋       | 53/200 [00:01<00:05, 29.02it/s]

[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2057, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2004, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2004, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2079, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEnco

 30%|███       | 60/200 [00:02<00:04, 29.25it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21986, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21986, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 33%|███▎      | 66/200 [00:02<00:04, 28.35it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 36%|███▌      | 72/200 [00:02<00:04, 28.23it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 39%|███▉      | 78/200 [00:02<00:04, 28.22it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2021, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2021, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1998, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 42%|████▏     | 84/200 [00:03<00:04, 28.26it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2053, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 44%|████▍     | 88/200 [00:03<00:03, 29.12it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2059, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 47%|████▋     | 94/200 [00:03<00:03, 28.40it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9775, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 9775, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12643, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels

 50%|█████     | 100/200 [00:03<00:03, 28.37it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2024, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2024, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1999, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 54%|█████▍    | 108/200 [00:03<00:03, 29.34it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2009, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2009, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 5616, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 56%|█████▌    | 111/200 [00:03<00:03, 29.50it/s]

[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 1045, labels.shape: torch.Size([1, 128]),

 59%|█████▉    | 118/200 [00:04<00:02, 29.57it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2425, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2425, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 62%|██████▎   | 125/200 [00:04<00:02, 29.24it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2428, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2428, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 4228, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 66%|██████▌   | 131/200 [00:04<00:02, 29.19it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2633, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2633, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 68%|██████▊   | 137/200 [00:04<00:02, 27.34it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 10167, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 10167, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 6701, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 72%|███████▏  | 143/200 [00:05<00:01, 28.53it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3524, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 3524, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2821, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 74%|███████▍  | 149/200 [00:05<00:01, 28.17it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2002, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2007, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 78%|███████▊  | 155/200 [00:05<00:02, 20.26it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 21468, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 21468, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 3201, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 80%|████████  | 161/200 [00:05<00:01, 23.59it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2292, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 84%|████████▎ | 167/200 [00:06<00:01, 25.89it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2016, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2016, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 86%|████████▋ | 173/200 [00:06<00:01, 26.04it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 12756, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 12756, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2085, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, label

 90%|████████▉ | 179/200 [00:06<00:00, 26.66it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 9152, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 9152, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2348, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 93%|█████████▎| 186/200 [00:06<00:00, 28.14it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2182, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2182, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2002, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 96%|█████████▌| 192/200 [00:07<00:00, 27.43it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2498, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2498, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1998, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

 98%|█████████▊| 196/200 [00:07<00:00, 28.20it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2572, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2572, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2198, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_

100%|██████████| 200/200 [00:07<00:00, 27.16it/s]

[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 2053, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_mlm.shape: torch.Size([1, 128]), output_mlm.shape: torch.Size([1, 128, 30522])
[188131] SimpleDecoderHead_S2S get loss, cuda 0, loss cuda:0, labels_s2s[0]: 2053, labels.shape: torch.Size([1, 128]), output.shape: torch.Size([1, 128, 30522])
[188131] in SimpleEncoderDecoder.forward: output cuda:0, current 0
[188131] in SimpleDecoderHead_S2S.forward: cuda:0, cuda:0, current 0, labels_s2s[0]: 1045, labels_s2s.shape: torch.Size([1, 128])
[188131] in SimpleEncoderHead_MLM.forward: cuda:0, torch.Size([1, 128, 512]), current 0
[188131] SimpleEncoderHead_MLM get loss: cuda:0, cuda:0, current 0, labels_mlm[0]: 101, labels_


