In [1]:
!pip install flash-attn --no-build-isolation
!pip install triton

Collecting flash-attn
  Downloading flash_attn-2.7.4.post1.tar.gz (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.7.4.post1-cp310-cp310-linux_x86_64.whl size=187797312 sha256=b267f80a08e516292cdd748056a2178a45b8abedf7fca123292eb17c21c8c87c
  Stored in directory: /root/.cache/pip/wheels/59/ce/d5/08ea07bfc16ba218dc65a3a7ef9b6a270530bcbd2cea2ee1ca
Successfully built flash-attn
Installing collected packages: flash-attn
Successfully installed flash-attn-2.7.4.post1
Collecting triton
  Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25

In [5]:
import torch
from torch import Tensor,nn
from typing import Dict, List, Optional, Union

import dataclasses


@dataclasses.dataclass
class TensorPointer:
    """Dataclass specifying from which rank we need to query a tensor from in order to access data"""

    # Needed to understand from which rank to get the tensor
    # TODO @thomasw21: Maybe add which group it belongs to as well? Typically this is highly correlated to `p2p.pg`
    group_rank: int
    # TODO @thomasw21: Maybe add a tag (torch.distributed.send/recv allow for tagging)

class TritonRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-5, device=None, dtype=None):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(hidden_size, device=device, dtype=dtype))
        self.register_parameter("bias", None)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.ones_(self.weight)

    def forward(
        self, input, residual=None, dropout_p=0.0, prenorm=False, residual_in_fp32=False, return_dropout_mask=False
    ):
        from flash_attn.ops.triton.layer_norm import layer_norm_fn

        return layer_norm_fn(
            input,
            self.weight,
            None,
            residual=residual,
            eps=self.eps,
            dropout_p=dropout_p,
            prenorm=prenorm,
            residual_in_fp32=residual_in_fp32,
            is_rms_norm=True,
            return_dropout_mask=return_dropout_mask,
        )

class RotaryEmbedding(nn.Module):
    def __init__(self, dim: int, end: int, theta: float = 10000.0):
        super().__init__()
        assert dim % 2 == 0
        self.dim = dim
        self.end = end
        self.theta = theta
        self.register_buffer(
            "freqs_cis",
            self._compute_freqs(end, dim, theta),
            persistent=False,
        )

    def _compute_freqs(self, end, dim, theta):
        freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
        t = torch.arange(end)
        freqs = torch.outer(t, freqs)
        complex_freqs = torch.polar(torch.ones_like(freqs), freqs)
        return torch.view_as_real(complex_freqs)

    def forward(self, x: torch.Tensor, position_ids: Optional[torch.LongTensor]):
        batch_size, seq_length, num_heads, inner_dim = x.shape
        dtype = x.dtype
        x = x.view(batch_size, seq_length, num_heads, inner_dim // 2, 2)
        complex_x = torch.view_as_complex(x.float())

        if position_ids is None:
            freqs_cis = self.freqs_cis[:seq_length]
        else:
            freqs_cis = self.freqs_cis[position_ids]

        complex_freqs = torch.view_as_complex(freqs_cis)
        x_out = torch.view_as_real(complex_x * complex_freqs).view(batch_size, seq_length, num_heads, inner_dim)
        return x_out.type(dtype)

class CoreAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.hidden_size % config.num_attention_heads == 0, "Hidden size must be divisible by number of attention heads."
        self.d_qk = config.hidden_size // config.num_attention_heads
        self.d_v = config.hidden_size // config.num_attention_heads
        #self.is_using_mup = config.is_using_mup

    def forward(self, query_states, key_states, value_states, q_sequence_mask, kv_sequence_mask):
        cu_seqlens_q = torch.cumsum(q_sequence_mask.sum(-1, dtype=torch.int32), dim=0, dtype=torch.int32)
        cu_seqlens_k = torch.cumsum(kv_sequence_mask.sum(-1, dtype=torch.int32), dim=0, dtype=torch.int32)
        causal = q_sequence_mask.shape[1] != 1
        #softmax_scale = 1 / query_states.shape[-1] if self.is_using_mup else None

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states, key_states, value_states, attn_mask=None, dropout_p=0.0
        )
        return attn_output

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        from flash_attn.layers.rotary import RotaryEmbedding as FlashRotaryEmbedding
        
        self.n_q_heads = config.num_attention_heads
        self.n_kv_heads = config.num_key_value_heads if hasattr(config, 'num_key_value_heads') else config.num_attention_heads
        self.n_repeats = config.num_attention_heads // self.n_kv_heads
        self.is_gqa = config.num_attention_heads != self.n_kv_heads
        self.d_qk = config.hidden_size // config.num_attention_heads
        self.d_v = config.hidden_size // config.num_attention_heads
        self.d_model = config.hidden_size

        self.qkv_proj = nn.Linear(self.d_model, config.num_attention_heads * self.d_qk + 2 * self.n_kv_heads * self.d_qk, bias=False)
        self.rotary_embedding = RotaryEmbedding(dim=self.d_qk, end=config.max_position_embeddings, theta=config.rope_theta)
        self.flash_rotary_embedding = FlashRotaryEmbedding(dim=self.d_qk, base=config.rope_theta, interleaved=config.rope_interleaved)
        self.o_proj = nn.Linear(config.num_attention_heads * self.d_qk, self.d_model, bias=False)
        self.attention = CoreAttention(config)

    def forward(self, hidden_states, sequence_mask):
        qkv_states = self.qkv_proj(hidden_states)
        q_length, batch_size, _ = qkv_states.shape

        if self.is_gqa:
            query_states, key_states, value_states = torch.split(
                qkv_states,
                [self.n_q_heads * self.d_qk, self.n_kv_heads * self.d_qk, self.n_kv_heads * self.d_qk],
                dim=-1,
            )
        else:
            query_states, key_states, value_states = qkv_states.view(q_length, batch_size, 3, self.n_q_heads, self.d_qk).permute(2, 1, 0, 3, 4).contiguous()

        query_states, key_value_states = self.flash_rotary_embedding(query_states, kv=torch.stack([key_states, value_states], dim=2))
        key_states, value_states = torch.chunk(key_value_states, 2, dim=2)

        attention_output = self.attention(
            query_states=query_states.view(batch_size * q_length, self.n_q_heads, self.d_qk),
            key_states=key_states.view(batch_size * q_length, self.n_kv_heads, self.d_qk),
            value_states=value_states.view(batch_size * q_length, self.n_kv_heads, self.d_v),
            q_sequence_mask=sequence_mask,
            kv_sequence_mask=sequence_mask,
        )

        attention_output = attention_output.contiguous().view(batch_size, q_length, self.n_q_heads * self.d_v).transpose(0, 1)
        output = self.o_proj(attention_output)

        return {"hidden_states": output, "sequence_mask": sequence_mask}

class ColumnLinear(nn.Linear):
    def __init__(self, in_features, out_features, bias=True, device=None, dtype=None):
        super().__init__(in_features, out_features, bias, device, dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        #x = x.detach()  # Detach from computation graph
        return nn.functional.linear(x, self.weight, self.bias)


class RowLinear(nn.Linear):
    def __init__(self, in_features, out_features, bias=True, device=None, dtype=None):
        super().__init__(in_features, out_features, bias, device, dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        #x = x.detach()  # Detach from computation graph
        return nn.functional.linear(x, self.weight, self.bias)

class GELUActivation(nn.Module):
    """
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self, use_gelu_python: bool = False):
        super().__init__()
        if use_gelu_python:
            self.act = self._gelu_python
        else:
            self.act = nn.functional.gelu

    def _gelu_python(self, input: Tensor) -> Tensor:
        return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0)))

    def forward(self, input: Tensor) -> Tensor:
        return self.act(input)

class GLUActivation(nn.Module):
    def __init__(self):
        super().__init__()
        self.act = nn.functional.silu

    def forward(self, merged_states: torch.Tensor):
        gate_states, up_states = torch.split(merged_states, merged_states.shape[-1] // 2, dim=-1)
        return self.act(gate_states) * up_states


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.gate_up_proj = ColumnLinear(config.hidden_size, 2*config.intermediate_size, bias=False)
        self.down_proj = RowLinear(config.intermediate_size, config.hidden_size, bias=False)
        self.split_silu_mul = GLUActivation()

    def forward(self, hidden_states):
        merged_states = self.gate_up_proj(hidden_states)
        hidden_states = self.down_proj(self.split_silu_mul(merged_states))
        return {"hidden_states": hidden_states}


class LlamaDecoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.input_layernorm = TritonRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.attn = CausalSelfAttention(config)
        self.post_attention_layernorm = TritonRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.mlp = MLP(config)

    def _core_forward(
        self,
        hidden_states: Union[torch.Tensor, TensorPointer],
        sequence_mask: Union[torch.Tensor, TensorPointer],
    ) -> List[Union[torch.Tensor, TensorPointer]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
        hidden_states = output["hidden_states"]
        hidden_states = hidden_states + residual

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
        hidden_states = hidden_states + residual

        return hidden_states, output["sequence_mask"]

    def forward(
        self,
        hidden_states: Union[torch.Tensor, TensorPointer],
        sequence_mask: Union[torch.Tensor, TensorPointer],
    ) -> Dict[str, Union[torch.Tensor, TensorPointer]]:


        hidden_states, sequence_mask = self._core_forward(hidden_states, sequence_mask)

        return {
            "hidden_states": hidden_states,
            "sequence_mask": sequence_mask,
        }

class LlamaModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        #layer_idx = config.num_layers
        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_layers)])
        self.final_layer_norm = TritonRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.lm_head = ColumnLinear(config.hidden_size, config.vocab_size, bias=False)
        self.cast_to_fp32 = lambda: lambda x: x.float()

    def forward(
        self,
        input_ids: Union[torch.Tensor, TensorPointer],  # [batch_size, seq_length]
        input_mask: Union[torch.Tensor, TensorPointer],  # [batch_size, seq_length]
    ):
        return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]

    def forward_with_hidden_states(
        self,
        input_ids: Union[torch.Tensor, TensorPointer],  # [batch_size, seq_length]
        input_mask: Union[torch.Tensor, TensorPointer],  # [batch_size, seq_length]
    ):
        # Format input in `[seq_length, batch_size]` to support high TP with low batch_size
        input_ids = input_ids.transpose(0, 1)
        hidden_states = self.embed_tokens(input_ids)

        hidden_encoder_states = {
            "hidden_states": hidden_states,
            "sequence_mask": input_mask,
        }

        for encoder_block in self.layers:
            hidden_encoder_states = encoder_block(**hidden_encoder_states)

        hidden_encoder_states["hidden_states"] = self.final_layer_norm(hidden_encoder_states["hidden_states"])

        sharded_logits = self.lm_head(x=hidden_encoder_states["hidden_states"])

        fp32_sharded_logits = sharded_logits.float()

        return fp32_sharded_logits, hidden_states

class CrossEntropyLossFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, logits, target):
        logits = logits - torch.max(logits, dim=-1, keepdim=True)[0]
        exp_logits = torch.exp(logits)
        sum_exp_logits = exp_logits.sum(dim=-1, keepdim=True)
        log_probs = logits - torch.log(sum_exp_logits)

        loss = -log_probs.gather(dim=-1, index=target.unsqueeze(-1)).squeeze(-1)
        ctx.save_for_backward(exp_logits / sum_exp_logits, target)
        return loss

    @staticmethod
    def backward(ctx, grad_output):
        softmax, target = ctx.saved_tensors
        grad_input = softmax.clone()
        grad_input.scatter_add_(dim=-1, index=target.unsqueeze(-1), src=-1.0)
        grad_input.mul_(grad_output.unsqueeze(-1))
        return grad_input, None


def cross_entropy_loss(logits, target, dtype: torch.dtype = None):
    if dtype is not None:
        logits = logits.to(dtype=dtype)
    return CrossEntropyLossFunction.apply(logits, target)


class Loss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, logits: torch.Tensor, label_ids: torch.Tensor, label_mask: torch.Tensor) -> dict:
        loss = cross_entropy_loss(logits, label_ids.transpose(0, 1).contiguous(), dtype=torch.float).transpose(0, 1)
        loss = (loss * label_mask).sum() / label_mask.sum()
        return {"loss": loss}


In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from tqdm import tqdm
import math
import os
from numpy import arange
import torch.nn.functional as F

# Import the LlamaModel from model_manual.py
#from model_manual import LlamaModel

# Function to generate text from the model
def generate_text(model, input_text, vocab, id_to_token, device, max_length=50, temperature=0.7):
    model.eval()
    input_ids = torch.tensor([[vocab.get(token, vocab['<|endoftext|>']) for token in input_text.split()]], dtype=torch.long).to(device)
    generated_tokens = input_ids.tolist()[0]

    with torch.no_grad():
        for _ in range(max_length):
            logits = model(input_ids)[:, -1, :]
            logits = logits / temperature
            probabilities = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probabilities, num_samples=1).squeeze().item()

            if next_token == vocab['<|endoftext|>']:
                break

            generated_tokens.append(next_token)
            input_ids = torch.tensor([generated_tokens], dtype=torch.long).to(device)

    return ' '.join(id_to_token[token] for token in generated_tokens if token in id_to_token)


class DataloaderLite:
    def __init__(self, file_path, seq_len, batch_size):
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer", add_prefix_space=True)
        if self.tokenizer.pad_token is None:
            if self.tokenizer.eos_token:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            else:
                self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
                self.tokenizer.resize_token_embeddings(len(self.tokenizer))

        with open(file_path, 'r', encoding='utf-8') as f:
            self.text = f.read()
        self.epochs = len(self.text) // (self.seq_len * self.batch_size)
        self.current_position = 0
        self.padded_chunks = []
        self.max_len = 0
        
    def get_max_length(self):
        return self.max_len

    def next_batch(self):

        self.chunks = [self.text[(self.current_position + i):(self.current_position + i + self.seq_len)] for i in range(0, self.seq_len*self.batch_size, self.seq_len)]
        self.current_position = self.current_position + self.seq_len*self.batch_size
        if self.current_position + (self.seq_len*self.batch_size + 1) > len(self.text):
            self.current_position = 0
        self.encoded_chunks = [self.tokenizer(chunk, return_tensors='pt', truncation=True, max_length=self.seq_len) for chunk in self.chunks]

        self.max_len = max(chunk['input_ids'].shape[1] for chunk in self.encoded_chunks)
        self.padded_chunks = []
        for chunk in self.encoded_chunks:
            input_ids = torch.cat((chunk['input_ids'], torch.full((1, self.max_len - chunk['input_ids'].shape[1]), self.tokenizer.pad_token_id, dtype=torch.long)), dim=1)
            attention_mask = torch.cat((chunk['attention_mask'], torch.zeros((1, self.max_len - chunk['attention_mask'].shape[1]), dtype=torch.long)), dim=1)
            self.padded_chunks.append(input_ids.squeeze(0))
            self.padded_chunks.append(attention_mask.squeeze(0))
        return self.padded_chunks

def collate_fn(batch):
    input_ids = torch.stack([item[0] for item in batch])
    attention_masks = torch.stack([item[1] for item in batch])
    return input_ids, attention_masks


# Random initialization
def init_weights(m):
    if isinstance(m, (torch.nn.Linear, torch.nn.Embedding)):
        torch.nn.init.xavier_uniform_(m.weight)
        if hasattr(m, 'bias') and m.bias is not None:
            torch.nn.init.zeros_(m.bias)

def print_model_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}\n")

    for name, param in model.named_parameters():
        print(f"{name}: {param.numel():,}")

def train_model(config, train_file, steps, output_dir):
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")

    if tokenizer.pad_token is None:
        if tokenizer.eos_token:
            tokenizer.pad_token = tokenizer.eos_token
        else:
            tokenizer.add_special_tokens({"pad_token": "[PAD]"})
            tokenizer.resize_token_embeddings(len(tokenizer))

    vocab = tokenizer.get_vocab()
    id_to_token = {v: k for k, v in vocab.items()}

    dataloader = DataloaderLite(train_file, SEQ_LEN, BATCH_SIZE)
    #padded_chunks = dataloader.next_batch()
    #print(padded_chunks[0])
    #print(padded_chunks[1])
    #print(padded_chunks[2])
    #print(padded_chunks[3])

    model = LlamaModel(config)
    model.apply(init_weights)
    for p in model.parameters():
        p.data.clamp_(-1e5, 1e5)
    print_model_parameters(model)
    model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
    model.device = next(model.parameters()).device
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.9)
    
    #loss_fn = torch.nn.CrossEntropyLoss()
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    progress_bar = tqdm(range(steps), desc="Training")
    
    PROMPT = "inventory to particularise their abundance"
    
    step = 0
    while step < steps:
        input_tokens = dataloader.next_batch()
        max_len = dataloader.get_max_length()
        input_list = []
        attenttion_list = []
        for i in arange(BATCH_SIZE):
            input_list.append(input_tokens[2*i])
            attenttion_list.append(input_tokens[2*i+1])

        input_ids = torch.stack(input_list)
        inputs = input_ids[:, :-2]  # Keep batch dim and remove last token
        targets = input_ids[:, 1:-1]  # Keep batch dim and remove first token
        attention_mask = torch.stack(attenttion_list)
        attentions = attention_mask[:, :-2]
        device = next(model.parameters()).device
        inputs, attentions = inputs.to(device), attentions.to(device)
        #positions = torch.arange(0, inputs.size(1), dtype=torch.long).unsqueeze(0).repeat(inputs.size(0), 1).to(device)
        #print(inputs.shape)
        #print(attentions.shape)
        #print(positions.shape)
    
        optimizer.zero_grad()
        logits = model(inputs, attentions)
    
        labels = targets.to(device)  # Move labels to the same device as the model and inputs
        # Create a mask based on counts
        mask = attentions.bool()  # The attention mask already has the correct shape
        logits = logits.transpose(0, 1)  # Align logits to (batch_size, seq_len, vocab_size)
        logits_masked = logits[mask].contiguous().view(-1, config.vocab_size)
        labels_masked = labels[mask].contiguous().view(-1)
        probabilities = F.softmax(logits_masked, dim=-1)
        max_prob_indices = torch.argmax(probabilities, dim=-1)
        print(max_prob_indices[:10])
        print(labels_masked[:10])
        
        if mask.sum() == 0:
            raise ValueError("Attention mask sums to zero!")

        loss_cross_entropy = F.cross_entropy(logits_masked, labels_masked)
        #loss = loss_fn(logits_masked, labels_masked)
    
        loss_cross_entropy.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
    
        progress_bar.update(1)
        progress_bar.set_postfix(loss=loss_cross_entropy.item())
    
        #if step % 10 == 0:
        #    torch.save({
        #        'step': step,
        #        'model_state_dict': model.state_dict(),
        #        'optimizer_state_dict': optimizer.state_dict(),
        #        'scheduler_state_dict': scheduler.state_dict()
        #    }, os.path.join(output_dir, f'checkpoint_{step}.pt'))
        #    #generated_text = generate_text(model, PROMPT, vocab, id_to_token, model.device)
        #    #print(f"\nGenerated text at step {step}: {generated_text}\n")
    
        step += 1
        if step >= steps:
            break

class Config:
    pass

if __name__ == "__main__":
    config = Config()
    config.vocab_size = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer").vocab_size
    config.num_layers = 30
    config.hidden_size = 576
    config.num_attention_heads = 8
    config.rms_norm_eps = 1.0e-05
    config.max_position_embeddings = 2048
    config.rope_theta = 500000.0
    config.hidden_act = False
    config.intermediate_size = 1536
    config.rope_interleaved = False
    #config.rope_scaling = null
    config.rope_theta = 10000.0

    BATCH_SIZE = 8
    SEQ_LEN = 512

    train_model(config, '/kaggle/input/assign13-era-v3-dataset/input.txt', 1000, './output')


tokenizer_config.json:   0%|          | 0.00/3.91k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/489 [00:00<?, ?B/s]

Total parameters: 176,097,600

embed_tokens.weight: 28,311,552
layers.0.input_layernorm.weight: 576
layers.0.attn.qkv_proj.weight: 995,328
layers.0.attn.o_proj.weight: 331,776
layers.0.post_attention_layernorm.weight: 576
layers.0.mlp.gate_up_proj.weight: 1,769,472
layers.0.mlp.down_proj.weight: 884,736
layers.1.input_layernorm.weight: 576
layers.1.attn.qkv_proj.weight: 995,328
layers.1.attn.o_proj.weight: 331,776
layers.1.post_attention_layernorm.weight: 576
layers.1.mlp.gate_up_proj.weight: 1,769,472
layers.1.mlp.down_proj.weight: 884,736
layers.2.input_layernorm.weight: 576
layers.2.attn.qkv_proj.weight: 995,328
layers.2.attn.o_proj.weight: 331,776
layers.2.post_attention_layernorm.weight: 576
layers.2.mlp.gate_up_proj.weight: 1,769,472
layers.2.mlp.down_proj.weight: 884,736
layers.3.input_layernorm.weight: 576
layers.3.attn.qkv_proj.weight: 995,328
layers.3.attn.o_proj.weight: 331,776
layers.3.post_attention_layernorm.weight: 576
layers.3.mlp.gate_up_proj.weight: 1,769,472
layers.3

  def forward(
  def backward(ctx, dout, *args):


tensor([44718, 19558, 36263, 13835, 42631, 29561, 15580, 18235, 45635,  8531],
       device='cuda:0')
tensor([32062,    42,   198,  6121,   392,  7219,   750,  2030,    28,  4875],
       device='cuda:0')


Training:   0%|          | 1/1000 [00:07<1:57:25,  7.05s/it, loss=10.8]

tensor([10914, 47464, 35382, 10843,   198, 17178, 26609, 37239,  2475, 15506],
       device='cuda:0')
tensor([1248,  506, 2249,  198,   66, 2713,  478, 6737, 1523,  260],
       device='cuda:0')


Training:   0%|          | 3/1000 [00:07<31:13,  1.88s/it, loss=10.7]  

tensor([ 4889, 16576, 40283, 30012, 38163, 28473,  2107,   198,   198,  6845],
       device='cuda:0')
tensor([8427,  346,  536, 1363,  253, 1945,   28,  198, 3528,  946],
       device='cuda:0')


Training:   0%|          | 4/1000 [00:08<20:49,  1.25s/it, loss=10.6]

tensor([35325, 39760, 23808,   198,  7646, 18747,  1472,   346, 14186, 46714],
       device='cuda:0')
tensor([  384,  1477,   198,  2068,   260, 28382,   282,   253,   555,    28],
       device='cuda:0')


Training:   0%|          | 5/1000 [00:08<15:02,  1.10it/s, loss=10.6]

tensor([ 1678, 28189,  7572,  7160,   198,   198,  2836, 36776, 18045,   198],
       device='cuda:0')
tensor([   55,  4192,  7854,    42,   198, 11952, 23271,  8710,    17,   492],
       device='cuda:0')


Training:   1%|          | 6/1000 [00:08<11:37,  1.43it/s, loss=10.4]

tensor([ 7687,   198,   198,   198, 32062,    42,   198,   198, 32866,   198],
       device='cuda:0')
tensor([   47,   198,   198,  5345, 22152,    42,   198,  5230,    28,  5277],
       device='cuda:0')


Training:   1%|          | 7/1000 [00:09<09:20,  1.77it/s, loss=10.3]

tensor([  198,   198,   608,   198, 13536, 47237, 43531,   198,   198, 43649],
       device='cuda:0')
tensor([  351, 30291,   198,    86, 27045, 26927,    28,   198,  9389,  1928],
       device='cuda:0')


Training:   1%|          | 8/1000 [00:09<07:50,  2.11it/s, loss=10.4]

tensor([33703,   198,   260,  3060,   198,    28, 39382,   198,   518, 18393],
       device='cuda:0')
tensor([   43,   327,   339,   536, 13735, 20322,   198,    71, 13231,   670],
       device='cuda:0')


Training:   1%|          | 9/1000 [00:09<06:54,  2.39it/s, loss=10.4]

tensor([13657, 28834,   198,   198,  2221,   198,  2576,  6907,   198,   198],
       device='cuda:0')
tensor([ 1450,    28,   351,  5337,   392,   654, 24575,    28,   198,  2193],
       device='cuda:0')


Training:   1%|          | 10/1000 [00:09<06:22,  2.59it/s, loss=10.1]

tensor([29086, 41238,   198,   198,   198,   198,   198, 16001,   198,   198],
       device='cuda:0')
tensor([ 274,   28,  198,   96, 8854,   28, 9869,   28, 1028,  105],
       device='cuda:0')


Training:   1%|          | 11/1000 [00:10<05:58,  2.76it/s, loss=10.3]

tensor([  198, 46954,   198,   198,   198, 13536, 32556,   198,   198,   260],
       device='cuda:0')
tensor([  457,   719,   588,   198,    86, 21482,  3447,   327,   511,   260],
       device='cuda:0')


Training:   1%|          | 12/1000 [00:10<05:36,  2.93it/s, loss=10.2]

tensor([28154,   198,  2285,   198,   198,   198,   198,   198,   198,   198],
       device='cuda:0')
tensor([  198,    60,   602,   967,   384, 26365,  3763,    30,   198,   198],
       device='cuda:0')


Training:   1%|▏         | 13/1000 [00:10<05:18,  3.09it/s, loss=10.1]

tensor([47239,   198,   198,   650,   198,   198,   198,  9243, 36796, 19546],
       device='cuda:0')
tensor([   28,   198, 38984,   346,    28,   198,  6101,   298,  2679,   268],
       device='cuda:0')


Training:   1%|▏         | 14/1000 [00:11<05:07,  3.21it/s, loss=10]  

tensor([  198,    28,   198,   198, 24583,  2286,    89,   198,   198,   198],
       device='cuda:0')
tensor([ 1508,    42,   198, 12800,  4618,  2286, 23944,    30,   198,   198],
       device='cuda:0')


Training:   2%|▏         | 15/1000 [00:11<04:58,  3.31it/s, loss=9.91]

tensor([4501, 8292, 4501,   42,  198,  198,  198,  260,   28,  198],
       device='cuda:0')
tensor([ 7113,  2810,  4501,    42,   198, 16817,  1980,   549,    17, 12193],
       device='cuda:0')
tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  482,  1163,    28,   198,  9302,  1185,   288,   260, 44890,    29],
       device='cuda:0')


Training:   2%|▏         | 17/1000 [00:12<05:40,  2.88it/s, loss=9.94]

tensor([4628, 4501,   42,  198,  198,   28,  198,  198,  198,  198],
       device='cuda:0')
tensor([ 4628,  4501,    42,   198, 45496,    28, 20172,   468,   441,    30],
       device='cuda:0')


Training:   2%|▏         | 18/1000 [00:12<05:27,  3.00it/s, loss=9.66]

tensor([ 198,   57, 4501,   42,  198,  198,  198,  198,  198,  198],
       device='cuda:0')
tensor([ 2113,    57,  4501,    42,   198,   504,   701,   359, 23909,    43],
       device='cuda:0')


Training:   2%|▏         | 19/1000 [00:12<05:21,  3.05it/s, loss=9.81]

tensor([198, 198, 198, 198, 198,  28, 198, 198, 198, 198], device='cuda:0')
tensor([ 1072,   510, 46823,   351,  1728,    28,   260,   550,   198, 20181],
       device='cuda:0')


Training:   2%|▏         | 19/1000 [00:12<05:21,  3.05it/s, loss=9.56]

tensor([  198,   198,   198,   198,   198,   198, 23317,    57,  4501,    42],
       device='cuda:0')
tensor([  346,  7270,    30,   198,   198,  4105, 23317,    57,  4501,    42],
       device='cuda:0')


Training:   2%|▏         | 20/1000 [00:13<06:42,  2.44it/s, loss=9.51]

tensor([ 198,  198,  198,  198, 3020, 2113,   57, 4501,   42,  198],
       device='cuda:0')
tensor([ 423,  198,  198,   67, 3020, 2113,   57, 4501,   42,  198],
       device='cuda:0')


Training:   2%|▏         | 22/1000 [00:14<06:27,  2.52it/s, loss=9.64]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([5344,  253, 3561,  839, 1980,  637,  391,   28,  198, 2193],
       device='cuda:0')


Training:   2%|▏         | 23/1000 [00:14<05:57,  2.73it/s, loss=9.64]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  338,   392,   457, 13784,  1915,   198,  7143,  1187,   411,   260],
       device='cuda:0')


Training:   2%|▏         | 24/1000 [00:14<05:34,  2.92it/s, loss=9.59]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  281,  1272,  5721,    28,   429,   655,   288,   655,   198, 38110],
       device='cuda:0')


Training:   2%|▎         | 25/1000 [00:15<05:20,  3.04it/s, loss=9.59]

tensor([198, 198, 198, 198, 198, 260, 198, 198, 198, 198], device='cuda:0')
tensor([ 441, 5091,  198,   63,   23,  259,  260, 4749,  905,  288],
       device='cuda:0')


Training:   3%|▎         | 26/1000 [00:15<05:15,  3.09it/s, loss=9.32]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  42,  198,  504, 1085,  754, 9699,  282,  357,  314, 1896],
       device='cuda:0')


Training:   3%|▎         | 27/1000 [00:15<05:08,  3.15it/s, loss=9.45]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   28,   346,   523,   441,    47,   377,   608, 29086,    28,  2505],
       device='cuda:0')


Training:   3%|▎         | 28/1000 [00:16<05:02,  3.22it/s, loss=9.49]

tensor([  198,   198,   198,   198,   198,   198,   198,   198, 37184,   198],
       device='cuda:0')
tensor([ 2745,  3895,   351,  3878,    30,   408, 19550,  2828, 37184,    28],
       device='cuda:0')


Training:   3%|▎         | 29/1000 [00:16<04:56,  3.28it/s, loss=9.49]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 198, 7993, 5612,   28,  702,  345,  370,  990, 5249,   28],
       device='cuda:0')


Training:   3%|▎         | 30/1000 [00:16<04:54,  3.29it/s, loss=9.13]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 1378,    30,   198,   198, 38371,  9333,    42,   198, 10539,    28],
       device='cuda:0')


Training:   3%|▎         | 31/1000 [00:16<04:50,  3.33it/s, loss=9.13]

tensor([ 198,  198,  198,  198,  198,  198,  198, 3020, 2113,   57],
       device='cuda:0')
tensor([  28,  732, 1745,   47,  198,  198,   67, 3020, 2113,   57],
       device='cuda:0')


Training:   3%|▎         | 32/1000 [00:17<04:50,  3.34it/s, loss=9.23]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([2428,   42,  650, 2988,  288,  549,  436,   28,  198, 3681],
       device='cuda:0')


Training:   3%|▎         | 33/1000 [00:17<04:47,  3.37it/s, loss=9.18]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 198, 2683,  359,  253, 4469,   28,  359,  346,   47,  198],
       device='cuda:0')


Training:   3%|▎         | 34/1000 [00:17<04:40,  3.45it/s, loss=9.25]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 2526,  1209,  2630,    43,   288, 10297,  1272,   805,   198,  5195],
       device='cuda:0')


Training:   4%|▎         | 35/1000 [00:18<04:38,  3.47it/s, loss=9.24]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([3032,  198, 5195,  511,  564,  392, 2815,   43,  327,  638],
       device='cuda:0')


Training:   4%|▎         | 36/1000 [00:18<04:33,  3.52it/s, loss=9]   

tensor([ 198,  198, 2680,  198,  198,  198,  198,  198,  198,  198],
       device='cuda:0')
tensor([  198,    61,   462,  3497,   288, 16945,  8047,    30,  1249,    28],
       device='cuda:0')


Training:   4%|▎         | 37/1000 [00:18<04:33,  3.52it/s, loss=9]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([6737,  284,  198, 4038, 1390,  288, 1972, 1092,  260,  701],
       device='cuda:0')


Training:   4%|▍         | 38/1000 [00:18<04:38,  3.45it/s, loss=9]

tensor([ 198,  198,  198,  198,  198, 4501,  198,  198,  198,  198],
       device='cuda:0')
tensor([ 2240,  7268,    28,   198,    57,  1643,   637, 15006,  2240,  7904],
       device='cuda:0')


Training:   4%|▍         | 39/1000 [00:19<04:40,  3.42it/s, loss=9.2] 

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 6824,   253, 26156,    28,   198,  2068,  7471,   623,  3506,   876],
       device='cuda:0')


Training:   4%|▍         | 40/1000 [00:19<04:37,  3.45it/s, loss=9.16]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  288, 10419,    28,   198,  1937, 10419,   523,  1188,   260,  1386],
       device='cuda:0')


Training:   4%|▍         | 41/1000 [00:19<04:38,  3.44it/s, loss=9.16]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   28,   327,  1839,   506, 15215,    28,  8913,    28,   284,  7241],
       device='cuda:0')


Training:   4%|▍         | 42/1000 [00:20<04:42,  3.39it/s, loss=9.11]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   42,   198,    51, 10942,   441,   295,   672,   482,    28,  3506],
       device='cuda:0')


Training:   4%|▍         | 43/1000 [00:20<04:44,  3.37it/s, loss=9]   

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  28,  685, 1683,  351,  549,   30,  198,  198,   55, 6426],
       device='cuda:0')


Training:   4%|▍         | 44/1000 [00:20<04:45,  3.35it/s, loss=8.97]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([1970,   30,  198, 8653,  339, 2526,  832, 2494,  284, 3287],
       device='cuda:0')


Training:   4%|▍         | 45/1000 [00:21<04:41,  3.39it/s, loss=8.93]

tensor([198, 198, 198, 198, 198, 198, 198,  57, 198, 198], device='cuda:0')
tensor([44228,    17,   423,   198,   198, 41074,  2680, 35135,    55,  2285],
       device='cuda:0')


Training:   5%|▍         | 46/1000 [00:21<04:32,  3.51it/s, loss=8.87]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  381,    28,   198,  3528,  1188,  2276,   635,  5585,   327, 13987],
       device='cuda:0')


Training:   5%|▍         | 47/1000 [00:21<04:30,  3.53it/s, loss=8.81]

tensor([198, 198, 198, 198, 198, 198, 198,  57, 198, 198], device='cuda:0')
tensor([  288,   874,    30,   198,   198, 41074,  2680, 19642, 25630,  5431],
       device='cuda:0')


Training:   5%|▍         | 48/1000 [00:21<04:27,  3.56it/s, loss=8.8] 

tensor([198, 198, 198, 198, 198, 198, 198,  28, 198, 198], device='cuda:0')
tensor([4888, 6737,  990, 1029,   43,  198,   63,   28,  965, 2585],
       device='cuda:0')


Training:   5%|▍         | 49/1000 [00:22<04:28,  3.54it/s, loss=8.8]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 6836, 29820,  1272,    42,   637, 34818,   253,   754, 10934, 15786],
       device='cuda:0')


Training:   5%|▌         | 50/1000 [00:22<04:29,  3.52it/s, loss=8.57]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  957,  5717,    28, 13735,   441,   549,    43,   198,    57,   744],
       device='cuda:0')


Training:   5%|▌         | 51/1000 [00:22<04:26,  3.56it/s, loss=8.65]

tensor([198, 198, 198, 198, 198, 198,  28, 198, 198, 198], device='cuda:0')
tensor([   71, 20055,  9536,    42,   198,  2696,    28,  4074,   572, 45927],
       device='cuda:0')


Training:   5%|▌         | 52/1000 [00:23<04:25,  3.56it/s, loss=8.63]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 469, 1076, 1191,  355,  469, 7003,   29,  102,  520,  781],
       device='cuda:0')


Training:   5%|▌         | 53/1000 [00:23<04:24,  3.58it/s, loss=8.53]

tensor([198, 198, 198,  28, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  653, 14119, 21723, 48384,    17,   198,   198,    52,    69,  5229],
       device='cuda:0')


Training:   5%|▌         | 54/1000 [00:23<04:27,  3.53it/s, loss=8.53]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([2205,  282, 1123, 3261, 9446,  506, 2112,   47,  198,  198],
       device='cuda:0')


Training:   6%|▌         | 55/1000 [00:23<04:32,  3.47it/s, loss=8.49]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  198,    52,    69,  5229,  9248,  6016,   718, 25089,    42,   198],
       device='cuda:0')


Training:   6%|▌         | 56/1000 [00:24<04:31,  3.47it/s, loss=8.49]

tensor([198, 198, 198,  28, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   30,   198,    63,   926,   457,   339,  3984,   282, 21683,  1800],
       device='cuda:0')


Training:   6%|▌         | 57/1000 [00:24<04:32,  3.46it/s, loss=8.29]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([22534, 48384,    23, 12575, 15534,    42,   198,  5965,  5295,   332],
       device='cuda:0')


Training:   6%|▌         | 58/1000 [00:24<04:30,  3.48it/s, loss=8.22]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([39813,  2097, 25466,    42,   198,  5315,  1123,  2552,  5612,   288],
       device='cuda:0')


Training:   6%|▌         | 59/1000 [00:24<04:33,  3.44it/s, loss=8.22]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  258,   469, 31923,   974,    30,   198,   198,    66,  2062,  7430],
       device='cuda:0')


Training:   6%|▌         | 60/1000 [00:25<04:33,  3.43it/s, loss=8.2] 

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  536, 14053,   609,   957,  2112,   351, 24478,   621, 13056,   198],
       device='cuda:0')


Training:   6%|▌         | 61/1000 [00:25<04:34,  3.42it/s, loss=8.22]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([1671,   28,  198,  504, 4033,  282, 3996,  284,  653, 6349],
       device='cuda:0')


Training:   6%|▌         | 62/1000 [00:25<04:27,  3.50it/s, loss=8.16]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([23487,   469,  1761, 12630,    28,   198, 16721,   260,  1048,  2359],
       device='cuda:0')


Training:   6%|▋         | 63/1000 [00:26<04:22,  3.57it/s, loss=8.19]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   42,   198,    57,  2161,   536, 13168,  1992,   322,   346, 44228],
       device='cuda:0')


Training:   6%|▋         | 64/1000 [00:26<04:22,  3.57it/s, loss=8.19]

tensor([198, 198, 198, 198, 198,  28, 198, 198, 198,  42], device='cuda:0')
tensor([   30,   198,   198,    50, 15604,    59,  4728,    56,  4153,    42],
       device='cuda:0')


Training:   6%|▋         | 65/1000 [00:26<04:27,  3.49it/s, loss=7.99]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 2767, 20322,   418,   260, 29289,    43,   198,  1082,   105,  3379],
       device='cuda:0')


Training:   7%|▋         | 66/1000 [00:26<04:31,  3.44it/s, loss=8.11]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 1771,   368,  3497,    42,   198, 12192,    29, 23724, 45927,  8759],
       device='cuda:0')


Training:   7%|▋         | 67/1000 [00:27<04:32,  3.42it/s, loss=8.16]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([1991,  346,  523,  355,  787,   30,  198,  198,   59, 4728],
       device='cuda:0')


Training:   7%|▋         | 68/1000 [00:27<04:27,  3.49it/s, loss=8.07]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   28,   198,    71,  3592,   506,  6621,    28,   905,   506, 15786],
       device='cuda:0')


Training:   7%|▋         | 69/1000 [00:27<04:25,  3.50it/s, loss=8.07]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   88,   552,    28,   198,  3528,  3727,   549,   638,   288, 27397],
       device='cuda:0')


Training:   7%|▋         | 70/1000 [00:28<04:28,  3.46it/s, loss=7.92]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  787,   540, 11393,   282,   260,  9202,  1670,   198,  2193, 20322],
       device='cuda:0')


Training:   7%|▋         | 71/1000 [00:28<04:27,  3.47it/s, loss=7.93]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([16087,   358,  3075,   874,   441,   288,  2606,    28,   198, 29752],
       device='cuda:0')


Training:   7%|▋         | 72/1000 [00:28<04:29,  3.45it/s, loss=7.93]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([10278,    28,   281,   480, 14553,    30,   198,   198,    59,  4728],
       device='cuda:0')


Training:   7%|▋         | 73/1000 [00:29<04:30,  3.43it/s, loss=7.76]

tensor([198, 198, 198, 198, 198, 198,  42, 198, 198, 198], device='cuda:0')
tensor([   47,   198,   198, 39813,  2097, 25466,    42,   198,  5345,    28],
       device='cuda:0')


Training:   7%|▋         | 74/1000 [00:29<04:26,  3.47it/s, loss=7.89]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   28, 14067,   335,    28,  1675,   392,   359,   614,   281,  6548],
       device='cuda:0')


Training:   8%|▊         | 75/1000 [00:29<04:27,  3.46it/s, loss=7.89]

tensor([198, 198, 198, 198, 198, 198,  28, 198, 198, 198], device='cuda:0')
tensor([  284,  1188,    28,   957, 47910, 21723,    30,   198,   198,    59],
       device='cuda:0')


Training:   8%|▊         | 76/1000 [00:30<04:28,  3.44it/s, loss=7.79]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([18266,   284,   260, 27508,   655,   198,    51,  7380,   767,   260],
       device='cuda:0')


Training:   8%|▊         | 77/1000 [00:30<04:25,  3.47it/s, loss=7.72]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   28,   967,  5337,   392,  3568,  1523,   198, 39855,  2161,   457],
       device='cuda:0')


Training:   8%|▊         | 78/1000 [00:30<04:22,  3.51it/s, loss=7.81]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  267, 30250,   198,  5519,   339,   475,  2593,  6737,   767,    28],
       device='cuda:0')


Training:   8%|▊         | 79/1000 [00:30<04:18,  3.56it/s, loss=7.83]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 4145, 46160,   284, 42521,   288,   957,  1904,  4778,    43,   198],
       device='cuda:0')


Training:   8%|▊         | 80/1000 [00:31<04:13,  3.63it/s, loss=7.77]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 7576,   253, 10001, 29562,   198,  8155,   281,   260,  1450,  1670],
       device='cuda:0')


Training:   8%|▊         | 81/1000 [00:31<04:12,  3.64it/s, loss=7.73]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   28,   198, 25885, 26429, 12101,   638,   288, 47850, 20322,    42],
       device='cuda:0')


Training:   8%|▊         | 82/1000 [00:31<04:10,  3.66it/s, loss=7.77]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 372,  323,   82, 1484,  284, 7576,  359,  702,  827, 1800],
       device='cuda:0')


Training:   8%|▊         | 83/1000 [00:31<04:11,  3.65it/s, loss=7.68]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([3726, 4157,   28, 1980, 1556,  282, 1029,   28,  198,   68],
       device='cuda:0')


Training:   8%|▊         | 84/1000 [00:32<04:13,  3.62it/s, loss=7.6] 

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([10054,  2526,  4571,   957,  6943,    30,   198,   198,    59,  4728],
       device='cuda:0')


Training:   8%|▊         | 85/1000 [00:32<04:14,  3.60it/s, loss=7.65]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([20641,   645,   346,   599,   277,   351,  1272,    47,   198,   198],
       device='cuda:0')


Training:   9%|▊         | 86/1000 [00:32<04:16,  3.57it/s, loss=7.65]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 7501,  1114,   287,   789, 43228,   260, 31049,    42,   198, 21553],
       device='cuda:0')


Training:   9%|▊         | 87/1000 [00:33<04:19,  3.52it/s, loss=7.48]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  252,  1207,  6737,   578,   284, 43537,   318,  1076, 35216,    42],
       device='cuda:0')


Training:   9%|▉         | 88/1000 [00:33<04:19,  3.52it/s, loss=7.62]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 318, 4699,  198, 2068,  963,  451, 1861,   30, 1626,   29],
       device='cuda:0')


Training:   9%|▉         | 89/1000 [00:33<04:18,  3.53it/s, loss=7.38]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   88,  9535,  2751,  1352,   198,  3528,  7471,   253, 40590, 27044],
       device='cuda:0')


Training:   9%|▉         | 90/1000 [00:33<04:15,  3.56it/s, loss=7.38]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([1222,   28,  347,  339, 2422,  411,   28,  339, 1217,  665],
       device='cuda:0')


Training:   9%|▉         | 91/1000 [00:34<04:15,  3.55it/s, loss=7.4] 

tensor([ 198,  198,  198,  198,  198,  198,  198,  198, 2680,   42],
       device='cuda:0')
tensor([  589,    28,  7706,    47,   198,   198,    56,  2680, 24625, 33702],
       device='cuda:0')


Training:   9%|▉         | 92/1000 [00:34<04:15,  3.55it/s, loss=7.35]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   30,   198,  2683,   457,   253,  4132,    28,   330, 13468,   290],
       device='cuda:0')


Training:   9%|▉         | 93/1000 [00:34<04:12,  3.59it/s, loss=7.47]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  198, 28925,   260, 14568,  6956,   282,   278,  4314,   358,    43],
       device='cuda:0')


Training:   9%|▉         | 94/1000 [00:35<04:13,  3.57it/s, loss=7.42]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 1319,   549,  3287,   282,  3878,   564, 20727,    30,   198,  2705],
       device='cuda:0')


Training:  10%|▉         | 95/1000 [00:35<04:06,  3.67it/s, loss=7.28]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([1038,  564, 2112,  198, 3528,  338, 1165, 1743,  282,  260],
       device='cuda:0')


Training:  10%|▉         | 96/1000 [00:35<04:06,  3.66it/s, loss=7.43]

tensor([ 198,  198,  198, 2680,   42,  198,  198,  198,  198,  198],
       device='cuda:0')
tensor([  198,   198,    56,  2680, 24625,   389,  7113,  4728,    50,  3911],
       device='cuda:0')


Training:  10%|▉         | 97/1000 [00:35<04:07,  3.65it/s, loss=7.14]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   28,   284,   650,  1911,   198,  2068, 17831,  2754,   282,   469],
       device='cuda:0')


Training:  10%|▉         | 98/1000 [00:36<04:08,  3.64it/s, loss=7.17]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 339, 3060, 1928,   28,  284, 6819, 1147,   43,  198, 2193],
       device='cuda:0')


Training:  10%|▉         | 99/1000 [00:36<04:09,  3.61it/s, loss=7.11]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   28,   198, 22204,  3930,    28,   359,   511,   260,  2321,  8770],
       device='cuda:0')


Training:  10%|█         | 100/1000 [00:36<04:09,  3.60it/s, loss=7.2] 

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  524,   284, 23414,   198,    68,   388, 17072,   260,  8423,    29],
       device='cuda:0')


Training:  10%|█         | 101/1000 [00:36<04:08,  3.62it/s, loss=7.02]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  60, 4768,   28,  346,  338, 1535,  359,  656,  653, 8641],
       device='cuda:0')


Training:  10%|█         | 102/1000 [00:37<04:11,  3.57it/s, loss=7.02]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 957, 8664,  506, 9596,  198, 2068, 3423,   85,  936,  260],
       device='cuda:0')


Training:  10%|█         | 103/1000 [00:37<04:16,  3.50it/s, loss=7.01]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 536,  441,  963,   28,  198, 5965, 3506, 8739,  267, 1878],
       device='cuda:0')


Training:  10%|█         | 104/1000 [00:37<04:15,  3.51it/s, loss=7.01]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   43,   198, 14344,  1928,   339,  8949,    28,   284,  3310,  1188],
       device='cuda:0')


Training:  10%|█         | 105/1000 [00:37<04:22,  3.42it/s, loss=6.94]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   30,   198,  2696,    28,   411,  8949, 19461,    28,   411,   957],
       device='cuda:0')


Training:  11%|█         | 106/1000 [00:38<04:23,  3.40it/s, loss=6.98]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   28,   347, 17072,  1012,   506,   100,    28, 13987,  8135,  1301],
       device='cuda:0')


Training:  11%|█         | 107/1000 [00:38<04:19,  3.44it/s, loss=6.92]

tensor([3911,  198,  198, 3911, 9620,   42,  198,  198,  198,  198],
       device='cuda:0')
tensor([7113, 4728,   50, 3911, 9620,   42,  198, 3825,  511,  957],
       device='cuda:0')


Training:  11%|█         | 108/1000 [00:38<04:17,  3.46it/s, loss=6.92]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 787,  555, 2093, 2216,  564,  338, 8177, 2767,  198, 5195],
       device='cuda:0')


Training:  11%|█         | 109/1000 [00:39<04:22,  3.40it/s, loss=6.9] 

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 2164,   338,   536,  8018,   737,    28,   198, 27737,   536,   339],
       device='cuda:0')


Training:  11%|█         | 110/1000 [00:39<04:30,  3.29it/s, loss=7.02]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 5852,   429,   469, 15062,    28,   198,  3280,  1556,   282, 19888],
       device='cuda:0')


Training:  11%|█         | 111/1000 [00:39<04:30,  3.29it/s, loss=6.95]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 2717,    17,   198, 11114,  1063,   282,  1022,    28,  5681,  7864],
       device='cuda:0')


Training:  11%|█         | 112/1000 [00:40<04:26,  3.33it/s, loss=6.95]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 3786,   325,    42,   198, 14413,   335,   540,  1737,    28,   282],
       device='cuda:0')


Training:  11%|█▏        | 113/1000 [00:40<04:27,  3.32it/s, loss=7.03]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([17264,    17,   423,   198,    71,   518,   282,   354,  1850,    42],
       device='cuda:0')


Training:  11%|█▏        | 114/1000 [00:40<04:26,  3.33it/s, loss=6.94]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  715, 11415,  1088,   383,    42,   198,  1882,  3060,   457,   787],
       device='cuda:0')


Training:  12%|█▏        | 115/1000 [00:41<04:24,  3.34it/s, loss=6.89]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  808,   288,  7012,    28,   198, 19671,   601,  1554,   282,  1123],
       device='cuda:0')


Training:  12%|█▏        | 116/1000 [00:41<04:21,  3.38it/s, loss=6.89]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  585, 17072,  2265,    28,   198, 21175,   253,  3506,  4313,   284],
       device='cuda:0')


Training:  12%|█▏        | 117/1000 [00:41<04:20,  3.40it/s, loss=6.92]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198,  42], device='cuda:0')
tensor([   90,   424,  1147,    30,   198, 46292,    95,    17,   948,  1970],
       device='cuda:0')


Training:  12%|█▏        | 118/1000 [00:41<04:13,  3.48it/s, loss=6.68]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  339,   699,   260,  2139,    42,   198,  9393, 17072,   441, 30493],
       device='cuda:0')


Training:  12%|█▏        | 119/1000 [00:42<04:11,  3.50it/s, loss=6.68]

tensor([198, 198, 198,  28, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([20322,   957, 21723,  2126,   260,   905,    30,   198,   198,    62],
       device='cuda:0')


Training:  12%|█▏        | 120/1000 [00:42<04:15,  3.45it/s, loss=6.89]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([21965, 17072,   719,    28,   965,    47,   198,   198,  3911,  3945],
       device='cuda:0')


Training:  12%|█▏        | 121/1000 [00:42<04:19,  3.38it/s, loss=6.9] 

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  874,    43,  9725,    95,   253, 12724, 13285,    43, 41916,   253],
       device='cuda:0')


Training:  12%|█▏        | 122/1000 [00:43<04:22,  3.35it/s, loss=6.68]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 282,  650,  651,  456,  105,   47,  198,  198, 3911, 3945],
       device='cuda:0')


Training:  12%|█▏        | 123/1000 [00:43<04:17,  3.41it/s, loss=6.62]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  100, 17072,  8177,    47,   198, 12908,  3661,   325,  8177,    28],
       device='cuda:0')


Training:  12%|█▏        | 124/1000 [00:43<04:14,  3.44it/s, loss=6.73]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 337, 4048,   43,  198, 2596,  957, 2629, 2606,  314, 5951],
       device='cuda:0')


Training:  12%|█▎        | 125/1000 [00:43<04:14,  3.43it/s, loss=6.73]

tensor([2680,  198, 4501,   42,  198,  198,   28,  198,   28,  198],
       device='cuda:0')
tensor([31540,  4628,  8772,    42,   198, 34056,    28, 38061,    28,   469],
       device='cuda:0')


Training:  13%|█▎        | 126/1000 [00:44<04:17,  3.40it/s, loss=6.8] 

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([1607,  284, 6917,   30,  198, 1348,  314,  260, 4569,   28],
       device='cuda:0')


Training:  13%|█▎        | 127/1000 [00:44<04:17,  3.39it/s, loss=6.75]

tensor([198, 198, 198, 198, 198, 198,  28,  42, 198, 198], device='cuda:0')
tensor([  359,  7270,    47,   198,   198,    62, 10942,    42,   198, 35251],
       device='cuda:0')


Training:  13%|█▎        | 128/1000 [00:44<04:17,  3.38it/s, loss=6.75]

tensor([198,  28, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 905,  314, 2932,  284, 3599,   30,  198,  198, 3911, 3945],
       device='cuda:0')


Training:  13%|█▎        | 129/1000 [00:45<04:17,  3.39it/s, loss=6.6] 

tensor([  198,   198,   198,   198,   198,   198,   198,    42,   198, 18630],
       device='cuda:0')
tensor([13735,   739, 34104,    30,   198,   198,    54,  5135,  2285, 18630],
       device='cuda:0')


Training:  13%|█▎        | 130/1000 [00:45<04:15,  3.40it/s, loss=6.6]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 2275,   392,  3408,  1272,  1176,  8322,    28,   198, 16721,   653],
       device='cuda:0')


Training:  13%|█▎        | 131/1000 [00:45<04:15,  3.41it/s, loss=6.54]

tensor([198, 198, 198, 198,  42,  28, 198,  42, 198, 198], device='cuda:0')
tensor([  198,    60,  3907,    73, 29146,  6565,  3438,    42,   198,  5195],
       device='cuda:0')


Training:  13%|█▎        | 132/1000 [00:46<04:12,  3.43it/s, loss=6.35]

tensor([ 28,  28, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  313,    42,   198,    57,  3287,   787, 33974,    30,   198,   198],
       device='cuda:0')


Training:  13%|█▎        | 133/1000 [00:46<04:10,  3.46it/s, loss=6.5] 

tensor([198, 198,  42, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  57, 3438,   42,  198, 2068, 2988,  338,   28,  339,  868],
       device='cuda:0')


Training:  13%|█▎        | 134/1000 [00:46<04:08,  3.48it/s, loss=6.45]

tensor([198, 198, 198, 198, 198, 198, 198, 198,  28, 198], device='cuda:0')
tensor([17072, 36034, 22576,    28,   198,  4370,   449, 30493,   411,   957],
       device='cuda:0')


Training:  14%|█▎        | 135/1000 [00:46<04:12,  3.43it/s, loss=6.45]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198,  28], device='cuda:0')
tensor([1225,   17,  198, 4370,  449,  339,  441,   28,  965,   28],
       device='cuda:0')


Training:  14%|█▎        | 136/1000 [00:47<04:14,  3.40it/s, loss=6.57]

tensor([ 28, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 288,  685,   28,  564, 2093,  288, 1003,   30,  198,   63],
       device='cuda:0')


Training:  14%|█▎        | 137/1000 [00:47<04:11,  3.43it/s, loss=6.71]

tensor([198, 198, 198, 198,  28, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  331, 26365,   253,  6243,  2139,    30,   198,   198, 43029,  1754],
       device='cuda:0')


Training:  14%|█▍        | 138/1000 [00:47<04:08,  3.47it/s, loss=6.59]

tensor([ 198,   28,  198,  198,  198,  198,  198, 3945,  198,   28],
       device='cuda:0')
tensor([1503,  346, 5173,   30,  198,  198, 3911, 3945,   63,   42],
       device='cuda:0')


Training:  14%|█▍        | 139/1000 [00:47<04:06,  3.50it/s, loss=6.59]

tensor([ 28, 198, 198, 198, 198, 198, 198,  28, 198, 198], device='cuda:0')
tensor([ 9154,  5569,    28, 13804,   441,   253, 22415,   555,    43,   198],
       device='cuda:0')


Training:  14%|█▍        | 140/1000 [00:48<04:07,  3.48it/s, loss=6.5] 

tensor([ 28, 198,  28, 198,  28, 198, 198, 198, 198, 198], device='cuda:0')
tensor([   30, 14755,    28,  1690,   429,   338, 10172,   198,  6228,  2112],
       device='cuda:0')


Training:  14%|█▍        | 141/1000 [00:48<04:03,  3.52it/s, loss=6.53]

tensor([198, 198,  28, 198, 198, 198,  28, 198, 198,  28], device='cuda:0')
tensor([7200,  874,  429,  451, 1796, 6724,   28,  198, 6882,  281],
       device='cuda:0')


Training:  14%|█▍        | 142/1000 [00:48<04:02,  3.54it/s, loss=6.53]

tensor([ 28, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  315,   851,    28,   198,    57, 14941,   411, 10419,   623,  3497],
       device='cuda:0')


Training:  14%|█▍        | 143/1000 [00:49<04:04,  3.50it/s, loss=6.34]

tensor([198, 198, 198, 198, 198, 198,  28, 198,  28, 198], device='cuda:0')
tensor([   42,   198,    57,   744,   260,  4132,   282,  6661,   260, 25729],
       device='cuda:0')


Training:  14%|█▍        | 144/1000 [00:49<04:06,  3.47it/s, loss=6.34]

tensor([198, 198, 198,  28, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 254,  349,  269,   30,  198, 2596,  325,  357,  347,  357],
       device='cuda:0')


Training:  14%|█▍        | 145/1000 [00:49<04:09,  3.43it/s, loss=6.3] 

tensor([ 28, 198, 198, 198, 198,  42,  42, 198, 198, 198], device='cuda:0')
tensor([ 1448,    30,   198,   198,    73, 25089,    42,   198,  9629,   732],
       device='cuda:0')


Training:  15%|█▍        | 146/1000 [00:50<04:06,  3.47it/s, loss=6.55]

tensor([198, 198, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 2853,    96,  6737,   260,  7429,   837, 13987,  1924,   868,  4137],
       device='cuda:0')


Training:  15%|█▍        | 147/1000 [00:50<04:03,  3.50it/s, loss=6.49]

tensor([198, 198, 198, 198,  28, 198,  42,  42, 198, 198], device='cuda:0')
tensor([   30,   198,   198,  7430,  8107,    54, 10030,    42,   198, 45496],
       device='cuda:0')


Training:  15%|█▍        | 148/1000 [00:50<03:59,  3.56it/s, loss=6.43]

tensor([198,  28, 198, 198, 198, 198, 198,  28, 198, 198], device='cuda:0')
tensor([  588,   198,  5195, 12765,   416,   339,  1089,  3806,   957,  3497],
       device='cuda:0')


Training:  15%|█▍        | 149/1000 [00:50<03:56,  3.61it/s, loss=6.56]

tensor([ 28, 198, 198, 198,  28, 198, 198, 198, 198,  28], device='cuda:0')
tensor([ 4196,   384,   392,   437,    28,   198,   504, 42502, 15731,  4502],
       device='cuda:0')


Training:  15%|█▌        | 150/1000 [00:51<03:51,  3.67it/s, loss=6.51]

tensor([ 28, 198,  28, 198,  28, 260,  28, 198, 198, 198], device='cuda:0')
tensor([10937,  1209,   650, 26467,   282, 12669,    30,   198,   198, 29719],
       device='cuda:0')


Training:  15%|█▌        | 151/1000 [00:51<03:51,  3.67it/s, loss=6.24]

tensor([ 28, 198,  28, 198, 198, 198,  28, 198,  28,  28], device='cuda:0')
tensor([15032,   480, 10172,    28,   198, 16654,   259,   480,  1038,  2397],
       device='cuda:0')


Training:  15%|█▌        | 152/1000 [00:51<03:53,  3.64it/s, loss=6.44]

tensor([ 198,  198,   28,   28,  198,  198,  198, 2680,  198,  198],
       device='cuda:0')
tensor([ 4875,   549,  3287,    30,   198,   198, 41074,  2680, 35135,    55],
       device='cuda:0')


Training:  15%|█▌        | 153/1000 [00:51<03:54,  3.61it/s, loss=6.43]

tensor([198, 198,  28, 198, 198,  28,  28,  28,  28, 198], device='cuda:0')
tensor([  281,   544,   381,   411,  4360,  4565,  1802, 11064,    47,   198],
       device='cuda:0')


Training:  15%|█▌        | 154/1000 [00:52<03:54,  3.60it/s, loss=6.28]

tensor([ 28,  28, 198, 198, 198, 198, 198, 198, 198, 198], device='cuda:0')
tensor([ 4463,    28,   198,  2427,  1176,    28, 37494,    28,   284, 33974],
       device='cuda:0')


Training:  16%|█▌        | 155/1000 [00:52<03:51,  3.64it/s, loss=6.42]

tensor([198, 198, 198,  28, 198, 198, 198,  28, 198,  28], device='cuda:0')
tensor([  351,   957,  8664,    17,   198,  5965,  2606,   284,  3696, 36644],
       device='cuda:0')


Training:  16%|█▌        | 156/1000 [00:52<03:54,  3.60it/s, loss=6.21]

tensor([ 28,  28,  28, 198, 198, 198, 198,  28,  28, 198], device='cuda:0')
tensor([ 2874,  1592,    43,   198,  3528,    28,  1953,  4649, 13987,  3289],
       device='cuda:0')


Training:  16%|█▌        | 157/1000 [00:53<03:56,  3.57it/s, loss=6.21]

tensor([ 28, 198,  28,  28, 198, 198, 198,  28,  28, 198], device='cuda:0')
tensor([1251,  325, 2627,  277,  198, 2068,  685, 1683,  351,  468],
       device='cuda:0')


Training:  16%|█▌        | 158/1000 [00:53<04:04,  3.45it/s, loss=5.94]

tensor([198, 198, 198,  42, 198,  42, 198, 198, 198,  28], device='cuda:0')
tensor([   60,  3907,    73, 23675,    73,    42,   198,  5965,  2606, 10937],
       device='cuda:0')


Training:  16%|█▌        | 159/1000 [00:53<04:03,  3.46it/s, loss=6.43]

tensor([ 28, 198,  28,  28,  28, 198,  28, 198, 198, 198], device='cuda:0')
tensor([16568,  8767,   301,  2177,   351,   634,   278,  6842,    28,   198],
       device='cuda:0')


Training:  16%|█▌        | 160/1000 [00:54<03:59,  3.51it/s, loss=6.44]

tensor([ 28, 198, 198, 198, 198,  42,  42, 198, 198, 198], device='cuda:0')
tensor([   30,   198,   198, 29719,    71, 27163,    42,   198,  5212, 14942],
       device='cuda:0')


Training:  16%|█▌        | 161/1000 [00:54<03:57,  3.54it/s, loss=6.26]

tensor([198, 198, 198, 198, 198, 198, 198,  42,  42,  42], device='cuda:0')
tensor([   30,   198,   198,    59,  4728, 19436,    71,  2721, 38744,    42],
       device='cuda:0')


Training:  16%|█▌        | 162/1000 [00:54<03:54,  3.57it/s, loss=6.18]

tensor([ 28, 198, 198, 198, 198,  28,  28, 198, 198,  28], device='cuda:0')
tensor([   28,   284, 18089,  7065,  9446,  5182,    43,   198,  3528,    28],
       device='cuda:0')


Training:  16%|█▋        | 163/1000 [00:54<03:51,  3.62it/s, loss=6.1] 

tensor([198,  28,  28,  28, 198, 198,  28,  28, 198,  28], device='cuda:0')
tensor([ 5151, 11948,  5559,   335,   469,   725,  6221,   506,  4132,    28],
       device='cuda:0')


Training:  16%|█▋        | 164/1000 [00:55<03:52,  3.59it/s, loss=6.16]

tensor([ 28, 198, 198,  28, 198,  28, 198, 198,  28,  28], device='cuda:0')
tensor([ 744,  339, 2090,  282, 9970,   30,  198, 2696, 3472, 1303],
       device='cuda:0')


Training:  16%|█▋        | 165/1000 [00:55<03:53,  3.57it/s, loss=6.16]

tensor([   28,   198,   198,   198,  4728,   198,   198, 20055,    42,    42],
       device='cuda:0')
tensor([   30,   198,   198,    59,  4728, 15494,    71, 20055,  9536,    42],
       device='cuda:0')


Training:  17%|█▋        | 166/1000 [00:55<03:56,  3.53it/s, loss=6.12]

tensor([ 28,  28, 198, 198,  42, 198, 198,  28,  28,  42], device='cuda:0')
tensor([4658,  282, 4929, 9686,  198, 4948, 3100, 6616,  480, 2184],
       device='cuda:0')


Training:  17%|█▋        | 167/1000 [00:55<03:51,  3.60it/s, loss=6.04]

tensor([ 28, 198,  28, 198, 198,  28,  28,  28,  28,  42], device='cuda:0')
tensor([  260,  1532,    28,   198, 16075,  3310,  1869, 18598,   283,   494],
       device='cuda:0')


Training:  17%|█▋        | 168/1000 [00:56<03:53,  3.57it/s, loss=6.04]

tensor([  198,   198,   198,   198, 20055,    42,   198,   198,    28,    28],
       device='cuda:0')
tensor([  198,   198, 29719,    71, 27163,    42,   198,  1780,  8296,    28],
       device='cuda:0')


Training:  17%|█▋        | 169/1000 [00:56<03:56,  3.52it/s, loss=5.97]

tensor([198,  28, 198,  28, 198,  28,  28, 198, 198,  28], device='cuda:0')
tensor([17072,   263,   432,   260,  2240, 17816,    28,   198, 15024,   494],
       device='cuda:0')


Training:  17%|█▋        | 170/1000 [00:56<03:53,  3.56it/s, loss=6.21]

tensor([ 28, 198, 198,  28,  28,  28, 198, 198,  28, 198], device='cuda:0')
tensor([3934,  260, 3102,  284, 3568,   47,  198, 6882, 3786,  392],
       device='cuda:0')


Training:  17%|█▋        | 171/1000 [00:57<03:50,  3.60it/s, loss=6.12]

tensor([ 28,  28,  28, 198,  28,  28, 198, 198,  28,  28], device='cuda:0')
tensor([   93,   284,   702,   253, 27508, 18030,   198,  3825, 12885,   739],
       device='cuda:0')


Training:  17%|█▋        | 172/1000 [00:57<03:52,  3.56it/s, loss=6.12]

tensor([ 28,  28,  28,  28, 198, 198,  28, 198,  28,  28], device='cuda:0')
tensor([4875, 1272, 3287,   30,  198, 1780,   17,  416,  588, 1805],
       device='cuda:0')


Training:  17%|█▋        | 173/1000 [00:57<03:56,  3.49it/s, loss=6.15]

tensor([ 28, 198, 198,  28,  28,  28,  28, 198, 198,  28], device='cuda:0')
tensor([  28,  198, 3528, 1062, 3005,  474, 2442,   43, 3472,   28],
       device='cuda:0')


Training:  17%|█▋        | 174/1000 [00:57<03:52,  3.55it/s, loss=6.59]

tensor([ 28, 198, 198,  28, 198, 198,  28, 198,  28, 198], device='cuda:0')
tensor([  506,  9202, 15308,    28,   198,  3733,    29,    96,  2467,   821],
       device='cuda:0')


Training:  18%|█▊        | 175/1000 [00:58<03:48,  3.61it/s, loss=6.37]

tensor([ 28,  28, 198, 198, 260, 198, 198, 198, 198,  28], device='cuda:0')
tensor([ 1163,   198, 26001,   325,  1042,   614,    28,   957,  5717,    28],
       device='cuda:0')


Training:  18%|█▊        | 176/1000 [00:58<03:49,  3.59it/s, loss=6.34]

tensor([  28,  198,  198,  198,   42, 6801,   42,  198,  198,   28],
       device='cuda:0')
tensor([   47,   198,   198,  4796, 32598,  2097,    42,   198, 19525,    28],
       device='cuda:0')


Training:  18%|█▊        | 177/1000 [00:58<03:51,  3.56it/s, loss=6.46]

tensor([198,  28, 198,  28,  28, 198, 198,  28,  28,  28], device='cuda:0')
tensor([  599,    28,  3449,  2843,   198, 20314,   650,    99,   549,   288],
       device='cuda:0')


Training:  18%|█▊        | 178/1000 [00:59<03:50,  3.57it/s, loss=6.43]

tensor([198,  28,  28, 198,  28,  28, 198, 198, 198, 198], device='cuda:0')
tensor([ 1643,   982,   284, 16011,   982,    30,   198,   198, 39776,  4192],
       device='cuda:0')


Training:  18%|█▊        | 179/1000 [00:59<03:48,  3.59it/s, loss=6.3] 

tensor([ 28, 198, 198, 198,  42, 198,  42, 198, 198,  28], device='cuda:0')
tensor([   47,   198,   198, 39776,  4192,  6426,    42,   198,  3482,  3861],
       device='cuda:0')


Training:  18%|█▊        | 180/1000 [00:59<03:50,  3.55it/s, loss=6.3]

tensor([198,  28,  28, 198, 198,  28, 198, 198,  28, 198], device='cuda:0')
tensor([14229, 12557,   325,   957,  3289,    28,   284,  4160,   198,   504],
       device='cuda:0')


Training:  18%|█▊        | 181/1000 [00:59<03:55,  3.48it/s, loss=6.25]

tensor([198,  28, 198, 198, 198, 198,  28, 198,  28,  28], device='cuda:0')
tensor([ 357,  868,  325,   28,  198, 8113,  506,  354, 2490,   85],
       device='cuda:0')


Training:  18%|█▊        | 182/1000 [01:00<03:52,  3.52it/s, loss=6.29]

tensor([198,  28, 198, 198,  42,  28,  28, 198, 198,  28], device='cuda:0')
tensor([  346,    28,   355,   328,   101,   432, 31886,   198,  6882, 43624],
       device='cuda:0')


Training:  18%|█▊        | 183/1000 [01:00<03:51,  3.53it/s, loss=6.13]

tensor([ 28, 198,  42,  28, 198,  28, 198,  28, 198,  28], device='cuda:0')
tensor([  198,    54,  4985,   284,   430, 33659,  1718,    28,   441,   253],
       device='cuda:0')


Training:  18%|█▊        | 184/1000 [01:00<03:50,  3.53it/s, loss=6.1] 

tensor([198, 198, 198, 198,  28,  28,  42,  42, 198, 198], device='cuda:0')
tensor([   17,   198,   198, 17321,  5819,  2154,  4501,    42,   198,    57],
       device='cuda:0')


Training:  18%|█▊        | 185/1000 [01:01<03:48,  3.57it/s, loss=6.39]

tensor([ 28, 198,  28, 198,  28,  28, 198, 198, 198,  28], device='cuda:0')
tensor([   28,   963,   506,   100, 17072,    17,   327,   260,  2319,   198],
       device='cuda:0')


Training:  19%|█▊        | 186/1000 [01:01<03:46,  3.60it/s, loss=6.15]

tensor([ 42,  28, 198,  28, 198, 198,  28, 198, 198,  28], device='cuda:0')
tensor([15684,    28,   527,   339,   736, 20172,    42,   327, 19461,    28],
       device='cuda:0')


Training:  19%|█▊        | 187/1000 [01:01<03:45,  3.60it/s, loss=6.26]

tensor([ 28,  28,  28, 198, 198, 198,  28,  42, 198, 198], device='cuda:0')
tensor([  313,  2694,    30,   198,   198, 36169,   403,    42,   198,  5965],
       device='cuda:0')


Training:  19%|█▉        | 188/1000 [01:01<03:45,  3.60it/s, loss=6.49]

tensor([198,  28,  28, 198, 198, 198, 198,  28,  28, 198], device='cuda:0')
tensor([ 1038, 21723,    28,   198, 10576,   314,  3590,  1147,    42,  1188],
       device='cuda:0')


Training:  19%|█▉        | 189/1000 [01:02<03:45,  3.60it/s, loss=6.32]

tensor([ 28, 198, 198,  28,  28, 198,  28,  28, 198, 198], device='cuda:0')
tensor([   42,   198, 31000,  9677,    28,  3092,    81,    17,   198,   198],
       device='cuda:0')


Training:  19%|█▉        | 190/1000 [01:02<03:45,  3.59it/s, loss=6.57]

tensor([ 28,  28,  28,  28, 198, 198, 198, 198, 198, 260], device='cuda:0')
tensor([17926, 24113, 20322,   750,  2775,    43,   253,  2112,   288,   198],
       device='cuda:0')


Training:  19%|█▉        | 191/1000 [01:02<03:47,  3.56it/s, loss=6.57]

tensor([198, 198, 198, 198, 198, 198, 198, 198,  28, 198], device='cuda:0')
tensor([  744,   198,    57,   288,  4571,   327,   653, 11108,    29,  6691],
       device='cuda:0')


Training:  19%|█▉        | 192/1000 [01:02<03:50,  3.51it/s, loss=6.36]

tensor([198,  28,  28, 198, 198, 198,  28,  28, 198,  28], device='cuda:0')
tensor([47910, 21723,    28,   198,  2068,   410,   489,   418,   469, 23454],
       device='cuda:0')


Training:  19%|█▉        | 193/1000 [01:03<03:46,  3.57it/s, loss=6.34]

tensor([ 28, 198,  28, 198, 198, 198,  28, 198, 198, 198], device='cuda:0')
tensor([  339,   736,  6796,   198,  1348,  5569,   868,  1643,   637, 10372],
       device='cuda:0')


Training:  19%|█▉        | 194/1000 [01:03<03:47,  3.54it/s, loss=6.34]

tensor([ 28,  28, 198, 198, 198,  28,  42, 198, 198,  42], device='cuda:0')
tensor([ 1756,    30,   198,   198, 36169,   403,    42,   198,  3681, 26365],
       device='cuda:0')


Training:  20%|█▉        | 195/1000 [01:03<04:36,  2.91it/s, loss=6.55]

tensor([ 28, 198, 198,  28,  28, 198, 198, 198,  28,  28], device='cuda:0')
tensor([  253,  1035,  5740,   582,    30,   198,   198, 12850,  4545,    49],
       device='cuda:0')


Training:  20%|█▉        | 196/1000 [01:04<04:27,  3.00it/s, loss=6]   

tensor([  28,  198,  198,  198,  198,   42, 6426,   42,  198,  198],
       device='cuda:0')
tensor([ 6737,    30,   198,   198, 39776,  4192,  6426,    42,   198,  1348],
       device='cuda:0')


Training:  20%|█▉        | 197/1000 [01:04<04:15,  3.15it/s, loss=6.1]

tensor([ 28, 198,  42,  28, 198,  28, 198, 198, 198,  28], device='cuda:0')
tensor([ 253, 4565,  477,   43, 7223, 6737,   28,  198, 2596, 3878],
       device='cuda:0')


Training:  20%|█▉        | 198/1000 [01:04<04:07,  3.24it/s, loss=6.26]

tensor([ 28,  28,  28, 198,  42,  28, 198, 198, 198, 198], device='cuda:0')
tensor([ 550, 1075,  198,   54, 1738,  670, 2275,  355,  655,   30],
       device='cuda:0')


Training:  20%|█▉        | 199/1000 [01:05<04:01,  3.31it/s, loss=6.26]

tensor([ 28,  28, 198, 198, 198, 198, 198,  42, 198, 198], device='cuda:0')
tensor([ 2001,    30,   198,   198, 39776,  4192,  6426,    42,   198,    62],
       device='cuda:0')


Training:  20%|██        | 200/1000 [01:05<04:01,  3.31it/s, loss=6.05]

tensor([198,  42,  42, 198, 198, 198,  28, 198,  28, 198], device='cuda:0')
tensor([ 8113, 21660,    42,   198,  5965,  1861,    28, 38061,    28,   314],
       device='cuda:0')


Training:  20%|██        | 201/1000 [01:05<03:56,  3.37it/s, loss=6.04]

tensor([198, 198, 198, 198,  28, 198, 198, 198, 198, 198], device='cuda:0')
tensor([  347,   384, 48271,   468,    42,   384,   436,  2711,   288,   536],
       device='cuda:0')


Training:  20%|██        | 202/1000 [01:06<03:52,  3.43it/s, loss=6.06]

tensor([198, 198, 198, 198,  42,  28,  28,  42, 198, 198], device='cuda:0')
tensor([   30,   198,   198, 48902,  9232,  2680,  2097,    42,   198, 14229],
       device='cuda:0')


Training:  20%|██        | 203/1000 [01:06<03:47,  3.50it/s, loss=6.09]

tensor([ 28,  28, 198, 198,  28,  28, 198, 198, 198, 198], device='cuda:0')
tensor([ 924,   42,  957, 1450, 4445,  198,   57,  457,  429,  469],
       device='cuda:0')


Training:  20%|██        | 204/1000 [01:06<03:41,  3.59it/s, loss=6.62]

tensor([198, 198, 198,  28, 198, 198, 260, 198, 198,  28], device='cuda:0')
tensor([ 7697,   355, 25514,    43,   564,   281,   260,   198,  1527,  2832],
       device='cuda:0')


Training:  20%|██        | 205/1000 [01:06<03:38,  3.64it/s, loss=6.62]

tensor([ 28, 260,  28, 260, 198,  28, 198, 198,  28, 198], device='cuda:0')
tensor([  281,  3826,   282,   198, 11247,    42,   295,  1878,   351,   511],
       device='cuda:0')


Training:  21%|██        | 206/1000 [01:07<03:41,  3.58it/s, loss=5.96]

tensor([ 42,  57,  42, 198, 198, 198,  28,  28,  28, 198], device='cuda:0')
tensor([ 6565, 38415,    42,   198,  1653,  1041,  4161,  8361,  1325,    28],
       device='cuda:0')


Training:  21%|██        | 207/1000 [01:07<03:39,  3.61it/s, loss=6.24]

tensor([ 28,  28, 198, 198,  28,  28,  28,  28, 198,  28], device='cuda:0')
tensor([ 1272,   198, 35097,  1029, 25865,  1687,   346,    30,  1206, 13461],
       device='cuda:0')


Training:  21%|██        | 208/1000 [01:07<03:38,  3.62it/s, loss=6.03]

tensor([198,  28, 198, 198,  28, 198, 198, 198,  28, 260], device='cuda:0')
tensor([4778,  198, 2068,  582,  338,  416,  957,  599,  281, 1272],
       device='cuda:0')


Training:  21%|██        | 209/1000 [01:07<03:39,  3.60it/s, loss=6.03]

tensor([ 28, 198, 198,  28,  28, 198, 198, 198,  28,  42], device='cuda:0')
tensor([   43,   339,   744,  2139,    30,   198,   198,    60, 15604,  8772],
       device='cuda:0')


Training:  21%|██        | 210/1000 [01:08<03:43,  3.54it/s, loss=6.31]

tensor([ 28,  28, 198, 198, 198,  28,  28,  28,  28, 198], device='cuda:0')
tensor([2428,   28,  198, 5212, 5337,  392, 2275,  357, 2220,  288],
       device='cuda:0')


Training:  21%|██        | 211/1000 [01:08<03:40,  3.57it/s, loss=6.22]

tensor([ 28,  28, 198, 198,  28,  28, 198, 198,  28, 198], device='cuda:0')
tensor([  424,  3786,   339,  8215,   346,    43,   198, 15017,    28,   451],
       device='cuda:0')


Training:  21%|██        | 212/1000 [01:08<03:42,  3.54it/s, loss=6.22]

tensor([198,  28, 198, 198,  28, 198, 198,  28,  28,  28], device='cuda:0')
tensor([  948, 30437,  9984,   346,    42,   198,  9482,   486,   549,   288],
       device='cuda:0')


Training:  21%|██▏       | 213/1000 [01:09<03:48,  3.44it/s, loss=6.11]

tensor([ 28,  28,  28, 198, 198, 198, 198,  28,  28, 198], device='cuda:0')
tensor([ 3275,  8806,    28,   564,  1035,   198, 12018,  8806,    28,   423],
       device='cuda:0')


Training:  21%|██▏       | 214/1000 [01:09<03:54,  3.36it/s, loss=6.02]

tensor([ 28,  28, 198,  28, 198,  28, 198,  28, 198,  28], device='cuda:0')
tensor([ 2016,    28,  1209,    28,   732,   506,  1690,  1980, 20322,    42],
       device='cuda:0')


Training:  22%|██▏       | 215/1000 [01:09<03:56,  3.31it/s, loss=6.09]

tensor([ 28, 198,  42,  28, 198,  28, 260, 198,  28,  28], device='cuda:0')
tensor([  198, 11952, 11737,    43,   654,   325,   384,   523, 23599,    30],
       device='cuda:0')


Training:  22%|██▏       | 216/1000 [01:09<03:56,  3.32it/s, loss=5.72]

tensor([ 28,  28, 260,  28, 260,  28, 198, 198, 198, 260], device='cuda:0')
tensor([4109,  282,  732,  314, 2294,   43,  284,   28,  702,  253],
       device='cuda:0')


Training:  22%|██▏       | 217/1000 [01:10<03:56,  3.32it/s, loss=6.13]

tensor([ 28,  28, 198,  28, 260, 198,  28, 198, 198,  28], device='cuda:0')
tensor([12687,   198,  4590,   281,   260,  6411,    30,  3315,   549,   260],
       device='cuda:0')


Training:  22%|██▏       | 218/1000 [01:10<03:55,  3.32it/s, loss=5.86]

tensor([198,  28,  28, 198, 198, 198,  28, 198, 198,  28], device='cuda:0')
tensor([41063,   924,   198,  1653,  1041,   338,   384, 26365, 15270,  6737],
       device='cuda:0')


Training:  22%|██▏       | 219/1000 [01:10<03:51,  3.37it/s, loss=5.86]

tensor([ 28, 198, 198, 198,  42,  42, 198,  42, 198, 198], device='cuda:0')
tensor([   30,   198,   198,  2721,  5431,  4210,  5963,    42,   198, 34177],
       device='cuda:0')


Training:  22%|██▏       | 220/1000 [01:11<03:54,  3.33it/s, loss=6.12]

tensor([198, 198, 198,  28, 260,  28,  28,  28, 198, 198], device='cuda:0')
tensor([  198, 48111,   549,   288,  4875,   601,  3287,    28,   837,   339],
       device='cuda:0')


Training:  22%|██▏       | 221/1000 [01:11<03:54,  3.32it/s, loss=6.13]

tensor([198,  28, 198,  28, 198,  28, 198, 198, 198,  28], device='cuda:0')
tensor([11424,    28,   357,   868,  7219,    42,   198,    57,  3060,  5774],
       device='cuda:0')


Training:  22%|██▏       | 222/1000 [01:11<03:52,  3.34it/s, loss=6.26]

tensor([ 28, 198, 198,  28,  28,  28, 198, 198, 198, 198], device='cuda:0')
tensor([  28,  564,  716,  547,  441,   30,  198,  198, 2721, 5431],
       device='cuda:0')


Training:  22%|██▏       | 223/1000 [01:12<03:51,  3.36it/s, loss=6.26]

tensor([ 28,  28, 260,  28,  28, 198,  28, 198,  28, 198], device='cuda:0')
tensor([17072,   288,   451, 17324,    28,  2631,   284,  1341,    47,  1431],
       device='cuda:0')


Training:  22%|██▏       | 224/1000 [01:12<03:52,  3.34it/s, loss=6.1] 

tensor([ 28, 198,  28, 260,  28, 198,  28, 198, 198, 198], device='cuda:0')
tensor([  260, 35278,   281,   346,    28, 48275,    28,   355,  4662,   894],
       device='cuda:0')


Training:  22%|██▎       | 225/1000 [01:12<03:47,  3.41it/s, loss=6.11]

tensor([  28,   28,  260,  198,   28,  198,  198,  198,   42, 4501],
       device='cuda:0')
tensor([27044,   436,   260, 48945,    47,   198,   198, 23110,  3206,  4501],
       device='cuda:0')


Training:  23%|██▎       | 226/1000 [01:13<03:42,  3.48it/s, loss=5.95]

tensor([ 28, 198, 198, 198, 198,  69, 198,  42, 198, 198], device='cuda:0')
tensor([ 702,   30,  198,  198,   52,   69, 9620,  717, 2113,   51],
       device='cuda:0')


Training:  23%|██▎       | 227/1000 [01:13<03:43,  3.46it/s, loss=5.95]

tensor([ 28, 260,  28, 198, 198,  28, 198,  28, 198, 198], device='cuda:0')
tensor([  335,  1272,    47,   198, 10768,    28,   685,    28,  5697,  2747],
       device='cuda:0')


Training:  23%|██▎       | 228/1000 [01:13<03:45,  3.42it/s, loss=5.88]

tensor([ 28,  28,  28, 198,  28, 198, 198, 198,  69, 198], device='cuda:0')
tensor([  523,  3694,   260, 11611,    30,   198,   198,    52,    69,  9620],
       device='cuda:0')


Training:  23%|██▎       | 229/1000 [01:13<03:49,  3.36it/s, loss=6.14]

tensor([ 28,  28,  28, 198,  28, 198,  28,  28,  28, 198], device='cuda:0')
tensor([9713,  578,  957, 9983,  351,  278, 3285,  849,   42,  339],
       device='cuda:0')


Training:  23%|██▎       | 230/1000 [01:14<03:48,  3.37it/s, loss=5.93]

tensor([ 28, 198,  28,  28, 198,  28,  28, 198, 198,  28], device='cuda:0')
tensor([   30, 30079,   346,   351,   451,  4386,    42,   198, 17777,   623],
       device='cuda:0')


Training:  23%|██▎       | 231/1000 [01:14<03:44,  3.42it/s, loss=5.81]

tensor([ 28,  28,  28,  28, 198, 198, 198, 198, 198,  42], device='cuda:0')
tensor([35532,   280,  7288,   423,   198,   198,  2721,  5431,  4210,  5963],
       device='cuda:0')


Training:  23%|██▎       | 232/1000 [01:14<03:45,  3.41it/s, loss=5.81]

tensor([  28,  198,   28,  198,  198,  198,  198, 9620,  717, 2113],
       device='cuda:0')
tensor([ 957, 4033,   30,  198,  198,   52,   69, 9620,  717, 2113],
       device='cuda:0')


Training:  23%|██▎       | 233/1000 [01:14<03:49,  3.33it/s, loss=5.88]

tensor([ 28, 198, 260,  28,  28, 198, 260, 198,  42,  28], device='cuda:0')
tensor([  43,  325,  346, 8048,  198, 6228,  469, 1038, 1728,   30],
       device='cuda:0')


Training:  23%|██▎       | 234/1000 [01:15<03:51,  3.31it/s, loss=5.62]

tensor([198,  28,  28, 198, 198, 198,  28, 198,  42, 198], device='cuda:0')
tensor([41288,  1055,    47,   198,   198,    60, 15604,  8772,    42,   198],
       device='cuda:0')


Training:  24%|██▎       | 235/1000 [01:15<03:50,  3.32it/s, loss=5.73]

tensor([ 28,  28, 198, 198, 260,  28,  28, 198, 198,  28], device='cuda:0')
tensor([ 1134,   338,   297,    23,   259, 15480,   381,   253, 48945,    30],
       device='cuda:0')


Training:  24%|██▎       | 236/1000 [01:16<03:46,  3.38it/s, loss=5.71]

tensor([198, 198, 198, 198, 198, 260, 198, 198,  28, 198], device='cuda:0')
tensor([ 384, 1250,  260, 2775,  327,  527,  384, 4242,   42,  198],
       device='cuda:0')


Training:  24%|██▎       | 237/1000 [01:16<03:42,  3.43it/s, loss=5.96]

tensor([ 28, 198, 198,  28,  28, 198, 198, 198,  28, 198], device='cuda:0')
tensor([   42,  1303,  1272,  1690,    28,   198,   397, 25128,    30,   198],
       device='cuda:0')


Training:  24%|██▍       | 238/1000 [01:16<03:38,  3.49it/s, loss=6.46]

tensor([ 28,  28, 198, 198,  28,  28, 198,  28,  28, 260], device='cuda:0')
tensor([8263,   42,  198, 5195, 2294,   28, 2662, 1272,  288,  260],
       device='cuda:0')


Training:  24%|██▍       | 239/1000 [01:16<03:34,  3.55it/s, loss=5.82]

tensor([ 28,  28, 198, 198, 346, 260, 198,  28, 198,  28], device='cuda:0')
tensor([ 2223,    28,   198,  3528,   418,   338,  6968,  3786,  8177, 17600],
       device='cuda:0')


Training:  24%|██▍       | 240/1000 [01:17<03:34,  3.55it/s, loss=6.25]

tensor([ 28, 198, 198,  28,  28, 198, 260, 198, 198,  28], device='cuda:0')
tensor([ 198,   55, 1134,  549,  957, 1036,  284,  957, 4083,  808],
       device='cuda:0')


Training:  24%|██▍       | 241/1000 [01:17<03:33,  3.55it/s, loss=6.08]

tensor([ 28,  28, 198, 198, 198,  28,  28,  28, 198, 198], device='cuda:0')
tensor([  318,   578,    42,   653, 24641, 14517,   335,   198, 15135,  5882],
       device='cuda:0')


Training:  24%|██▍       | 242/1000 [01:17<03:36,  3.50it/s, loss=6.08]

tensor([ 28,  28,  28, 198, 260, 198, 198,  28,  28, 198], device='cuda:0')
tensor([  293, 20322,    28,   327,   339,   457,   357,  2073,    30,   198],
       device='cuda:0')


Training:  24%|██▍       | 243/1000 [01:17<03:38,  3.46it/s, loss=6.31]

tensor([ 28, 198,  42,  28,  28, 260, 198,  28, 198, 198], device='cuda:0')
tensor([  260,   412, 34315, 16592,  1980,   469, 10166,   198,  3528,   856],
       device='cuda:0')


Training:  24%|██▍       | 244/1000 [01:18<03:38,  3.47it/s, loss=6.31]

tensor([ 28, 198, 198, 198,  28,  42,  42,  42, 198, 198], device='cuda:0')
tensor([   30,   198,   198,    56, 13110, 22239,  8772,    42,   198,  2696],
       device='cuda:0')


Training:  24%|██▍       | 245/1000 [01:18<03:41,  3.41it/s, loss=6.01]

tensor([ 28,  28,  28,  28, 260, 198, 198, 198,  28, 198], device='cuda:0')
tensor([ 1441,   441,   874,   288,   423,   198,   198, 10895,  2810,  8772],
       device='cuda:0')


Training:  25%|██▍       | 246/1000 [01:18<03:43,  3.38it/s, loss=5.87]

tensor([ 198,  198, 4728,   42,   28,   42,  198,  198,   28,  198],
       device='cuda:0')
tensor([  198,    59, 12455, 33256,  1933,    42,   198,  1780,    28,   523],
       device='cuda:0')


Training:  25%|██▍       | 247/1000 [01:19<03:42,  3.38it/s, loss=5.87]

tensor([ 28,  28, 198, 198,  28,  28, 198, 198, 260,  28], device='cuda:0')
tensor([ 357,  338, 1041, 6753,  549,   28,  198,  788,  511,  957],
       device='cuda:0')


Training:  25%|██▍       | 248/1000 [01:19<03:46,  3.31it/s, loss=5.96]

tensor([  28,   28,  198,  198,  198, 4728, 8772,    0,   42,  198],
       device='cuda:0')
tensor([12575,    30,   198,   198,    59, 12455, 33256,  1933,    42,   198],
       device='cuda:0')


Training:  25%|██▍       | 249/1000 [01:19<03:48,  3.28it/s, loss=5.99]

tensor([ 28,  28, 198,  42,  28,  28, 198, 198,  28, 198], device='cuda:0')
tensor([   91,    28,  9133,   490, 44004,    43,  1041,  1921,  1041,  3060],
       device='cuda:0')


Training:  25%|██▌       | 250/1000 [01:19<03:46,  3.31it/s, loss=5.99]

tensor([ 28,  28, 198,  42,  28, 198,  28,  28, 198, 198], device='cuda:0')
tensor([ 564,  253, 8150,  301,   42,  384,  314, 1573,   28,  339],
       device='cuda:0')


Training:  25%|██▌       | 251/1000 [01:20<03:50,  3.24it/s, loss=6.1] 

tensor([  28,   28,   28,   28,  198,  198,  198,   42, 8772,   42],
       device='cuda:0')
tensor([27415,   581,  1119,    30,   198,   198, 11718,  2810,  8970,    42],
       device='cuda:0')


Training:  25%|██▌       | 252/1000 [01:20<03:54,  3.18it/s, loss=6.06]

tensor([   28,    28,    28,   198,   198,   198, 46634,    28,  8772,    42],
       device='cuda:0')
tensor([  346,   592,    30,   198,   198, 43029, 46634,  5229,  8772,    42],
       device='cuda:0')


Training:  25%|██▌       | 253/1000 [01:20<03:52,  3.21it/s, loss=5.9] 

tensor([ 28,  28, 198,  28,  28,  28,  28, 198, 198, 198], device='cuda:0')
tensor([  280,   260,  1079,   302, 12471,  1238,    30,   198,   198, 43029],
       device='cuda:0')


Training:  25%|██▌       | 254/1000 [01:21<03:49,  3.25it/s, loss=5.67]

tensor([ 28,  28, 339,  28, 260,  28, 198, 198, 198,  28], device='cuda:0')
tensor([ 2945,   347,  1041,   506,  3213,    47,   198,   198, 13087,  8292],
       device='cuda:0')


Training:  26%|██▌       | 255/1000 [01:21<03:49,  3.24it/s, loss=5.88]

tensor([ 28, 198,  28,  28, 198,  28,  28, 198,  28,  28], device='cuda:0')
tensor([   28,   787, 16981,    47,   787,  2234,    47,   787,  9060,    47],
       device='cuda:0')


Training:  26%|██▌       | 256/1000 [01:21<03:46,  3.28it/s, loss=5.88]

tensor([ 28,  28, 198,  28, 260, 198,  28, 198, 198, 198], device='cuda:0')
tensor([1489,  260, 5710,  282,  650, 3229,   30,  198,  198,   60],
       device='cuda:0')


Training:  26%|██▌       | 257/1000 [01:22<03:46,  3.28it/s, loss=5.59]

tensor([198, 198,  42,  28,  28, 260,  28,  28, 198, 198], device='cuda:0')
tensor([  198,    64, 14765, 14104,   327, 14553,  4240,    30,   198,   198],
       device='cuda:0')


Training:  26%|██▌       | 258/1000 [01:22<03:47,  3.26it/s, loss=5.56]

tensor([ 28,  28, 198, 198,  28,  28, 260,  28,  28, 198], device='cuda:0')
tensor([2373,   28,  198, 3528, 2161,  670,  357, 3786,   28,  339],
       device='cuda:0')


Training:  26%|██▌       | 259/1000 [01:22<03:48,  3.24it/s, loss=5.67]

tensor([ 28,  28,  28,  28, 198,  28, 198, 198,  28, 198], device='cuda:0')
tensor([ 277, 2574, 9289,  260, 3323,   47,  198,   63,   28,  787],
       device='cuda:0')


Training:  26%|██▌       | 260/1000 [01:23<03:46,  3.27it/s, loss=5.65]

tensor([ 28, 198,  28,  28, 198,  28, 260,  28, 260, 198], device='cuda:0')
tensor([ 198, 5812,  620,  260, 2455,  355, 5842,  282,  650, 4120],
       device='cuda:0')


Training:  26%|██▌       | 261/1000 [01:23<03:42,  3.32it/s, loss=5.65]

tensor([198, 198,  28, 260, 198,  28, 198,  28,  28,  42], device='cuda:0')
tensor([  504, 10243,   288,   957,  6221,    28,   451,  9154, 24626,    28],
       device='cuda:0')


Training:  26%|██▌       | 262/1000 [01:23<03:48,  3.23it/s, loss=5.61]

tensor([  28,   28,  198,  198,  198,   28,  198,  198, 8772,   42],
       device='cuda:0')
tensor([1119,   30,  198,  198,   70, 2113,   51, 6213, 8772,   42],
       device='cuda:0')


Training:  26%|██▋       | 263/1000 [01:23<03:47,  3.24it/s, loss=5.56]

tensor([ 28,  28, 260, 339, 198, 198, 198,  28,  28, 198], device='cuda:0')
tensor([4972,  411,  338,   30,  198,  198, 9852,  323,   42,  198],
       device='cuda:0')


Training:  26%|██▋       | 264/1000 [01:24<03:47,  3.24it/s, loss=5.32]

tensor([ 28,  28,  28,  28, 198, 198,  28,  28,  28, 260], device='cuda:0')
tensor([  105,  7557, 28823,    43,   198,  4809, 12724,   541,   288,  1372],
       device='cuda:0')


Training:  26%|██▋       | 265/1000 [01:24<03:42,  3.30it/s, loss=6.23]

tensor([198, 198,  42,  28,  28,  28, 198, 198, 198, 198], device='cuda:0')
tensor([  198, 23641,  1457,   103,   402,    42,   198,  2427,   260,  3426],
       device='cuda:0')


Training:  27%|██▋       | 266/1000 [01:24<03:41,  3.32it/s, loss=6.2] 

tensor([ 28,  28,  28, 198,  28,  28,  28,  28, 198, 198], device='cuda:0')
tensor([ 500,  441,  198, 9142, 1296,  929, 1573,   30,  198,  198],
       device='cuda:0')


Training:  27%|██▋       | 267/1000 [01:25<03:33,  3.43it/s, loss=6.27]

tensor([ 28, 346,  28,  28, 198, 198,  28, 260,  28, 198], device='cuda:0')
tensor([ 1250, 19002,  1440,   198,   504, 17816,   282, 28676,    28,   284],
       device='cuda:0')


Training:  27%|██▋       | 268/1000 [01:25<03:29,  3.49it/s, loss=6.18]

tensor([ 28,  28,  28, 198, 198, 260, 198,  28,  28,  28], device='cuda:0')
tensor([35853,  6737,    43,   198,  3280,   480, 24276, 22566,   441,   253],
       device='cuda:0')


Training:  27%|██▋       | 269/1000 [01:25<03:30,  3.47it/s, loss=6.18]

tensor([ 28,  28, 198,  28, 198,  28, 260,  28, 198, 198], device='cuda:0')
tensor([  982,    42,   685,    28,  8913,   351, 33327,    17,   198,    49],
       device='cuda:0')


Training:  27%|██▋       | 270/1000 [01:26<03:32,  3.43it/s, loss=5.88]

tensor([ 28,  28,  28, 198,  28,  28, 198,  42,  28, 198], device='cuda:0')
tensor([ 318, 8233,  650, 5334,  478,  198,   56,  792,   17, 1209],
       device='cuda:0')


Training:  27%|██▋       | 271/1000 [01:26<03:33,  3.42it/s, loss=5.88]

tensor([ 42,  28, 260,  28, 198,  28, 260,  42,  28,  28], device='cuda:0')
tensor([5674,  411, 4778,   42,  451,  314, 7468,  258, 1132,  198],
       device='cuda:0')


Training:  27%|██▋       | 272/1000 [01:26<03:39,  3.32it/s, loss=6.03]

tensor([ 42, 198,  28, 198,  28, 198,  28, 198,  28, 198], device='cuda:0')
tensor([ 9725,    95,    17, 38467,    28, 23632,  9725,    95,    30,   198],
       device='cuda:0')


Training:  27%|██▋       | 273/1000 [01:27<03:38,  3.33it/s, loss=5.86]

tensor([ 28,  42, 198, 198, 198, 198,  28, 198,  28, 198], device='cuda:0')
tensor([32062,    42,   198,  6121,   392,  7219,   750,  2030,    28,  4875],
       device='cuda:0')


Training:  27%|██▋       | 274/1000 [01:27<03:31,  3.43it/s, loss=5.82]

tensor([ 42, 198, 198,  28, 198,  28,  42,  28, 198, 198], device='cuda:0')
tensor([1248,  506, 2249,  198,   66, 2713,  478, 6737, 1523,  260],
       device='cuda:0')


Training:  28%|██▊       | 275/1000 [01:27<03:31,  3.43it/s, loss=5.82]

tensor([ 28,  28,  28,  28, 198, 198,  28, 198, 198, 198], device='cuda:0')
tensor([8427,  346,  536, 1363,  253, 1945,   28,  198, 3528,  946],
       device='cuda:0')


Training:  28%|██▊       | 276/1000 [01:27<03:31,  3.42it/s, loss=5.99]

tensor([ 28,  28,  28, 198, 260, 198,  28, 260, 198,  28], device='cuda:0')
tensor([  384,  1477,   198,  2068,   260, 28382,   282,   253,   555,    28],
       device='cuda:0')


Training:  28%|██▊       | 277/1000 [01:28<03:30,  3.43it/s, loss=5.99]

tensor([ 198, 6426, 6426,   42,  198,  198,  198,   28,   28,  198],
       device='cuda:0')
tensor([   55,  4192,  7854,    42,   198, 11952, 23271,  8710,    17,   492],
       device='cuda:0')


Training:  28%|██▊       | 278/1000 [01:28<03:32,  3.39it/s, loss=5.84]

tensor([ 28, 198, 198, 198,  28,  42, 198, 198,  28, 198], device='cuda:0')
tensor([   47,   198,   198,  5345, 22152,    42,   198,  5230,    28,  5277],
       device='cuda:0')


Training:  28%|██▊       | 279/1000 [01:28<03:29,  3.44it/s, loss=5.66]

tensor([198, 260,  28, 198,  42,  28,  28, 198, 198,  28], device='cuda:0')
tensor([  351, 30291,   198,    86, 27045, 26927,    28,   198,  9389,  1928],
       device='cuda:0')


Training:  28%|██▊       | 280/1000 [01:29<03:27,  3.47it/s, loss=5.79]

tensor([ 28, 198, 260, 198,  28,  28,  28, 198, 198,  28], device='cuda:0')
tensor([   43,   327,   339,   536, 13735, 20322,   198,    71, 13231,   670],
       device='cuda:0')


Training:  28%|██▊       | 281/1000 [01:29<03:28,  3.45it/s, loss=5.79]

tensor([ 28,  28, 198, 260, 198, 198, 198,  28, 198, 198], device='cuda:0')
tensor([ 1450,    28,   351,  5337,   392,   654, 24575,    28,   198,  2193],
       device='cuda:0')


Training:  28%|██▊       | 282/1000 [01:29<03:33,  3.37it/s, loss=5.44]

tensor([198,  28, 198, 198,  28,  28, 198,  28, 198,  28], device='cuda:0')
tensor([ 274,   28,  198,   96, 8854,   28, 9869,   28, 1028,  105],
       device='cuda:0')


Training:  28%|██▊       | 283/1000 [01:29<03:36,  3.32it/s, loss=6.02]

tensor([ 28, 198, 198,  28, 198,  28,  28,  28, 260, 260], device='cuda:0')
tensor([  457,   719,   588,   198,    86, 21482,  3447,   327,   511,   260],
       device='cuda:0')


Training:  28%|██▊       | 284/1000 [01:30<03:34,  3.34it/s, loss=5.82]

tensor([198, 198, 198, 198, 198,  28, 260,  28, 198, 198], device='cuda:0')
tensor([  198,    60,   602,   967,   384, 26365,  3763,    30,   198,   198],
       device='cuda:0')


Training:  28%|██▊       | 285/1000 [01:30<03:29,  3.41it/s, loss=5.83]

tensor([ 28, 198, 198, 260,  28, 198, 198, 198,  42,  42], device='cuda:0')
tensor([   28,   198, 38984,   346,    28,   198,  6101,   298,  2679,   268],
       device='cuda:0')


Training:  29%|██▊       | 286/1000 [01:30<03:26,  3.45it/s, loss=5.51]

tensor([ 28,  28, 198, 198, 198,  42, 198,  28, 198, 198], device='cuda:0')
tensor([ 1508,    42,   198, 12800,  4618,  2286, 23944,    30,   198,   198],
       device='cuda:0')


Training:  29%|██▊       | 287/1000 [01:31<03:25,  3.47it/s, loss=5.44]

tensor([4501,  198,   42,   42,  198,  198,  198,  198,   28,  198],
       device='cuda:0')
tensor([ 7113,  2810,  4501,    42,   198, 16817,  1980,   549,    17, 12193],
       device='cuda:0')


Training:  29%|██▉       | 288/1000 [01:31<03:24,  3.49it/s, loss=5.59]

tensor([ 28,  28,  28, 198, 198,  42,  28, 260, 198,  28], device='cuda:0')
tensor([  482,  1163,    28,   198,  9302,  1185,   288,   260, 44890,    29],
       device='cuda:0')


Training:  29%|██▉       | 289/1000 [01:31<03:25,  3.47it/s, loss=5.59]

tensor([198, 198,  42, 198, 198,  28, 198, 198,  28, 198], device='cuda:0')
tensor([ 4628,  4501,    42,   198, 45496,    28, 20172,   468,   441,    30],
       device='cuda:0')


Training:  29%|██▉       | 290/1000 [01:31<03:27,  3.41it/s, loss=5.36]

tensor([198, 198, 198,  42, 198, 198, 198,  28, 198,  28], device='cuda:0')
tensor([ 2113,    57,  4501,    42,   198,   504,   701,   359, 23909,    43],
       device='cuda:0')


Training:  29%|██▉       | 291/1000 [01:32<03:31,  3.35it/s, loss=5.61]

tensor([ 28,  28,  28,  28, 260,  28, 198, 198,  28, 198], device='cuda:0')
tensor([ 1072,   510, 46823,   351,  1728,    28,   260,   550,   198, 20181],
       device='cuda:0')


Training:  29%|██▉       | 292/1000 [01:32<03:36,  3.28it/s, loss=5.3] 

tensor([   28,    28,    28,   198,   198,   198, 23317,   198,  4501,    42],
       device='cuda:0')
tensor([  346,  7270,    30,   198,   198,  4105, 23317,    57,  4501,    42],
       device='cuda:0')


Training:  29%|██▉       | 293/1000 [01:32<03:36,  3.26it/s, loss=5.29]

tensor([ 198,  198,  198,  198,  198, 2113,   57, 4501,   42,  198],
       device='cuda:0')
tensor([ 423,  198,  198,   67, 3020, 2113,   57, 4501,   42,  198],
       device='cuda:0')


Training:  29%|██▉       | 294/1000 [01:33<03:31,  3.34it/s, loss=5.53]

tensor([  260,   260,   198,    28,    28,   260, 34818,    28,   198,   198],
       device='cuda:0')
tensor([5344,  253, 3561,  839, 1980,  637,  391,   28,  198, 2193],
       device='cuda:0')


Training:  30%|██▉       | 295/1000 [01:33<03:29,  3.36it/s, loss=5.53]

tensor([ 28,  28,  28, 198,  28,  28, 198, 260,  28, 260], device='cuda:0')
tensor([  338,   392,   457, 13784,  1915,   198,  7143,  1187,   411,   260],
       device='cuda:0')


Training:  30%|██▉       | 296/1000 [01:33<03:28,  3.38it/s, loss=5.61]

tensor([ 28, 260,  28,  28, 198, 260, 198, 260, 198, 198], device='cuda:0')
tensor([  281,  1272,  5721,    28,   429,   655,   288,   655,   198, 38110],
       device='cuda:0')


Training:  30%|██▉       | 297/1000 [01:33<03:27,  3.38it/s, loss=5.61]

tensor([ 28,  28, 198, 198,  42, 260,  28, 198,  28,  28], device='cuda:0')
tensor([ 441, 5091,  198,   63,   23,  259,  260, 4749,  905,  288],
       device='cuda:0')


Training:  30%|██▉       | 298/1000 [01:34<03:31,  3.32it/s, loss=5.17]

tensor([ 28, 198, 198, 198,  28,  28, 198, 260,  28, 260], device='cuda:0')
tensor([  42,  198,  504, 1085,  754, 9699,  282,  357,  314, 1896],
       device='cuda:0')


Training:  30%|██▉       | 299/1000 [01:34<03:31,  3.32it/s, loss=5.5] 

tensor([   28,   198,    28,    28,    28,   198,    28, 29086,    28,   198],
       device='cuda:0')
tensor([   28,   346,   523,   441,    47,   377,   608, 29086,    28,  2505],
       device='cuda:0')


Training:  30%|███       | 300/1000 [01:35<03:34,  3.27it/s, loss=5.56]

tensor([ 28,  28,  28, 260, 198, 198,  42, 260, 198,  28], device='cuda:0')
tensor([ 2745,  3895,   351,  3878,    30,   408, 19550,  2828, 37184,    28],
       device='cuda:0')


Training:  30%|███       | 301/1000 [01:35<03:27,  3.37it/s, loss=5.56]

tensor([198, 198,  28,  28, 198, 253,  28, 198,  28,  28], device='cuda:0')
tensor([ 198, 7993, 5612,   28,  702,  345,  370,  990, 5249,   28],
       device='cuda:0')


Training:  30%|███       | 302/1000 [01:35<03:27,  3.37it/s, loss=5.34]

tensor([ 28, 198, 198, 198, 198, 198,  42, 198, 198,  28], device='cuda:0')
tensor([ 1378,    30,   198,   198, 38371,  9333,    42,   198, 10539,    28],
       device='cuda:0')


Training:  30%|███       | 303/1000 [01:35<03:25,  3.39it/s, loss=5.34]

tensor([ 198,  198,   28,   28,  198,  198,  198, 3020, 2113,   57],
       device='cuda:0')
tensor([  28,  732, 1745,   47,  198,  198,   67, 3020, 2113,   57],
       device='cuda:0')


Training:  30%|███       | 304/1000 [01:36<03:25,  3.39it/s, loss=5.4] 

tensor([198,  28, 198, 198,  28, 260,  28, 260, 198, 198], device='cuda:0')
tensor([2428,   42,  650, 2988,  288,  549,  436,   28,  198, 3681],
       device='cuda:0')


Training:  30%|███       | 305/1000 [01:36<03:24,  3.40it/s, loss=5.58]

tensor([198, 198, 457, 260, 198,  42, 198, 260,  28, 198], device='cuda:0')
tensor([ 198, 2683,  359,  253, 4469,   28,  359,  346,   47,  198],
       device='cuda:0')


Training:  31%|███       | 306/1000 [01:36<03:20,  3.47it/s, loss=5.73]

tensor([ 28, 198,  28, 198, 198, 260,  28,  28,  28, 198], device='cuda:0')
tensor([ 2526,  1209,  2630,    43,   288, 10297,  1272,   805,   198,  5195],
       device='cuda:0')


Training:  31%|███       | 307/1000 [01:37<03:18,  3.50it/s, loss=5.74]

tensor([ 28,  28, 198,  28, 260, 198, 457,  28, 198, 260], device='cuda:0')
tensor([3032,  198, 5195,  511,  564,  392, 2815,   43,  327,  638],
       device='cuda:0')


Training:  31%|███       | 308/1000 [01:37<03:15,  3.54it/s, loss=5.44]

tensor([ 198,  198, 2680,   28,   28,  260,   28,   28,  198,   28],
       device='cuda:0')
tensor([  198,    61,   462,  3497,   288, 16945,  8047,    30,  1249,    28],
       device='cuda:0')


Training:  31%|███       | 309/1000 [01:37<03:15,  3.53it/s, loss=5.44]

tensor([ 28, 198, 198, 198,  28,  28, 260,  28, 198, 198], device='cuda:0')
tensor([6737,  284,  198, 4038, 1390,  288, 1972, 1092,  260,  701],
       device='cuda:0')


Training:  31%|███       | 310/1000 [01:37<03:19,  3.47it/s, loss=5.53]

tensor([  198,    28,    28,   198,   198,  4501,   198, 34818,   198,    28],
       device='cuda:0')
tensor([ 2240,  7268,    28,   198,    57,  1643,   637, 15006,  2240,  7904],
       device='cuda:0')


Training:  31%|███       | 311/1000 [01:38<03:21,  3.43it/s, loss=5.68]

tensor([198, 198, 198,  28, 198, 198, 260, 198, 198,  28], device='cuda:0')
tensor([ 6824,   253, 26156,    28,   198,  2068,  7471,   623,  3506,   876],
       device='cuda:0')


Training:  31%|███       | 312/1000 [01:38<03:18,  3.46it/s, loss=5.74]

tensor([ 28, 260,  28, 198, 198, 198,  28, 198, 198, 198], device='cuda:0')
tensor([  288, 10419,    28,   198,  1937, 10419,   523,  1188,   260,  1386],
       device='cuda:0')


Training:  31%|███▏      | 313/1000 [01:38<03:18,  3.45it/s, loss=5.74]

tensor([ 28, 198, 260,  28, 198,  28, 198,  28, 198, 198], device='cuda:0')
tensor([   28,   327,  1839,   506, 15215,    28,  8913,    28,   284,  7241],
       device='cuda:0')


Training:  31%|███▏      | 314/1000 [01:38<03:22,  3.40it/s, loss=5.7] 

tensor([ 28, 198, 198,  42,  42, 198, 462,  28,  28, 198], device='cuda:0')
tensor([   42,   198,    51, 10942,   441,   295,   672,   482,    28,  3506],
       device='cuda:0')


Training:  32%|███▏      | 315/1000 [01:39<03:23,  3.37it/s, loss=5.61]

tensor([ 28, 198,  28,  28, 260,  28, 198, 198, 198, 198], device='cuda:0')
tensor([  28,  685, 1683,  351,  549,   30,  198,  198,   55, 6426],
       device='cuda:0')


Training:  32%|███▏      | 316/1000 [01:39<03:20,  3.42it/s, loss=5.66]

tensor([ 28,  28, 198, 198, 198, 457, 325,  28, 260, 260], device='cuda:0')
tensor([1970,   30,  198, 8653,  339, 2526,  832, 2494,  284, 3287],
       device='cuda:0')


Training:  32%|███▏      | 317/1000 [01:39<03:16,  3.48it/s, loss=5.63]

tensor([  42,   28,  198,  198,  198,  198, 2680,   57,  198,  198],
       device='cuda:0')
tensor([44228,    17,   423,   198,   198, 41074,  2680, 35135,    55,  2285],
       device='cuda:0')


Training:  32%|███▏      | 318/1000 [01:40<03:13,  3.52it/s, loss=5.54]

tensor([ 28,  28, 198, 198,  28, 198,  28,  28,  28, 260], device='cuda:0')
tensor([  381,    28,   198,  3528,  1188,  2276,   635,  5585,   327, 13987],
       device='cuda:0')


Training:  32%|███▏      | 319/1000 [01:40<03:12,  3.54it/s, loss=5.63]

tensor([  28,  260,   28,  198,  198,  198, 2680,   57,   28,   42],
       device='cuda:0')
tensor([  288,   874,    30,   198,   198, 41074,  2680, 19642, 25630,  5431],
       device='cuda:0')


Training:  32%|███▏      | 320/1000 [01:40<03:11,  3.56it/s, loss=5.75]

tensor([ 28,  28,  28, 260,  28, 198, 198,  28, 198,  28], device='cuda:0')
tensor([4888, 6737,  990, 1029,   43,  198,   63,   28,  965, 2585],
       device='cuda:0')


Training:  32%|███▏      | 321/1000 [01:40<03:11,  3.54it/s, loss=5.75]

tensor([198,  28,  28,  28, 198, 198, 260, 198,  28,  28], device='cuda:0')
tensor([ 6836, 29820,  1272,    42,   637, 34818,   253,   754, 10934, 15786],
       device='cuda:0')


Training:  32%|███▏      | 322/1000 [01:41<03:12,  3.52it/s, loss=5.31]

tensor([  28,  198,   28,  198,   28,   28,   28,  198,  198, 4501],
       device='cuda:0')
tensor([  957,  5717,    28, 13735,   441,   549,    43,   198,    57,   744],
       device='cuda:0')


Training:  32%|███▏      | 323/1000 [01:41<03:10,  3.56it/s, loss=5.54]

tensor([198,  42,  42,  42, 198, 198,  28, 198,  28,  28], device='cuda:0')
tensor([   71, 20055,  9536,    42,   198,  2696,    28,  4074,   572, 45927],
       device='cuda:0')


Training:  32%|███▏      | 324/1000 [01:41<03:09,  3.56it/s, loss=5.59]

tensor([339, 198,  28,  28,  28, 198,  28, 198,  42,  28], device='cuda:0')
tensor([ 469, 1076, 1191,  355,  469, 7003,   29,  102,  520,  781],
       device='cuda:0')


Training:  32%|███▎      | 325/1000 [01:42<03:08,  3.58it/s, loss=5.47]

tensor([ 28, 198,  28,  28,  28, 198, 198, 198,  69, 198], device='cuda:0')
tensor([  653, 14119, 21723, 48384,    17,   198,   198,    52,    69,  5229],
       device='cuda:0')


Training:  33%|███▎      | 326/1000 [01:42<03:11,  3.53it/s, loss=5.47]

tensor([ 28, 198, 260,  28, 282,  28, 198,  28, 198, 198], device='cuda:0')
tensor([2205,  282, 1123, 3261, 9446,  506, 2112,   47,  198,  198],
       device='cuda:0')


Training:  33%|███▎      | 327/1000 [01:42<03:14,  3.46it/s, loss=5.54]

tensor([198, 198,  69, 198,  42, 198,  42,  42,  42, 198], device='cuda:0')
tensor([  198,    52,    69,  5229,  9248,  6016,   718, 25089,    42,   198],
       device='cuda:0')


Training:  33%|███▎      | 328/1000 [01:42<03:13,  3.47it/s, loss=5.54]

tensor([ 28, 198, 198,  28, 198, 346,  28, 198, 260,  28], device='cuda:0')
tensor([   30,   198,    63,   926,   457,   339,  3984,   282, 21683,  1800],
       device='cuda:0')


Training:  33%|███▎      | 329/1000 [01:43<03:14,  3.45it/s, loss=5.37]

tensor([ 28,  28,  28, 260,  28,  28, 198, 198, 198,  42], device='cuda:0')
tensor([22534, 48384,    23, 12575, 15534,    42,   198,  5965,  5295,   332],
       device='cuda:0')


Training:  33%|███▎      | 330/1000 [01:43<03:12,  3.48it/s, loss=5.32]

tensor([198,  28,  42,  42, 198, 198,  28,  28, 198,  28], device='cuda:0')
tensor([39813,  2097, 25466,    42,   198,  5315,  1123,  2552,  5612,   288],
       device='cuda:0')


Training:  33%|███▎      | 331/1000 [01:43<03:12,  3.48it/s, loss=5.32]

tensor([ 42,  28, 198,  28,  28, 198, 198, 198,  28,  42], device='cuda:0')
tensor([  258,   469, 31923,   974,    30,   198,   198,    66,  2062,  7430],
       device='cuda:0')


Training:  33%|███▎      | 332/1000 [01:44<03:14,  3.44it/s, loss=5.31]

tensor([ 28,  28,  42,  28, 198,  28, 260,  28,  28, 198], device='cuda:0')
tensor([  536, 14053,   609,   957,  2112,   351, 24478,   621, 13056,   198],
       device='cuda:0')


Training:  33%|███▎      | 333/1000 [01:44<03:15,  3.42it/s, loss=5.61]

tensor([ 28,  28, 198, 198, 198,  28, 260,  28, 260, 198], device='cuda:0')
tensor([1671,   28,  198,  504, 4033,  282, 3996,  284,  653, 6349],
       device='cuda:0')


Training:  33%|███▎      | 334/1000 [01:44<03:10,  3.50it/s, loss=5.54]

tensor([198,  42, 198,  28,  28, 198, 198, 260, 198,  28], device='cuda:0')
tensor([23487,   469,  1761, 12630,    28,   198, 16721,   260,  1048,  2359],
       device='cuda:0')


Training:  34%|███▎      | 335/1000 [01:45<03:09,  3.50it/s, loss=5.71]

tensor([  28,  198,  198,   28,   28,   28, 1992,  322,  346,   28],
       device='cuda:0')
tensor([   42,   198,    57,  2161,   536, 13168,  1992,   322,   346, 44228],
       device='cuda:0')


Training:  34%|███▎      | 336/1000 [01:45<03:05,  3.58it/s, loss=5.71]

tensor([   28,   198,   198,   198, 15604,   198,  4728,   198,    28,    42],
       device='cuda:0')
tensor([   30,   198,   198,    50, 15604,    59,  4728,    56,  4153,    42],
       device='cuda:0')


Training:  34%|███▎      | 337/1000 [01:45<03:08,  3.51it/s, loss=5.25]

tensor([ 28,  28,  28, 260, 198,  28, 198, 198, 279,  28], device='cuda:0')
tensor([ 2767, 20322,   418,   260, 29289,    43,   198,  1082,   105,  3379],
       device='cuda:0')


Training:  34%|███▍      | 338/1000 [01:45<03:11,  3.45it/s, loss=5.49]

tensor([198,  28,  28,  28, 198, 198,  28, 198,  28,  28], device='cuda:0')
tensor([ 1771,   368,  3497,    42,   198, 12192,    29, 23724, 45927,  8759],
       device='cuda:0')


Training:  34%|███▍      | 339/1000 [01:46<03:12,  3.43it/s, loss=5.76]

tensor([  28,   28,   28,   28,  260,   28,  198,  198,  198, 4728],
       device='cuda:0')
tensor([1991,  346,  523,  355,  787,   30,  198,  198,   59, 4728],
       device='cuda:0')


Training:  34%|███▍      | 340/1000 [01:46<03:09,  3.49it/s, loss=5.72]

tensor([   28,   198,   198, 20055,    28,   198,    28,   198,    28,   198],
       device='cuda:0')
tensor([   28,   198,    71,  3592,   506,  6621,    28,   905,   506, 15786],
       device='cuda:0')


Training:  34%|███▍      | 341/1000 [01:46<03:08,  3.50it/s, loss=5.72]

tensor([260,  42,  28, 198, 198,  28, 260,  28, 339, 260], device='cuda:0')
tensor([   88,   552,    28,   198,  3528,  3727,   549,   638,   288, 27397],
       device='cuda:0')


Training:  34%|███▍      | 342/1000 [01:47<03:10,  3.45it/s, loss=5.31]

tensor([ 28,  28,  28,  28, 260, 198, 198,  28, 198, 260], device='cuda:0')
tensor([  787,   540, 11393,   282,   260,  9202,  1670,   198,  2193, 20322],
       device='cuda:0')


Training:  34%|███▍      | 343/1000 [01:47<03:09,  3.47it/s, loss=5.53]

tensor([ 28,  28,  28,  28,  28,  28, 260,  28, 198, 198], device='cuda:0')
tensor([16087,   358,  3075,   874,   441,   288,  2606,    28,   198, 29752],
       device='cuda:0')


Training:  34%|███▍      | 344/1000 [01:47<03:09,  3.46it/s, loss=5.53]

tensor([  28,   28,  198,  260,  198,   28,  198,  198,  198, 4728],
       device='cuda:0')
tensor([10278,    28,   281,   480, 14553,    30,   198,   198,    59,  4728],
       device='cuda:0')


Training:  34%|███▍      | 345/1000 [01:47<03:10,  3.44it/s, loss=5.33]

tensor([ 28, 198, 198, 198,  28,  42,  42, 198, 198,  28], device='cuda:0')
tensor([   47,   198,   198, 39813,  2097, 25466,    42,   198,  5345,    28],
       device='cuda:0')


Training:  35%|███▍      | 346/1000 [01:48<03:08,  3.47it/s, loss=5.76]

tensor([ 28, 198,  28, 260, 198,  28, 457,  28,  28, 260], device='cuda:0')
tensor([   28, 14067,   335,    28,  1675,   392,   359,   614,   281,  6548],
       device='cuda:0')


Training:  35%|███▍      | 347/1000 [01:48<03:08,  3.46it/s, loss=5.76]

tensor([  260,   260,   260,   198, 21723,   198,    28,   198,   198,   198],
       device='cuda:0')
tensor([  284,  1188,    28,   957, 47910, 21723,    30,   198,   198,    59],
       device='cuda:0')


Training:  35%|███▍      | 348/1000 [01:48<03:09,  3.45it/s, loss=5.63]

tensor([ 28,  28, 260, 198,  28,  28, 198, 198,  28,  28], device='cuda:0')
tensor([18266,   284,   260, 27508,   655,   198,    51,  7380,   767,   260],
       device='cuda:0')


Training:  35%|███▍      | 349/1000 [01:49<03:07,  3.48it/s, loss=5.51]

tensor([ 28, 198, 198, 339, 457,  28, 260, 198,  28, 198], device='cuda:0')
tensor([   28,   967,  5337,   392,  3568,  1523,   198, 39855,  2161,   457],
       device='cuda:0')


Training:  35%|███▌      | 350/1000 [01:49<03:05,  3.51it/s, loss=5.79]

tensor([ 28,   0,  28, 198, 339, 457, 198,  28,  28,  28], device='cuda:0')
tensor([  267, 30250,   198,  5519,   339,   475,  2593,  6737,   767,    28],
       device='cuda:0')


Training:  35%|███▌      | 351/1000 [01:49<03:01,  3.57it/s, loss=5.88]

tensor([  260,   260,    28,   260,    28,   260, 21723,    28,    28,   198],
       device='cuda:0')
tensor([ 4145, 46160,   284, 42521,   288,   957,  1904,  4778,    43,   198],
       device='cuda:0')


Training:  35%|███▌      | 352/1000 [01:49<02:58,  3.63it/s, loss=5.82]

tensor([ 28,  28, 198,  28,  28, 198, 198, 260, 198,  28], device='cuda:0')
tensor([ 7576,   253, 10001, 29562,   198,  8155,   281,   260,  1450,  1670],
       device='cuda:0')


Training:  35%|███▌      | 353/1000 [01:50<02:57,  3.64it/s, loss=5.75]

tensor([  0, 198, 198,  28,  28,  28, 339, 260,  28,  28], device='cuda:0')
tensor([   28,   198, 25885, 26429, 12101,   638,   288, 47850, 20322,    42],
       device='cuda:0')


Training:  35%|███▌      | 354/1000 [01:50<02:56,  3.66it/s, loss=5.92]

tensor([  28, 2680,  198,   42,   28,  260,   28,   28,  253,  198],
       device='cuda:0')
tensor([ 372,  323,   82, 1484,  284, 7576,  359,  702,  827, 1800],
       device='cuda:0')


Training:  36%|███▌      | 355/1000 [01:50<02:56,  3.65it/s, loss=5.76]

tensor([ 28,  42,  28, 198, 260,  28, 260,  28, 198, 198], device='cuda:0')
tensor([3726, 4157,   28, 1980, 1556,  282, 1029,   28,  198,   68],
       device='cuda:0')


Training:  36%|███▌      | 356/1000 [01:51<02:58,  3.61it/s, loss=5.63]

tensor([  198,    28,   325,   260, 21723,    28,   198,   198,   198,  4728],
       device='cuda:0')
tensor([10054,  2526,  4571,   957,  6943,    30,   198,   198,    59,  4728],
       device='cuda:0')


Training:  36%|███▌      | 357/1000 [01:51<02:58,  3.60it/s, loss=5.86]

tensor([ 28,  28, 339,  28,  28,  28, 260,  28, 198, 198], device='cuda:0')
tensor([20641,   645,   346,   599,   277,   351,  1272,    47,   198,   198],
       device='cuda:0')


Training:  36%|███▌      | 358/1000 [01:51<02:59,  3.58it/s, loss=5.86]

tensor([ 28,  28,  28, 789, 339, 260, 198,  42, 198, 198], device='cuda:0')
tensor([ 7501,  1114,   287,   789, 43228,   260, 31049,    42,   198, 21553],
       device='cuda:0')


Training:  36%|███▌      | 359/1000 [01:51<03:02,  3.52it/s, loss=5.64]

tensor([ 28,   0,  28,  28,  28, 260,  28,  28,  28,  42], device='cuda:0')
tensor([  252,  1207,  6737,   578,   284, 43537,   318,  1076, 35216,    42],
       device='cuda:0')


Training:  36%|███▌      | 360/1000 [01:52<03:02,  3.51it/s, loss=5.81]

tensor([ 42,  28,  28, 198, 260, 198,  28,  28, 198, 260], device='cuda:0')
tensor([ 318, 4699,  198, 2068,  963,  451, 1861,   30, 1626,   29],
       device='cuda:0')


Training:  36%|███▌      | 361/1000 [01:52<03:00,  3.55it/s, loss=5.6] 

tensor([198,   0,  28,  42,  28, 198, 339, 260, 198,  42], device='cuda:0')
tensor([   88,  9535,  2751,  1352,   198,  3528,  7471,   253, 40590, 27044],
       device='cuda:0')


Training:  36%|███▌      | 362/1000 [01:52<02:58,  3.57it/s, loss=5.59]

tensor([198,  28, 198, 339, 457,  28, 260, 198, 457, 260], device='cuda:0')
tensor([1222,   28,  347,  339, 2422,  411,   28,  339, 1217,  665],
       device='cuda:0')


Training:  36%|███▋      | 363/1000 [01:53<02:59,  3.54it/s, loss=5.51]

tensor([  42,   28,  198,   28,  198,  198,  198, 2680,   42,  198],
       device='cuda:0')
tensor([  589,    28,  7706,    47,   198,   198,    56,  2680, 24625, 33702],
       device='cuda:0')


Training:  36%|███▋      | 364/1000 [01:53<02:59,  3.55it/s, loss=5.61]

tensor([ 28, 198, 198, 457, 339, 198,  28, 198, 198,  28], device='cuda:0')
tensor([   30,   198,  2683,   457,   253,  4132,    28,   330, 13468,   290],
       device='cuda:0')


Training:  36%|███▋      | 365/1000 [01:53<02:56,  3.59it/s, loss=5.84]

tensor([ 198,  198,   28,  198,   28,   28,  260, 4314, 6737,   28],
       device='cuda:0')
tensor([  198, 28925,   260, 14568,  6956,   282,   278,  4314,   358,    43],
       device='cuda:0')


Training:  37%|███▋      | 366/1000 [01:53<02:54,  3.63it/s, loss=5.83]

tensor([ 42,  28,  28,  28, 260,  28, 260,  28, 198, 198], device='cuda:0')
tensor([ 1319,   549,  3287,   282,  3878,   564, 20727,    30,   198,  2705],
       device='cuda:0')


Training:  37%|███▋      | 367/1000 [01:54<02:52,  3.66it/s, loss=5.6] 

tensor([6737,   28,  260,   28,  198,  339,  339,   28,  282,  260],
       device='cuda:0')
tensor([1038,  564, 2112,  198, 3528,  338, 1165, 1743,  282,  260],
       device='cuda:0')


Training:  37%|███▋      | 368/1000 [01:54<02:52,  3.66it/s, loss=5.93]

tensor([ 198,  198,  198, 2680,   42,  198,  198,  198,  198,  198],
       device='cuda:0')
tensor([  198,   198,    56,  2680, 24625,   389,  7113,  4728,    50,  3911],
       device='cuda:0')


Training:  37%|███▋      | 369/1000 [01:54<02:53,  3.64it/s, loss=5.46]

tensor([ 28, 198, 260, 198,  28, 198, 325, 198, 198, 260], device='cuda:0')
tensor([   28,   284,   650,  1911,   198,  2068, 17831,  2754,   282,   469],
       device='cuda:0')


Training:  37%|███▋      | 370/1000 [01:54<02:53,  3.62it/s, loss=5.57]

tensor([198, 457, 325, 260, 198, 260,  28,  28, 198, 198], device='cuda:0')
tensor([ 339, 3060, 1928,   28,  284, 6819, 1147,   43,  198, 2193],
       device='cuda:0')


Training:  37%|███▋      | 371/1000 [01:55<02:54,  3.61it/s, loss=5.48]

tensor([198, 198, 198,  28,  28, 198, 260, 260, 198,  28], device='cuda:0')
tensor([   28,   198, 22204,  3930,    28,   359,   511,   260,  2321,  8770],
       device='cuda:0')


Training:  37%|███▋      | 372/1000 [01:55<02:57,  3.54it/s, loss=5.67]

tensor([   0,   28,  260,   28,  198,  271,  339,   28, 4197,   28],
       device='cuda:0')
tensor([  524,   284, 23414,   198,    68,   388, 17072,   260,  8423,    29],
       device='cuda:0')


Training:  37%|███▋      | 373/1000 [01:55<02:57,  3.54it/s, loss=5.52]

tensor([198,  42,  42, 198,  28, 339,  28, 260, 260, 198], device='cuda:0')
tensor([  60, 4768,   28,  346,  338, 1535,  359,  656,  653, 8641],
       device='cuda:0')


Training:  37%|███▋      | 374/1000 [01:55<02:57,  3.54it/s, loss=5.52]

tensor([   28, 21723,    28,   100,   198,   198,   260,    42,     0,   198],
       device='cuda:0')
tensor([ 957, 8664,  506, 9596,  198, 2068, 3423,   85,  936,  260],
       device='cuda:0')


Training:  38%|███▊      | 375/1000 [01:56<02:59,  3.48it/s, loss=5.55]

tensor([   28,   339,    28,   260,   198,   198, 21723,    28,   198,     0],
       device='cuda:0')
tensor([ 536,  441,  963,   28,  198, 5965, 3506, 8739,  267, 1878],
       device='cuda:0')


Training:  38%|███▊      | 376/1000 [01:56<02:58,  3.50it/s, loss=5.55]

tensor([ 42, 198, 198,  28, 260, 457,  28, 198, 260,  28], device='cuda:0')
tensor([   43,   198, 14344,  1928,   339,  8949,    28,   284,  3310,  1188],
       device='cuda:0')


Training:  38%|███▊      | 377/1000 [01:56<03:02,  3.41it/s, loss=5.24]

tensor([ 28, 198, 198,  28, 198, 260,  28,  28, 198, 260], device='cuda:0')
tensor([   30,   198,  2696,    28,   411,  8949, 19461,    28,   411,   957],
       device='cuda:0')


Training:  38%|███▊      | 378/1000 [01:57<03:03,  3.39it/s, loss=5.44]

tensor([ 28, 198, 339,  28,  28, 100,  28, 198, 198,  28], device='cuda:0')
tensor([   28,   347, 17072,  1012,   506,   100,    28, 13987,  8135,  1301],
       device='cuda:0')


Training:  38%|███▊      | 379/1000 [01:57<03:00,  3.43it/s, loss=5.58]

tensor([ 198, 4728,  198,  198, 9620,   42,  198,  198,  260,  260],
       device='cuda:0')
tensor([7113, 4728,   50, 3911, 9620,   42,  198, 3825,  511,  957],
       device='cuda:0')


Training:  38%|███▊      | 380/1000 [01:57<02:59,  3.45it/s, loss=5.58]

tensor([314,  28,  28,  28, 260, 260, 339,  28,  28, 198], device='cuda:0')
tensor([ 787,  555, 2093, 2216,  564,  338, 8177, 2767,  198, 5195],
       device='cuda:0')


Training:  38%|███▊      | 381/1000 [01:58<03:02,  3.39it/s, loss=5.46]

tensor([ 28,  28, 339, 339,  28,  28, 198, 198, 339, 339], device='cuda:0')
tensor([ 2164,   338,   536,  8018,   737,    28,   198, 27737,   536,   339],
       device='cuda:0')


Training:  38%|███▊      | 382/1000 [01:58<03:08,  3.28it/s, loss=5.55]

tensor([ 28,  28, 260, 198,  28, 198, 198, 260,  28, 260], device='cuda:0')
tensor([ 5852,   429,   469, 15062,    28,   198,  3280,  1556,   282, 19888],
       device='cuda:0')


Training:  38%|███▊      | 383/1000 [01:58<03:08,  3.28it/s, loss=5.59]

tensor([282,  28, 198, 198,  42,  28, 260, 260, 198,  28], device='cuda:0')
tensor([ 2717,    17,   198, 11114,  1063,   282,  1022,    28,  5681,  7864],
       device='cuda:0')


Training:  38%|███▊      | 384/1000 [01:58<03:05,  3.33it/s, loss=5.59]

tensor([ 28, 325,  28, 198, 198,  28, 260,  28,  28, 198], device='cuda:0')
tensor([ 3786,   325,    42,   198, 14413,   335,   540,  1737,    28,   282],
       device='cuda:0')


Training:  38%|███▊      | 385/1000 [01:59<03:05,  3.31it/s, loss=5.66]

tensor([   42,    28,   198,   198,   198, 20055,   339,   260,   198,    28],
       device='cuda:0')
tensor([17264,    17,   423,   198,    71,   518,   282,   354,  1850,    42],
       device='cuda:0')


Training:  39%|███▊      | 386/1000 [01:59<03:04,  3.32it/s, loss=5.57]

tensor([  0, 253,  28,   0,  28, 198, 198, 457, 325,  28], device='cuda:0')
tensor([  715, 11415,  1088,   383,    42,   198,  1882,  3060,   457,   787],
       device='cuda:0')


Training:  39%|███▊      | 387/1000 [01:59<03:03,  3.34it/s, loss=5.57]

tensor([ 28,  28, 260, 260, 198, 198, 260,  28,  28, 260], device='cuda:0')
tensor([  808,   288,  7012,    28,   198, 19671,   601,  1554,   282,  1123],
       device='cuda:0')


Training:  39%|███▉      | 388/1000 [02:00<03:01,  3.38it/s, loss=5.57]

tensor([ 339,  339,   28,   28,  198,  198,  260, 4197,   28,   28],
       device='cuda:0')
tensor([  585, 17072,  2265,    28,   198, 21175,   253,  3506,  4313,   284],
       device='cuda:0')


Training:  39%|███▉      | 389/1000 [02:00<02:59,  3.40it/s, loss=5.67]

tensor([ 28,  42,  28,  28, 198, 198,  28,   0, 198,  42], device='cuda:0')
tensor([   90,   424,  1147,    30,   198, 46292,    95,    17,   948,  1970],
       device='cuda:0')


Training:  39%|███▉      | 390/1000 [02:00<02:56,  3.47it/s, loss=5.33]

tensor([  28,  457,   28, 4197,   28,  198,  198,  198,   28,   28],
       device='cuda:0')
tensor([  339,   699,   260,  2139,    42,   198,  9393, 17072,   441, 30493],
       device='cuda:0')


Training:  39%|███▉      | 391/1000 [02:00<02:54,  3.50it/s, loss=5.33]

tensor([   28,    28, 21723,    28,    28,  4197,    28,   198,   198,   198],
       device='cuda:0')
tensor([20322,   957, 21723,  2126,   260,   905,    30,   198,   198,    62],
       device='cuda:0')


Training:  39%|███▉      | 392/1000 [02:01<02:56,  3.44it/s, loss=5.64]

tensor([ 28, 198,  28,  28, 198,  28, 198, 198, 198, 198], device='cuda:0')
tensor([21965, 17072,   719,    28,   965,    47,   198,   198,  3911,  3945],
       device='cuda:0')


Training:  39%|███▉      | 393/1000 [02:01<02:59,  3.37it/s, loss=5.69]

tensor([  28,   28,  198,   28,  260, 4197,   42,   42,  198,   28],
       device='cuda:0')
tensor([  874,    43,  9725,    95,   253, 12724, 13285,    43, 41916,   253],
       device='cuda:0')


Training:  39%|███▉      | 394/1000 [02:02<03:01,  3.35it/s, loss=5.51]

tensor([ 28, 260,  28,  42,  28,  28, 198, 198, 198, 198], device='cuda:0')
tensor([ 282,  650,  651,  456,  105,   47,  198,  198, 3911, 3945],
       device='cuda:0')


Training:  40%|███▉      | 395/1000 [02:02<02:57,  3.40it/s, loss=5.39]

tensor([100,  28,  28,  28, 198, 198, 339,  28, 253,  28], device='cuda:0')
tensor([  100, 17072,  8177,    47,   198, 12908,  3661,   325,  8177,    28],
       device='cuda:0')


Training:  40%|███▉      | 396/1000 [02:02<02:55,  3.43it/s, loss=5.54]

tensor([   28,    28,    28,   198,   198,   198, 21723,    28,    28,   260],
       device='cuda:0')
tensor([ 337, 4048,   43,  198, 2596,  957, 2629, 2606,  314, 5951],
       device='cuda:0')


Training:  40%|███▉      | 397/1000 [02:02<02:56,  3.43it/s, loss=5.54]

tensor([2680,  198,   42,   42,  198,  198,   28,  198,   28,  198],
       device='cuda:0')
tensor([31540,  4628,  8772,    42,   198, 34056,    28, 38061,    28,   469],
       device='cuda:0')


Training:  40%|███▉      | 398/1000 [02:03<02:57,  3.39it/s, loss=5.57]

tensor([ 28, 260, 339,  28, 198, 198, 314, 260, 198,  28], device='cuda:0')
tensor([1607,  284, 6917,   30,  198, 1348,  314,  260, 4569,   28],
       device='cuda:0')


Training:  40%|███▉      | 399/1000 [02:03<02:57,  3.38it/s, loss=5.55]

tensor([ 28, 260,  28, 198, 198, 198,  28,  42, 198, 198], device='cuda:0')
tensor([  359,  7270,    47,   198,   198,    62, 10942,    42,   198, 35251],
       device='cuda:0')


Training:  40%|████      | 400/1000 [02:03<02:56,  3.39it/s, loss=5.55]

tensor([198,  28, 260,  28, 339,  28, 198, 198, 198, 198], device='cuda:0')
tensor([ 905,  314, 2932,  284, 3599,   30,  198,  198, 3911, 3945],
       device='cuda:0')


Training:  40%|████      | 401/1000 [02:04<02:56,  3.40it/s, loss=5.45]

tensor([ 28,  28,  28,  28, 198, 198, 198,  42, 198,  42], device='cuda:0')
tensor([13735,   739, 34104,    30,   198,   198,    54,  5135,  2285, 18630],
       device='cuda:0')


Training:  40%|████      | 402/1000 [02:04<02:55,  3.41it/s, loss=5.45]

tensor([ 28,  28, 457, 260,  28,  28,  28, 198, 198, 260], device='cuda:0')
tensor([ 2275,   392,  3408,  1272,  1176,  8322,    28,   198, 16721,   653],
       device='cuda:0')


Training:  40%|████      | 403/1000 [02:04<02:54,  3.42it/s, loss=5.31]

tensor([198, 198,  28, 198,  42,  42,  57,  42, 198, 198], device='cuda:0')
tensor([  198,    60,  3907,    73, 29146,  6565,  3438,    42,   198,  5195],
       device='cuda:0')


Training:  40%|████      | 404/1000 [02:04<02:55,  3.39it/s, loss=5.14]

tensor([6737,   28,  198,  198, 3060,   28,   28,   28,  198,  198],
       device='cuda:0')
tensor([  313,    42,   198,    57,  3287,   787, 33974,    30,   198,   198],
       device='cuda:0')


Training:  40%|████      | 405/1000 [02:05<02:51,  3.47it/s, loss=5.4] 

tensor([  57, 3060,   42,  198,  198,  325,   28,   28,  198, 3060],
       device='cuda:0')
tensor([  57, 3438,   42,  198, 2068, 2988,  338,   28,  339,  868],
       device='cuda:0')


Training:  41%|████      | 406/1000 [02:05<02:50,  3.48it/s, loss=5.25]

tensor([260,  28, 325,  28, 198, 198, 449, 339,  28, 260], device='cuda:0')
tensor([17072, 36034, 22576,    28,   198,  4370,   449, 30493,   411,   957],
       device='cuda:0')


Training:  41%|████      | 407/1000 [02:05<02:52,  3.45it/s, loss=5.25]

tensor([  28,   28,  198,  198,  449,  339, 3060,   28,  198,   28],
       device='cuda:0')
tensor([1225,   17,  198, 4370,  449,  339,  441,   28,  965,   28],
       device='cuda:0')


Training:  41%|████      | 408/1000 [02:06<02:55,  3.38it/s, loss=5.6] 

tensor([ 28, 260,  28, 198, 253,  28, 260,  28, 198, 198], device='cuda:0')
tensor([ 288,  685,   28,  564, 2093,  288, 1003,   30,  198,   63],
       device='cuda:0')


Training:  41%|████      | 409/1000 [02:06<02:53,  3.41it/s, loss=5.72]

tensor([ 28,  28, 253, 198,  28,  28, 198, 198, 198,  28], device='cuda:0')
tensor([  331, 26365,   253,  6243,  2139,    30,   198,   198, 43029,  1754],
       device='cuda:0')


Training:  41%|████      | 410/1000 [02:06<02:50,  3.46it/s, loss=5.56]

tensor([   0,   28,   28,   28,  198,  198,  198, 3945,   63,   42],
       device='cuda:0')
tensor([1503,  346, 5173,   30,  198,  198, 3911, 3945,   63,   42],
       device='cuda:0')


Training:  41%|████      | 411/1000 [02:06<02:48,  3.49it/s, loss=5.56]

tensor([ 28,  28,  28, 198,  28,  28, 198,  28,  28, 198], device='cuda:0')
tensor([ 9154,  5569,    28, 13804,   441,   253, 22415,   555,    43,   198],
       device='cuda:0')


Training:  41%|████      | 412/1000 [02:07<02:49,  3.47it/s, loss=5.43]

tensor([ 28, 198,  42, 198,  28, 260, 339,  28, 198, 260], device='cuda:0')
tensor([   30, 14755,    28,  1690,   429,   338, 10172,   198,  6228,  2112],
       device='cuda:0')


Training:  41%|████▏     | 413/1000 [02:07<02:46,  3.52it/s, loss=5.48]

tensor([ 28, 260,  28, 260,  28,  28,  28, 198, 198,  28], device='cuda:0')
tensor([7200,  874,  429,  451, 1796, 6724,   28,  198, 6882,  281],
       device='cuda:0')


Training:  41%|████▏     | 414/1000 [02:07<02:45,  3.53it/s, loss=5.48]

tensor([  42,   28,   28,  198,  198, 3060,   28,  260,   28,  198],
       device='cuda:0')
tensor([  315,   851,    28,   198,    57, 14941,   411, 10419,   623,  3497],
       device='cuda:0')


Training:  42%|████▏     | 415/1000 [02:08<02:47,  3.49it/s, loss=5.1] 

tensor([ 346,  198,  198, 3060,  339, 4197,   28,  260,   28, 4197],
       device='cuda:0')
tensor([   42,   198,    57,   744,   260,  4132,   282,  6661,   260, 25729],
       device='cuda:0')


Training:  42%|████▏     | 416/1000 [02:08<02:48,  3.46it/s, loss=5.1]

tensor([ 28,  28,  28,  28, 198, 198, 339,  28,  28, 339], device='cuda:0')
tensor([ 254,  349,  269,   30,  198, 2596,  325,  357,  347,  357],
       device='cuda:0')


Training:  42%|████▏     | 417/1000 [02:08<02:50,  3.43it/s, loss=5.26]

tensor([ 28,  28, 198, 198, 198,  42,  42, 198, 198, 260], device='cuda:0')
tensor([ 1448,    30,   198,   198,    73, 25089,    42,   198,  9629,   732],
       device='cuda:0')


Training:  42%|████▏     | 418/1000 [02:08<02:47,  3.47it/s, loss=5.61]

tensor([  28,  260, 6737,   28, 4197,   28,  339,  198,   28,  325],
       device='cuda:0')
tensor([ 2853,    96,  6737,   260,  7429,   837, 13987,  1924,   868,  4137],
       device='cuda:0')


Training:  42%|████▏     | 419/1000 [02:09<02:46,  3.50it/s, loss=5.55]

tensor([ 28, 198, 198, 198,  42, 198,  42,  42, 198, 198], device='cuda:0')
tensor([   30,   198,   198,  7430,  8107,    54, 10030,    42,   198, 45496],
       device='cuda:0')


Training:  42%|████▏     | 420/1000 [02:09<02:42,  3.56it/s, loss=5.48]

tensor([   28,    28,   198,   339,    28,    28,  3060,    28,    28, 21723],
       device='cuda:0')
tensor([  588,   198,  5195, 12765,   416,   339,  1089,  3806,   957,  3497],
       device='cuda:0')


Training:  42%|████▏     | 421/1000 [02:09<02:40,  3.61it/s, loss=5.63]

tensor([  28,  282,  314, 3060,   28,  198,  198, 4197,   28,   28],
       device='cuda:0')
tensor([ 4196,   384,   392,   437,    28,   198,   504, 42502, 15731,  4502],
       device='cuda:0')


Training:  42%|████▏     | 422/1000 [02:10<02:37,  3.67it/s, loss=5.57]

tensor([6737,  260,   28,  198,   28,  260,   28,  198,  198,  198],
       device='cuda:0')
tensor([10937,  1209,   650, 26467,   282, 12669,    30,   198,   198, 29719],
       device='cuda:0')


Training:  42%|████▏     | 423/1000 [02:10<02:37,  3.67it/s, loss=5.18]

tensor([ 28, 260, 198,  28, 198, 198,  28, 260, 198,  28], device='cuda:0')
tensor([15032,   480, 10172,    28,   198, 16654,   259,   480,  1038,  2397],
       device='cuda:0')


Training:  42%|████▏     | 424/1000 [02:10<02:38,  3.63it/s, loss=5.46]

tensor([  260,   260,    28,    28,   198,   198,   198,  2680, 24625,   198],
       device='cuda:0')
tensor([ 4875,   549,  3287,    30,   198,   198, 41074,  2680, 35135,    55],
       device='cuda:0')


Training:  42%|████▎     | 425/1000 [02:10<02:37,  3.65it/s, loss=5.57]

tensor([ 28, 260, 381,  28, 260,  42,  28,  28,  28, 198], device='cuda:0')
tensor([  281,   544,   381,   411,  4360,  4565,  1802, 11064,    47,   198],
       device='cuda:0')


Training:  43%|████▎     | 426/1000 [02:11<02:35,  3.70it/s, loss=5.37]

tensor([ 28,  28, 198, 198, 339,  28, 198,  28, 198, 260], device='cuda:0')
tensor([ 4463,    28,   198,  2427,  1176,    28, 37494,    28,   284, 33974],
       device='cuda:0')


Training:  43%|████▎     | 427/1000 [02:11<02:35,  3.69it/s, loss=5.52]

tensor([  260,   260, 21723,    28,   198,   198, 21723,    28,   260,    28],
       device='cuda:0')
tensor([  351,   957,  8664,    17,   198,  5965,  2606,   284,  3696, 36644],
       device='cuda:0')


Training:  43%|████▎     | 428/1000 [02:11<02:36,  3.65it/s, loss=5.27]

tensor([   28,    28,    28,   198,   198,   339,   198,   260,    28, 21723],
       device='cuda:0')
tensor([ 2874,  1592,    43,   198,  3528,    28,  1953,  4649, 13987,  3289],
       device='cuda:0')


Training:  43%|████▎     | 429/1000 [02:11<02:38,  3.59it/s, loss=5.27]

tensor([ 28, 325,  28,  28,  28, 198, 325,  28,  28, 260], device='cuda:0')
tensor([1251,  325, 2627,  277,  198, 2068,  685, 1683,  351,  468],
       device='cuda:0')


Training:  43%|████▎     | 430/1000 [02:12<02:44,  3.46it/s, loss=4.75]

tensor([  198,  3907,    42,    42,   198,    42,   198,   198, 21723,    28],
       device='cuda:0')
tensor([   60,  3907,    73, 23675,    73,    42,   198,  5965,  2606, 10937],
       device='cuda:0')


Training:  43%|████▎     | 431/1000 [02:12<02:44,  3.46it/s, loss=5.57]

tensor([  28,   28,   42,   28,   28,  260,   28, 4314,   28,  198],
       device='cuda:0')
tensor([16568,  8767,   301,  2177,   351,   634,   278,  6842,    28,   198],
       device='cuda:0')


Training:  43%|████▎     | 432/1000 [02:12<02:41,  3.51it/s, loss=5.53]

tensor([   28,   198,   198,   198,    71, 20055,    42,   198,   198,   260],
       device='cuda:0')
tensor([   30,   198,   198, 29719,    71, 27163,    42,   198,  5212, 14942],
       device='cuda:0')


Training:  43%|████▎     | 433/1000 [02:13<02:40,  3.54it/s, loss=5.34]

tensor([   28,   198,   198,   198,  4728,   198,   198, 20055,    42,    42],
       device='cuda:0')
tensor([   30,   198,   198,    59,  4728, 19436,    71,  2721, 38744,    42],
       device='cuda:0')


Training:  43%|████▎     | 434/1000 [02:13<02:38,  3.57it/s, loss=5.24]

tensor([ 28, 198, 260, 260,  28,  28,  28, 198, 198,  28], device='cuda:0')
tensor([   28,   284, 18089,  7065,  9446,  5182,    43,   198,  3528,    28],
       device='cuda:0')


Training:  44%|████▎     | 435/1000 [02:13<02:36,  3.61it/s, loss=5.16]

tensor([4197,  282,   28,   28,  260,  198,   28,   28,  100,   28],
       device='cuda:0')
tensor([ 5151, 11948,  5559,   335,   469,   725,  6221,   506,  4132,    28],
       device='cuda:0')


Training:  44%|████▎     | 436/1000 [02:13<02:34,  3.65it/s, loss=5.31]

tensor([  28,  260, 3060,   28,  260,   28,  198,  198,   28,   28],
       device='cuda:0')
tensor([ 744,  339, 2090,  282, 9970,   30,  198, 2696, 3472, 1303],
       device='cuda:0')


Training:  44%|████▎     | 437/1000 [02:14<02:36,  3.59it/s, loss=5.31]

tensor([   28,   198,   198,   198,  4728,   198,    71, 20055,    42,    42],
       device='cuda:0')
tensor([   30,   198,   198,    59,  4728, 15494,    71, 20055,  9536,    42],
       device='cuda:0')


Training:  44%|████▍     | 438/1000 [02:14<02:38,  3.54it/s, loss=5.27]

tensor([  28,   28,  260,    0,   42,  198, 3060,   28,  198, 4197],
       device='cuda:0')
tensor([4658,  282, 4929, 9686,  198, 4948, 3100, 6616,  480, 2184],
       device='cuda:0')


Training:  44%|████▍     | 439/1000 [02:14<02:35,  3.60it/s, loss=5.16]

tensor([ 260, 4197,   28,  198,  198,   28,   28,   28,   28,    0],
       device='cuda:0')
tensor([  260,  1532,    28,   198, 16075,  3310,  1869, 18598,   283,   494],
       device='cuda:0')


Training:  44%|████▍     | 440/1000 [02:14<02:39,  3.51it/s, loss=5.16]

tensor([  198,   198,   198,    71, 20055,    42,   198,   198,    28,    28],
       device='cuda:0')
tensor([  198,   198, 29719,    71, 27163,    42,   198,  1780,  8296,    28],
       device='cuda:0')


Training:  44%|████▍     | 441/1000 [02:15<02:38,  3.54it/s, loss=5.09]

tensor([  28,   28,   23,  260, 4197,   28,  282,  198,  198,  494],
       device='cuda:0')
tensor([17072,   263,   432,   260,  2240, 17816,    28,   198, 15024,   494],
       device='cuda:0')


Training:  44%|████▍     | 442/1000 [02:15<02:36,  3.58it/s, loss=5.43]

tensor([  28,   28, 4197,   28,  260,   28,  198,  198,   28,  325],
       device='cuda:0')
tensor([3934,  260, 3102,  284, 3568,   47,  198, 6882, 3786,  392],
       device='cuda:0')


Training:  44%|████▍     | 443/1000 [02:15<02:34,  3.60it/s, loss=5.34]

tensor([  28,   28,  260,  253, 4197,   28,  198,  198,  260,  260],
       device='cuda:0')
tensor([   93,   284,   702,   253, 27508, 18030,   198,  3825, 12885,   739],
       device='cuda:0')


Training:  44%|████▍     | 444/1000 [02:16<02:36,  3.56it/s, loss=5.34]

tensor([ 28, 260,  28,  28, 198, 198,  28, 198, 302,  28], device='cuda:0')
tensor([4875, 1272, 3287,   30,  198, 1780,   17,  416,  588, 1805],
       device='cuda:0')


Training:  44%|████▍     | 445/1000 [02:16<02:38,  3.50it/s, loss=5.31]

tensor([   28,   198,   198,    28,   260,     0, 17072,    28,   198,    28],
       device='cuda:0')
tensor([  28,  198, 3528, 1062, 3005,  474, 2442,   43, 3472,   28],
       device='cuda:0')


Training:  45%|████▍     | 446/1000 [02:16<02:36,  3.55it/s, loss=5.82]

tensor([ 260,  100, 4197,   28,  198,  198,    0,  198, 6737,   28],
       device='cuda:0')
tensor([  506,  9202, 15308,    28,   198,  3733,    29,    96,  2467,   821],
       device='cuda:0')


Training:  45%|████▍     | 447/1000 [02:17<02:33,  3.61it/s, loss=5.55]

tensor([   28,    28,   198,   339,   260,   260,    28,   198, 21723,    28],
       device='cuda:0')
tensor([ 1163,   198, 26001,   325,  1042,   614,    28,   957,  5717,    28],
       device='cuda:0')


Training:  45%|████▍     | 448/1000 [02:17<02:33,  3.59it/s, loss=5.59]

tensor([6737,  198,  198,  198,   42,   28,   42,  198,  198,   28],
       device='cuda:0')
tensor([   47,   198,   198,  4796, 32598,  2097,    42,   198, 19525,    28],
       device='cuda:0')


Training:  45%|████▍     | 449/1000 [02:17<02:34,  3.57it/s, loss=5.7] 

tensor([4197,   28,  198,   28,   28,  198,  339, 4197,   28,   28],
       device='cuda:0')
tensor([  599,    28,  3449,  2843,   198, 20314,   650,    99,   549,   288],
       device='cuda:0')


Training:  45%|████▌     | 450/1000 [02:17<02:33,  3.57it/s, loss=5.67]

tensor([198,  28,  28, 260, 260,  28, 198, 198, 198, 198], device='cuda:0')
tensor([ 1643,   982,   284, 16011,   982,    30,   198,   198, 39776,  4192],
       device='cuda:0')


Training:  45%|████▌     | 451/1000 [02:18<02:32,  3.60it/s, loss=5.49]

tensor([  28,  198,  198,  198,  198, 6426,   42,  198,  198,   28],
       device='cuda:0')
tensor([   47,   198,   198, 39776,  4192,  6426,    42,   198,  3482,  3861],
       device='cuda:0')


Training:  45%|████▌     | 452/1000 [02:18<02:34,  3.55it/s, loss=5.49]

tensor([  198,   198,    28,   260, 21723,    28,   198,   260,   260,   198],
       device='cuda:0')
tensor([14229, 12557,   325,   957,  3289,    28,   284,  4160,   198,   504],
       device='cuda:0')


Training:  45%|████▌     | 453/1000 [02:18<02:37,  3.48it/s, loss=5.49]

tensor([549,  28, 325, 260, 198, 198,  28, 100, 198,  28], device='cuda:0')
tensor([ 357,  868,  325,   28,  198, 8113,  506,  354, 2490,   85],
       device='cuda:0')


Training:  45%|████▌     | 454/1000 [02:19<02:35,  3.52it/s, loss=5.5] 

tensor([260,  28, 198, 339,  42,  28,  28,  28, 198,  28], device='cuda:0')
tensor([  346,    28,   355,   328,   101,   432, 31886,   198,  6882, 43624],
       device='cuda:0')


Training:  46%|████▌     | 455/1000 [02:19<02:34,  3.53it/s, loss=5.3]

tensor([ 28, 198,  42,  28, 260,  28,  42,  28, 198,  28], device='cuda:0')
tensor([  198,    54,  4985,   284,   430, 33659,  1718,    28,   441,   253],
       device='cuda:0')


Training:  46%|████▌     | 456/1000 [02:19<02:33,  3.53it/s, loss=5.3]

tensor([  28,  198,  198,  198, 2680, 2680,   42,   42,  198,  198],
       device='cuda:0')
tensor([   17,   198,   198, 17321,  5819,  2154,  4501,    42,   198,    57],
       device='cuda:0')


Training:  46%|████▌     | 457/1000 [02:19<02:32,  3.57it/s, loss=5.72]

tensor([  28,  198,  198,  100,   28,   28,  198,  260, 4197,   28],
       device='cuda:0')
tensor([   28,   963,   506,   100, 17072,    17,   327,   260,  2319,   198],
       device='cuda:0')


Training:  46%|████▌     | 458/1000 [02:20<02:30,  3.59it/s, loss=5.43]

tensor([  42,   28,  198,  339, 3060,  198,  260,  198,  260,   28],
       device='cuda:0')
tensor([15684,    28,   527,   339,   736, 20172,    42,   327, 19461,    28],
       device='cuda:0')


Training:  46%|████▌     | 459/1000 [02:20<02:30,  3.59it/s, loss=5.55]

tensor([ 42,  28,  28, 198, 198, 198,  42,  42, 198, 198], device='cuda:0')
tensor([  313,  2694,    30,   198,   198, 36169,   403,    42,   198,  5965],
       device='cuda:0')


Training:  46%|████▌     | 460/1000 [02:20<02:30,  3.59it/s, loss=5.84]

tensor([21723,    28,    28,   198,   198,    28,   260,   198,    28,   198],
       device='cuda:0')
tensor([ 1038, 21723,    28,   198, 10576,   314,  3590,  1147,    42,  1188],
       device='cuda:0')


Training:  46%|████▌     | 461/1000 [02:20<02:29,  3.60it/s, loss=5.72]

tensor([6737,  198,  198,   28,   28,  198,   28,    0,  198,  198],
       device='cuda:0')
tensor([   42,   198, 31000,  9677,    28,  3092,    81,    17,   198,   198],
       device='cuda:0')


Training:  46%|████▌     | 462/1000 [02:21<02:29,  3.59it/s, loss=5.95]

tensor([   0,   28,   28,   28,   28,   28,  198, 4197,   28,  260],
       device='cuda:0')
tensor([17926, 24113, 20322,   750,  2775,    43,   253,  2112,   288,   198],
       device='cuda:0')


Training:  46%|████▋     | 463/1000 [02:21<02:31,  3.56it/s, loss=5.95]

tensor([ 339,  339,  198, 3060,  260,  260,  260, 4197,   28,   93],
       device='cuda:0')
tensor([  744,   198,    57,   288,  4571,   327,   653, 11108,    29,  6691],
       device='cuda:0')


Training:  46%|████▋     | 464/1000 [02:21<02:32,  3.52it/s, loss=5.68]

tensor([21723, 21723,    28,   198,   198,   325,    42,    28,   260,   198],
       device='cuda:0')
tensor([47910, 21723,    28,   198,  2068,   410,   489,   418,   469, 23454],
       device='cuda:0')


Training:  46%|████▋     | 465/1000 [02:22<02:30,  3.56it/s, loss=5.67]

tensor([   28,  3060,   441,    28,   198,   314,    28,   325,   198, 34818],
       device='cuda:0')
tensor([  339,   736,  6796,   198,  1348,  5569,   868,  1643,   637, 10372],
       device='cuda:0')


Training:  47%|████▋     | 466/1000 [02:22<02:31,  3.53it/s, loss=5.67]

tensor([ 28,  42, 198, 198, 198, 198,  42, 198, 198, 314], device='cuda:0')
tensor([ 1756,    30,   198,   198, 36169,   403,    42,   198,  3681, 26365],
       device='cuda:0')


Training:  47%|████▋     | 467/1000 [02:22<02:35,  3.43it/s, loss=5.91]

tensor([ 28, 198, 198,  28,  28, 198, 198, 198,  28, 198], device='cuda:0')
tensor([  253,  1035,  5740,   582,    30,   198,   198, 12850,  4545,    49],
       device='cuda:0')


Training:  47%|████▋     | 468/1000 [02:23<02:38,  3.37it/s, loss=5.27]

tensor([  28,   28,  198,  198,  198,  198, 6426,   42,  198,  198],
       device='cuda:0')
tensor([ 6737,    30,   198,   198, 39776,  4192,  6426,    42,   198,  1348],
       device='cuda:0')


Training:  47%|████▋     | 469/1000 [02:23<02:35,  3.41it/s, loss=5.39]

tensor([ 28, 198,  28,  28, 198,  28,  28, 198, 198,  28], device='cuda:0')
tensor([ 253, 4565,  477,   43, 7223, 6737,   28,  198, 2596, 3878],
       device='cuda:0')


Training:  47%|████▋     | 470/1000 [02:23<02:32,  3.48it/s, loss=5.56]

tensor([ 28,  28,  28, 198,  42,  28, 260,  28, 198,  28], device='cuda:0')
tensor([ 550, 1075,  198,   54, 1738,  670, 2275,  355,  655,   30],
       device='cuda:0')


Training:  47%|████▋     | 471/1000 [02:23<02:32,  3.47it/s, loss=5.56]

tensor([  42,   28,  198,  198,  198, 4192, 6426,   42,  198,  198],
       device='cuda:0')
tensor([ 2001,    30,   198,   198, 39776,  4192,  6426,    42,   198,    62],
       device='cuda:0')


Training:  47%|████▋     | 472/1000 [02:24<02:34,  3.41it/s, loss=5.41]

tensor([  198,   314,    42,   198,   198, 21723,    28,   198,    28,   198],
       device='cuda:0')
tensor([ 8113, 21660,    42,   198,  5965,  1861,    28, 38061,    28,   314],
       device='cuda:0')


Training:  47%|████▋     | 473/1000 [02:24<02:33,  3.44it/s, loss=5.41]

tensor([ 28, 339, 314, 198,  28, 198, 314, 260, 198, 260], device='cuda:0')
tensor([  347,   384, 48271,   468,    42,   384,   436,  2711,   288,   536],
       device='cuda:0')


Training:  47%|████▋     | 474/1000 [02:24<02:30,  3.49it/s, loss=5.41]

tensor([ 198,  198,  198,  198,   42, 2680,   28,   42,  198,  198],
       device='cuda:0')
tensor([   30,   198,   198, 48902,  9232,  2680,  2097,    42,   198, 14229],
       device='cuda:0')


Training:  48%|████▊     | 475/1000 [02:24<02:29,  3.51it/s, loss=5.46]

tensor([   28,    28,   198, 21723,    28,    28,   198,  3060,   198,   260],
       device='cuda:0')
tensor([ 924,   42,  957, 1450, 4445,  198,   57,  457,  429,  469],
       device='cuda:0')


Training:  48%|████▊     | 476/1000 [02:25<02:24,  3.62it/s, loss=6.09]

tensor([339, 198, 198,  28, 198, 198, 260, 198, 198,  42], device='cuda:0')
tensor([ 7697,   355, 25514,    43,   564,   281,   260,   198,  1527,  2832],
       device='cuda:0')


Training:  48%|████▊     | 477/1000 [02:25<02:22,  3.66it/s, loss=6.09]

tensor([ 28, 260,  28, 260, 198,  28, 198, 462,  28, 260], device='cuda:0')
tensor([  281,  3826,   282,   198, 11247,    42,   295,  1878,   351,   511],
       device='cuda:0')


Training:  48%|████▊     | 478/1000 [02:25<02:25,  3.59it/s, loss=5.31]

tensor([ 42,  57,  42, 198, 198, 339, 314,  28,  28, 198], device='cuda:0')
tensor([ 6565, 38415,    42,   198,  1653,  1041,  4161,  8361,  1325,    28],
       device='cuda:0')


Training:  48%|████▊     | 479/1000 [02:26<02:23,  3.62it/s, loss=5.53]

tensor([ 28,  28, 198, 198,  28,  42,  28,  28, 198,  28], device='cuda:0')
tensor([ 1272,   198, 35097,  1029, 25865,  1687,   346,    30,  1206, 13461],
       device='cuda:0')


Training:  48%|████▊     | 480/1000 [02:26<02:22,  3.64it/s, loss=5.34]

tensor([21723,    28,   198,   325,   198,   339,   198, 21723,    28,   260],
       device='cuda:0')
tensor([4778,  198, 2068,  582,  338,  416,  957,  599,  281, 1272],
       device='cuda:0')


Training:  48%|████▊     | 481/1000 [02:26<02:23,  3.61it/s, loss=5.34]

tensor([  28,  198,  457,  253,   28,  198,  198,  198,    0, 8242],
       device='cuda:0')
tensor([   43,   339,   744,  2139,    30,   198,   198,    60, 15604,  8772],
       device='cuda:0')


Training:  48%|████▊     | 482/1000 [02:26<02:28,  3.49it/s, loss=5.63]

tensor([ 28,  28, 198, 198, 260,  28, 457,  28,  28, 260], device='cuda:0')
tensor([2428,   28,  198, 5212, 5337,  392, 2275,  357, 2220,  288],
       device='cuda:0')


Training:  48%|████▊     | 483/1000 [02:27<02:25,  3.55it/s, loss=5.57]

tensor([ 28,  28, 325, 457,  28,  28, 198, 198,  28, 198], device='cuda:0')
tensor([  424,  3786,   339,  8215,   346,    43,   198, 15017,    28,   451],
       device='cuda:0')


Training:  48%|████▊     | 484/1000 [02:27<02:25,  3.54it/s, loss=5.57]

tensor([457,  42, 198, 346,  28, 198, 198,   0,  28,  28], device='cuda:0')
tensor([  948, 30437,  9984,   346,    42,   198,  9482,   486,   549,   288],
       device='cuda:0')


Training:  48%|████▊     | 485/1000 [02:27<02:29,  3.44it/s, loss=5.45]

tensor([ 28,   0,  28, 198, 339, 198, 198,  23,  28, 198], device='cuda:0')
tensor([ 3275,  8806,    28,   564,  1035,   198, 12018,  8806,    28,   423],
       device='cuda:0')


Training:  49%|████▊     | 486/1000 [02:28<02:33,  3.36it/s, loss=5.32]

tensor([ 28,  28, 198,  28, 198, 339, 100,  28, 260,  28], device='cuda:0')
tensor([ 2016,    28,  1209,    28,   732,   506,  1690,  1980, 20322,    42],
       device='cuda:0')


Training:  49%|████▊     | 487/1000 [02:28<02:34,  3.31it/s, loss=5.36]

tensor([  0, 198, 198,  28, 198, 325, 260, 198, 325,  28], device='cuda:0')
tensor([  198, 11952, 11737,    43,   654,   325,   384,   523, 23599,    30],
       device='cuda:0')


Training:  49%|████▉     | 488/1000 [02:28<02:34,  3.32it/s, loss=5.05]

tensor([ 28,  28, 260,  28, 260,  28, 198, 198, 198, 253], device='cuda:0')
tensor([4109,  282,  732,  314, 2294,   43,  284,   28,  702,  253],
       device='cuda:0')


Training:  49%|████▉     | 489/1000 [02:28<02:34,  3.31it/s, loss=5.46]

tensor([ 28,  28, 198,  28, 260, 198,  28, 198, 339,  28], device='cuda:0')
tensor([12687,   198,  4590,   281,   260,  6411,    30,  3315,   549,   260],
       device='cuda:0')


Training:  49%|████▉     | 490/1000 [02:29<02:33,  3.33it/s, loss=5.23]

tensor([ 28,  42,  28, 198, 339, 314, 339,  28, 253, 198], device='cuda:0')
tensor([41063,   924,   198,  1653,  1041,   338,   384, 26365, 15270,  6737],
       device='cuda:0')


Training:  49%|████▉     | 491/1000 [02:29<02:31,  3.37it/s, loss=5.23]

tensor([ 28, 198, 198, 198,  42,  42,  42,  42, 198, 198], device='cuda:0')
tensor([   30,   198,   198,  2721,  5431,  4210,  5963,    42,   198, 34177],
       device='cuda:0')


Training:  49%|████▉     | 492/1000 [02:29<02:32,  3.33it/s, loss=5.46]

tensor([198, 198, 260,  28, 260,  28,  28,  28, 198, 339], device='cuda:0')
tensor([  198, 48111,   549,   288,  4875,   601,  3287,    28,   837,   339],
       device='cuda:0')


Training:  49%|████▉     | 493/1000 [02:30<02:32,  3.32it/s, loss=5.48]

tensor([  42,   28,  198,   28,  325,   28,  198,  198, 3060,  325],
       device='cuda:0')
tensor([11424,    28,   357,   868,  7219,    42,   198,    57,  3060,  5774],
       device='cuda:0')


Training:  49%|████▉     | 494/1000 [02:30<02:31,  3.34it/s, loss=5.61]

tensor([ 28, 198, 260,  42,  28, 260, 198, 198, 198,  42], device='cuda:0')
tensor([  28,  564,  716,  547,  441,   30,  198,  198, 2721, 5431],
       device='cuda:0')


Training:  50%|████▉     | 495/1000 [02:30<02:30,  3.36it/s, loss=5.61]

tensor([ 28,  28, 260,  28,  28, 198,  28, 260,  28, 198], device='cuda:0')
tensor([17072,   288,   451, 17324,    28,  2631,   284,  1341,    47,  1431],
       device='cuda:0')


Training:  50%|████▉     | 496/1000 [02:31<02:31,  3.33it/s, loss=5.39]

tensor([ 28, 198,  28, 260,  28, 198,  28, 198, 260,   0], device='cuda:0')
tensor([  260, 35278,   281,   346,    28, 48275,    28,   355,  4662,   894],
       device='cuda:0')


Training:  50%|████▉     | 497/1000 [02:31<02:27,  3.40it/s, loss=5.43]

tensor([  28,   28,  253,  198,   28,  198,  198,  198,   42, 4501],
       device='cuda:0')
tensor([27044,   436,   260, 48945,    47,   198,   198, 23110,  3206,  4501],
       device='cuda:0')


Training:  50%|████▉     | 498/1000 [02:31<02:24,  3.48it/s, loss=5.28]

tensor([   0,  253,  198,  198,  198,   69, 9620,  717, 2113,  198],
       device='cuda:0')
tensor([ 702,   30,  198,  198,   52,   69, 9620,  717, 2113,   51],
       device='cuda:0')


Training:  50%|████▉     | 499/1000 [02:31<02:24,  3.46it/s, loss=5.28]

tensor([ 23, 260,  28, 198, 198,  28, 198,  28, 198, 198], device='cuda:0')
tensor([  335,  1272,    47,   198, 10768,    28,   685,    28,  5697,  2747],
       device='cuda:0')


Training:  50%|█████     | 500/1000 [02:32<02:26,  3.40it/s, loss=5.15]

tensor([ 260,  325,   28,  198,   28,  198,  198,  198,   69, 9620],
       device='cuda:0')
tensor([  523,  3694,   260, 11611,    30,   198,   198,    52,    69,  9620],
       device='cuda:0')


Training:  50%|█████     | 501/1000 [02:32<02:28,  3.37it/s, loss=5.53]

tensor([  339,   260,   282, 21723,    28,   260,     0,   198,    28,   198],
       device='cuda:0')
tensor([9713,  578,  957, 9983,  351,  278, 3285,  849,   42,  339],
       device='cuda:0')


Training:  50%|█████     | 502/1000 [02:32<02:27,  3.37it/s, loss=5.26]

tensor([288, 198, 260,  28, 260,  28,  28, 198, 198,  28], device='cuda:0')
tensor([   30, 30079,   346,   351,   451,  4386,    42,   198, 17777,   623],
       device='cuda:0')


Training:  50%|█████     | 503/1000 [02:33<02:25,  3.41it/s, loss=5.13]

tensor([ 325,   28,   28,   28,  198,  198,  198,   42, 4210,  198],
       device='cuda:0')
tensor([35532,   280,  7288,   423,   198,   198,  2721,  5431,  4210,  5963],
       device='cuda:0')


Training:  50%|█████     | 504/1000 [02:33<02:25,  3.41it/s, loss=5.13]

tensor([   28, 21723,    28,   198,   198,   198,    69,  9620,   717,  2113],
       device='cuda:0')
tensor([ 957, 4033,   30,  198,  198,   52,   69, 9620,  717, 2113],
       device='cuda:0')


Training:  50%|█████     | 505/1000 [02:33<02:27,  3.36it/s, loss=5.16]

tensor([ 28, 198, 253,  28,  28, 198, 260, 198, 198,  28], device='cuda:0')
tensor([  43,  325,  346, 8048,  198, 6228,  469, 1038, 1728,   30],
       device='cuda:0')


Training:  51%|█████     | 506/1000 [02:33<02:29,  3.30it/s, loss=4.91]

tensor([  198,    28,    28,   198,   198,   198, 15604,  8772,    42,   198],
       device='cuda:0')
tensor([41288,  1055,    47,   198,   198,    60, 15604,  8772,    42,   198],
       device='cuda:0')


Training:  51%|█████     | 507/1000 [02:34<02:28,  3.32it/s, loss=5.06]

tensor([  0,  28, 339, 257, 259,  28,  28,  28, 198,  28], device='cuda:0')
tensor([ 1134,   338,   297,    23,   259, 15480,   381,   253, 48945,    30],
       device='cuda:0')


Training:  51%|█████     | 508/1000 [02:34<02:25,  3.38it/s, loss=5.05]

tensor([339, 314, 346, 198,  28, 260, 339, 314,  28, 198], device='cuda:0')
tensor([ 384, 1250,  260, 2775,  327,  527,  384, 4242,   42,  198],
       device='cuda:0')


Training:  51%|█████     | 509/1000 [02:34<02:23,  3.43it/s, loss=5.42]

tensor([ 28, 198, 549,  28,  28, 198, 198, 260,  28, 198], device='cuda:0')
tensor([   42,  1303,  1272,  1690,    28,   198,   397, 25128,    30,   198],
       device='cuda:0')


Training:  51%|█████     | 510/1000 [02:35<02:20,  3.49it/s, loss=5.94]

tensor([ 28,  28, 198, 198,  28,  28, 198,  28,  28, 260], device='cuda:0')
tensor([8263,   42,  198, 5195, 2294,   28, 2662, 1272,  288,  260],
       device='cuda:0')


Training:  51%|█████     | 511/1000 [02:35<02:17,  3.54it/s, loss=5.25]

tensor([ 28,  28, 198, 198, 339, 260, 339,  28, 325,  28], device='cuda:0')
tensor([ 2223,    28,   198,  3528,   418,   338,  6968,  3786,  8177, 17600],
       device='cuda:0')


Training:  51%|█████     | 512/1000 [02:35<02:17,  3.55it/s, loss=5.59]

tensor([   28,   198,  6426,    28,    28, 21723,   253,   198, 21723,    28],
       device='cuda:0')
tensor([ 198,   55, 1134,  549,  957, 1036,  284,  957, 4083,  808],
       device='cuda:0')


Training:  51%|█████▏    | 513/1000 [02:36<02:17,  3.55it/s, loss=5.42]

tensor([ 28,  28, 282, 198, 198,  28,  28, 260, 198, 260], device='cuda:0')
tensor([  318,   578,    42,   653, 24641, 14517,   335,   198, 15135,  5882],
       device='cuda:0')


Training:  51%|█████▏    | 514/1000 [02:36<02:18,  3.50it/s, loss=5.42]

tensor([  28,   28,   28,  198,  260, 3060,  253,  314,  282,  198],
       device='cuda:0')
tensor([  293, 20322,    28,   327,   339,   457,   357,  2073,    30,   198],
       device='cuda:0')


Training:  52%|█████▏    | 515/1000 [02:36<02:19,  3.47it/s, loss=5.67]

tensor([549, 198,   0,  28,  28, 260, 198,  28, 198, 339], device='cuda:0')
tensor([  260,   412, 34315, 16592,  1980,   469, 10166,   198,  3528,   856],
       device='cuda:0')


Training:  52%|█████▏    | 516/1000 [02:36<02:19,  3.48it/s, loss=5.67]

tensor([  28,  198,  198,  198,    0,    0, 8772,   42,  198,  198],
       device='cuda:0')
tensor([   30,   198,   198,    56, 13110, 22239,  8772,    42,   198,  2696],
       device='cuda:0')


Training:  52%|█████▏    | 517/1000 [02:37<02:21,  3.41it/s, loss=5.3] 

tensor([  28,   28,   28,   28,  260,  198,  198,  198, 2810, 4501],
       device='cuda:0')
tensor([ 1441,   441,   874,   288,   423,   198,   198, 10895,  2810,  8772],
       device='cuda:0')


Training:  52%|█████▏    | 518/1000 [02:37<02:22,  3.37it/s, loss=5.19]

tensor([  198,   198,  4728, 33256,    42,    42,   198,   198,    28,   198],
       device='cuda:0')
tensor([  198,    59, 12455, 33256,  1933,    42,   198,  1780,    28,   523],
       device='cuda:0')


Training:  52%|█████▏    | 519/1000 [02:37<02:22,  3.38it/s, loss=5.19]

tensor([253, 314, 339, 314, 260,  28, 198, 198, 260, 260], device='cuda:0')
tensor([ 357,  338, 1041, 6753,  549,   28,  198,  788,  511,  957],
       device='cuda:0')


Training:  52%|█████▏    | 520/1000 [02:38<02:24,  3.33it/s, loss=5.29]

tensor([   28,    28,   198,   198,   198,  4728, 33256,    42,    42,   198],
       device='cuda:0')
tensor([12575,    30,   198,   198,    59, 12455, 33256,  1933,    42,   198],
       device='cuda:0')


Training:  52%|█████▏    | 521/1000 [02:38<02:25,  3.30it/s, loss=5.34]

tensor([6737,   28,  198,   42,    0,   28,  198,  314,   28,  314],
       device='cuda:0')
tensor([   91,    28,  9133,   490, 44004,    43,  1041,  1921,  1041,  3060],
       device='cuda:0')


Training:  52%|█████▏    | 522/1000 [02:38<02:24,  3.32it/s, loss=5.34]

tensor([ 28, 339, 198,  28,  28, 198, 314, 253, 198, 198], device='cuda:0')
tensor([ 564,  253, 8150,  301,   42,  384,  314, 1573,   28,  339],
       device='cuda:0')


Training:  52%|█████▏    | 523/1000 [02:38<02:26,  3.25it/s, loss=5.49]

tensor([  28,   28,    0,   28,  198,  198,  198, 2810, 8772,   42],
       device='cuda:0')
tensor([27415,   581,  1119,    30,   198,   198, 11718,  2810,  8970,    42],
       device='cuda:0')


Training:  52%|█████▏    | 524/1000 [02:39<02:27,  3.22it/s, loss=5.41]

tensor([   28,    28,   339,   198,   198,   198, 46634,  5229,  8772,    42],
       device='cuda:0')
tensor([  346,   592,    30,   198,   198, 43029, 46634,  5229,  8772,    42],
       device='cuda:0')


Training:  52%|█████▎    | 525/1000 [02:39<02:26,  3.25it/s, loss=5.3] 

tensor([314,  28, 198,   0,  28,  28,  28, 198, 198, 198], device='cuda:0')
tensor([  280,   260,  1079,   302, 12471,  1238,    30,   198,   198, 43029],
       device='cuda:0')


Training:  53%|█████▎    | 526/1000 [02:39<02:26,  3.24it/s, loss=5.04]

tensor([ 42,  28, 339, 314, 253,  28, 198, 198, 198, 198], device='cuda:0')
tensor([ 2945,   347,  1041,   506,  3213,    47,   198,   198, 13087,  8292],
       device='cuda:0')


Training:  53%|█████▎    | 527/1000 [02:40<02:26,  3.24it/s, loss=5.34]

tensor([ 28, 198, 540,  28, 198, 540,  28, 198, 540,  28], device='cuda:0')
tensor([   28,   787, 16981,    47,   787,  2234,    47,   787,  9060,    47],
       device='cuda:0')


Training:  53%|█████▎    | 528/1000 [02:40<02:23,  3.29it/s, loss=5.34]

tensor([ 28,  28, 198,  28, 260, 198,  28, 198, 198, 198], device='cuda:0')
tensor([1489,  260, 5710,  282,  650, 3229,   30,  198,  198,   60],
       device='cuda:0')


Training:  53%|█████▎    | 529/1000 [02:40<02:23,  3.29it/s, loss=4.95]

tensor([198, 198,   0,  28,  28, 260,  28,  28, 198, 198], device='cuda:0')
tensor([  198,    64, 14765, 14104,   327, 14553,  4240,    30,   198,   198],
       device='cuda:0')


Training:  53%|█████▎    | 530/1000 [02:41<02:23,  3.27it/s, loss=4.96]

tensor([325,  28, 198, 198,  28,  28, 260,  28, 325, 198], device='cuda:0')
tensor([2373,   28,  198, 3528, 2161,  670,  357, 3786,   28,  339],
       device='cuda:0')


Training:  53%|█████▎    | 531/1000 [02:41<02:24,  3.24it/s, loss=5.08]

tensor([  95,   28,   28,   28, 4197,   28,  198,  198,   42,  198],
       device='cuda:0')
tensor([ 277, 2574, 9289,  260, 3323,   47,  198,   63,   28,  787],
       device='cuda:0')


Training:  53%|█████▎    | 532/1000 [02:41<02:23,  3.27it/s, loss=5.06]

tensor([ 325,  198,   42,   28, 4197,   28,  339,   28,  260,  198],
       device='cuda:0')
tensor([ 198, 5812,  620,  260, 2455,  355, 5842,  282,  650, 4120],
       device='cuda:0')


Training:  53%|█████▎    | 533/1000 [02:42<02:20,  3.32it/s, loss=5.06]

tensor([  198,  4197,    28,   260, 21723,    28,   198,   314,    28,    42],
       device='cuda:0')
tensor([  504, 10243,   288,   957,  6221,    28,   451,  9154, 24626,    28],
       device='cuda:0')


Training:  53%|█████▎    | 534/1000 [02:42<02:21,  3.30it/s, loss=5]   

tensor([  95,   28,  198,  198,  198, 7113,   51, 6213, 8772,   42],
       device='cuda:0')
tensor([1119,   30,  198,  198,   70, 2113,   51, 6213, 8772,   42],
       device='cuda:0')


Training:  54%|█████▎    | 535/1000 [02:42<02:22,  3.26it/s, loss=4.89]

tensor([ 28,  28, 260, 339, 198, 198, 198,  28,  28, 198], device='cuda:0')
tensor([4972,  411,  338,   30,  198,  198, 9852,  323,   42,  198],
       device='cuda:0')


Training:  54%|█████▎    | 536/1000 [02:42<02:22,  3.25it/s, loss=4.65]

tensor([ 95,  28,  28,  30, 198, 198,  28,  42,  28, 260], device='cuda:0')
tensor([  105,  7557, 28823,    43,   198,  4809, 12724,   541,   288,  1372],
       device='cuda:0')


Training:  54%|█████▎    | 537/1000 [02:43<02:20,  3.29it/s, loss=5.7] 

tensor([ 198,  198,   28,   28,   28,   28,  198,  198,  339, 4197],
       device='cuda:0')
tensor([  198, 23641,  1457,   103,   402,    42,   198,  2427,   260,  3426],
       device='cuda:0')


Training:  54%|█████▍    | 538/1000 [02:43<02:17,  3.37it/s, loss=5.66]

tensor([ 28,  28,  28, 198,  28,  28,  28,  28, 198, 198], device='cuda:0')
tensor([ 500,  441,  198, 9142, 1296,  929, 1573,   30,  198,  198],
       device='cuda:0')


Training:  54%|█████▍    | 539/1000 [02:43<02:14,  3.43it/s, loss=5.79]

tensor([  28,  346,  506,   28,  198, 4197,   28,  260,   28,  198],
       device='cuda:0')
tensor([ 1250, 19002,  1440,   198,   504, 17816,   282, 28676,    28,   284],
       device='cuda:0')


Training:  54%|█████▍    | 540/1000 [02:44<02:12,  3.48it/s, loss=5.68]

tensor([ 28,  30,  28, 198, 198, 260, 198,  28,  28,  28], device='cuda:0')
tensor([35853,  6737,    43,   198,  3280,   480, 24276, 22566,   441,   253],
       device='cuda:0')


Training:  54%|█████▍    | 541/1000 [02:44<02:12,  3.47it/s, loss=5.68]

tensor([ 28,  30, 198,  30, 198,  28, 260,  28, 198, 198], device='cuda:0')
tensor([  982,    42,   685,    28,  8913,   351, 33327,    17,   198,    49],
       device='cuda:0')


Training:  54%|█████▍    | 542/1000 [02:44<02:12,  3.45it/s, loss=5.37]

tensor([  28,   28,   28,  198,   28, 6737,  198,    0,   28,  198],
       device='cuda:0')
tensor([ 318, 8233,  650, 5334,  478,  198,   56,  792,   17, 1209],
       device='cuda:0')


Training:  54%|█████▍    | 543/1000 [02:44<02:15,  3.36it/s, loss=5.37]

tensor([6737,   28,  260,   28,  198,  314,  260,   42,   28,  198],
       device='cuda:0')
tensor([5674,  411, 4778,   42,  451,  314, 7468,  258, 1132,  198],
       device='cuda:0')


Training:  54%|█████▍    | 544/1000 [02:45<02:17,  3.32it/s, loss=5.51]

tensor([ 42, 346,  28, 198,  28, 198,  28, 346,  28, 198], device='cuda:0')
tensor([ 9725,    95,    17, 38467,    28, 23632,  9725,    95,    30,   198],
       device='cuda:0')


Training:  55%|█████▍    | 545/1000 [02:45<02:16,  3.33it/s, loss=5.44]

tensor([ 28,  42, 198, 198, 346, 457,  28, 198,  30, 198], device='cuda:0')
tensor([32062,    42,   198,  6121,   392,  7219,   750,  2030,    28,  4875],
       device='cuda:0')


Training:  55%|█████▍    | 546/1000 [02:45<02:12,  3.44it/s, loss=5.39]

tensor([  198,   198,   100,    28,   198, 41003,    42,  6737,   198,   260],
       device='cuda:0')
tensor([1248,  506, 2249,  198,   66, 2713,  478, 6737, 1523,  260],
       device='cuda:0')


Training:  55%|█████▍    | 547/1000 [02:46<02:11,  3.43it/s, loss=5.39]

tensor([ 30,  28,  28, 346, 260, 555,  28, 198, 198,  28], device='cuda:0')
tensor([8427,  346,  536, 1363,  253, 1945,   28,  198, 3528,  946],
       device='cuda:0')


Training:  55%|█████▍    | 548/1000 [02:46<02:12,  3.42it/s, loss=5.52]

tensor([  28,  314, 6737,  198,  260, 4197,   28,  260,  555,   28],
       device='cuda:0')
tensor([  384,  1477,   198,  2068,   260, 28382,   282,   253,   555,    28],
       device='cuda:0')


Training:  55%|█████▍    | 549/1000 [02:46<02:11,  3.43it/s, loss=5.52]

tensor([2810, 6426, 6426,   42,  198,  198,  198,   28,   28,  198],
       device='cuda:0')
tensor([   55,  4192,  7854,    42,   198, 11952, 23271,  8710,    17,   492],
       device='cuda:0')


Training:  55%|█████▌    | 550/1000 [02:47<02:12,  3.39it/s, loss=5.38]

tensor([ 28, 198, 198, 198,  28,  42, 198, 198,  28, 198], device='cuda:0')
tensor([   47,   198,   198,  5345, 22152,    42,   198,  5230,    28,  5277],
       device='cuda:0')


Training:  55%|█████▌    | 551/1000 [02:47<02:10,  3.44it/s, loss=5.19]

tensor([198, 260,  28, 198,   0,  28,  28, 198, 198, 325], device='cuda:0')
tensor([  351, 30291,   198,    86, 27045, 26927,    28,   198,  9389,  1928],
       device='cuda:0')


Training:  55%|█████▌    | 552/1000 [02:47<02:08,  3.48it/s, loss=5.33]

tensor([   28,   198,   260,   457,   346,    28,    28,   198, 20055,    28],
       device='cuda:0')
tensor([   43,   327,   339,   536, 13735, 20322,   198,    71, 13231,   670],
       device='cuda:0')


Training:  55%|█████▌    | 553/1000 [02:47<02:09,  3.45it/s, loss=5.33]

tensor([ 28,  28, 198, 260, 339, 457, 325,  28, 198, 198], device='cuda:0')
tensor([ 1450,    28,   351,  5337,   392,   654, 24575,    28,   198,  2193],
       device='cuda:0')


Training:  55%|█████▌    | 554/1000 [02:48<02:12,  3.36it/s, loss=4.94]

tensor([29086,    28,   198,   198,  6737,    28,   198,    28,   198,    28],
       device='cuda:0')
tensor([ 274,   28,  198,   96, 8854,   28, 9869,   28, 1028,  105],
       device='cuda:0')


Training:  56%|█████▌    | 555/1000 [02:48<02:14,  3.32it/s, loss=5.61]

tensor([ 28, 253,  28,  28, 198,   0,   0,  28, 260, 260], device='cuda:0')
tensor([  457,   719,   588,   198,    86, 21482,  3447,   327,   511,   260],
       device='cuda:0')


Training:  56%|█████▌    | 556/1000 [02:48<02:12,  3.35it/s, loss=5.38]

tensor([  325,   198, 15604,   198,   198,   314,   253,    28,   198,   198],
       device='cuda:0')
tensor([  198,    60,   602,   967,   384, 26365,  3763,    30,   198,   198],
       device='cuda:0')


Training:  56%|█████▌    | 557/1000 [02:49<02:10,  3.40it/s, loss=5.37]

tensor([ 28, 198, 198, 260,  28, 198, 198, 198,  42, 198], device='cuda:0')
tensor([   28,   198, 38984,   346,    28,   198,  6101,   298,  2679,   268],
       device='cuda:0')


Training:  56%|█████▌    | 558/1000 [02:49<02:08,  3.43it/s, loss=5.04]

tensor([  42,   42,  198,  198,  198, 2286,  198,   28,  198,  198],
       device='cuda:0')
tensor([ 1508,    42,   198, 12800,  4618,  2286, 23944,    30,   198,   198],
       device='cuda:0')


Training:  56%|█████▌    | 559/1000 [02:49<02:07,  3.47it/s, loss=4.93]

tensor([4501,  198,   42,   42,  198,  198,  346,  260,   28,  198],
       device='cuda:0')
tensor([ 7113,  2810,  4501,    42,   198, 16817,  1980,   549,    17, 12193],
       device='cuda:0')


Training:  56%|█████▌    | 560/1000 [02:50<02:06,  3.48it/s, loss=5.14]

tensor([ 42, 198,  30, 198, 198,  28,  28, 260, 198,  28], device='cuda:0')
tensor([  482,  1163,    28,   198,  9302,  1185,   288,   260, 44890,    29],
       device='cuda:0')


Training:  56%|█████▌    | 561/1000 [02:50<02:06,  3.46it/s, loss=5.14]

tensor([4628, 4501,   42,  198,  198,   28,  198,  260,   28,   28],
       device='cuda:0')
tensor([ 4628,  4501,    42,   198, 45496,    28, 20172,   468,   441,    30],
       device='cuda:0')


Training:  56%|█████▌    | 562/1000 [02:50<02:08,  3.41it/s, loss=4.85]

tensor([2680,   57, 4501,   42,  198,  198,  198,   28,  198,   28],
       device='cuda:0')
tensor([ 2113,    57,  4501,    42,   198,   504,   701,   359, 23909,    43],
       device='cuda:0')


Training:  56%|█████▋    | 563/1000 [02:50<02:10,  3.35it/s, loss=5.1] 

tensor([ 28,  28, 198,  42, 260, 198, 198, 198,  28, 198], device='cuda:0')
tensor([ 1072,   510, 46823,   351,  1728,    28,   260,   550,   198, 20181],
       device='cuda:0')


Training:  56%|█████▋    | 564/1000 [02:51<02:13,  3.27it/s, loss=4.79]

tensor([  28,   28,   28,  198,  198,  198,  198,   57, 4501,   42],
       device='cuda:0')
tensor([  346,  7270,    30,   198,   198,  4105, 23317,    57,  4501,    42],
       device='cuda:0')


Training:  56%|█████▋    | 565/1000 [02:51<02:13,  3.27it/s, loss=4.83]

tensor([ 198,  198,  198,  198, 3020, 2113,   57, 4501,   42,  198],
       device='cuda:0')
tensor([ 423,  198,  198,   67, 3020, 2113,   57, 4501,   42,  198],
       device='cuda:0')


Training:  57%|█████▋    | 566/1000 [02:51<02:10,  3.34it/s, loss=5.04]

tensor([   28,   260,   198,     0,    28,   260, 34818,    28,   198,   198],
       device='cuda:0')
tensor([5344,  253, 3561,  839, 1980,  637,  391,   28,  198, 2193],
       device='cuda:0')


Training:  57%|█████▋    | 567/1000 [02:52<02:08,  3.36it/s, loss=5.04]

tensor([ 28,  28, 457, 198,  42,  28, 198, 260,  28, 260], device='cuda:0')
tensor([  338,   392,   457, 13784,  1915,   198,  7143,  1187,   411,   260],
       device='cuda:0')


Training:  57%|█████▋    | 568/1000 [02:52<02:09,  3.34it/s, loss=5.14]

tensor([ 28, 260,  28, 198, 198, 260, 198, 260, 198, 198], device='cuda:0')
tensor([  281,  1272,  5721,    28,   429,   655,   288,   655,   198, 38110],
       device='cuda:0')


Training:  57%|█████▋    | 569/1000 [02:52<02:07,  3.39it/s, loss=5.14]

tensor([325,  28, 198, 198,  42, 260,  28, 198,  28,  28], device='cuda:0')
tensor([ 441, 5091,  198,   63,   23,  259,  260, 4749,  905,  288],
       device='cuda:0')


Training:  57%|█████▋    | 570/1000 [02:52<02:09,  3.33it/s, loss=4.69]

tensor([  42,  198,  198, 4197,   28,    0,  198,  260,   28,  260],
       device='cuda:0')
tensor([  42,  198,  504, 1085,  754, 9699,  282,  357,  314, 1896],
       device='cuda:0')


Training:  57%|█████▋    | 571/1000 [02:53<02:08,  3.33it/s, loss=5.06]

tensor([   28,   198,    28,   325,    28,   198,    28, 29086,    28,   198],
       device='cuda:0')
tensor([   28,   346,   523,   441,    47,   377,   608, 29086,    28,  2505],
       device='cuda:0')


Training:  57%|█████▋    | 572/1000 [02:53<02:08,  3.34it/s, loss=5.12]

tensor([  346,    28,    28,   260,   198,   198,    42,   198, 37184,    28],
       device='cuda:0')
tensor([ 2745,  3895,   351,  3878,    30,   408, 19550,  2828, 37184,    28],
       device='cuda:0')


Training:  57%|█████▋    | 573/1000 [02:53<02:07,  3.36it/s, loss=5.12]

tensor([198, 198,  28,  28, 198, 253,  42,  28,  28,  28], device='cuda:0')
tensor([ 198, 7993, 5612,   28,  702,  345,  370,  990, 5249,   28],
       device='cuda:0')


Training:  57%|█████▋    | 574/1000 [02:54<02:06,  3.36it/s, loss=4.86]

tensor([  28,  198,  198,  198,  198, 9333,   42,  198,  198,   28],
       device='cuda:0')
tensor([ 1378,    30,   198,   198, 38371,  9333,    42,   198, 10539,    28],
       device='cuda:0')


Training:  57%|█████▊    | 575/1000 [02:54<02:05,  3.38it/s, loss=4.86]

tensor([  28,  198,   28,   28,  198,  198,  198, 3020, 2113,   57],
       device='cuda:0')
tensor([  28,  732, 1745,   47,  198,  198,   67, 3020, 2113,   57],
       device='cuda:0')


Training:  58%|█████▊    | 576/1000 [02:54<02:05,  3.37it/s, loss=4.95]

tensor([198,  28, 198, 198,  28, 260,  28, 253, 198, 198], device='cuda:0')
tensor([2428,   42,  650, 2988,  288,  549,  436,   28,  198, 3681],
       device='cuda:0')


Training:  58%|█████▊    | 577/1000 [02:55<02:06,  3.35it/s, loss=5.17]

tensor([198, 198, 457,  28, 198,  42, 198,  28,  28, 198], device='cuda:0')
tensor([ 198, 2683,  359,  253, 4469,   28,  359,  346,   47,  198],
       device='cuda:0')


Training:  58%|█████▊    | 578/1000 [02:55<02:02,  3.43it/s, loss=5.31]

tensor([ 28, 325,  28, 198, 198, 260,  28,  28,  28, 198], device='cuda:0')
tensor([ 2526,  1209,  2630,    43,   288, 10297,  1272,   805,   198,  5195],
       device='cuda:0')


Training:  58%|█████▊    | 579/1000 [02:55<02:01,  3.47it/s, loss=5.32]

tensor([ 28,  28, 198,  28, 260, 198, 457,  28, 198, 260], device='cuda:0')
tensor([3032,  198, 5195,  511,  564,  392, 2815,   43,  327,  638],
       device='cuda:0')


Training:  58%|█████▊    | 580/1000 [02:55<01:59,  3.52it/s, loss=5.04]

tensor([ 198,  198, 2680,   28,   28,  260,   28,   28,  198,   28],
       device='cuda:0')
tensor([  198,    61,   462,  3497,   288, 16945,  8047,    30,  1249,    28],
       device='cuda:0')


Training:  58%|█████▊    | 581/1000 [02:56<01:59,  3.52it/s, loss=5.04]

tensor([6737,  198,  198,  198,   95,   28,  260,   28,  198,  198],
       device='cuda:0')
tensor([6737,  284,  198, 4038, 1390,  288, 1972, 1092,  260,  701],
       device='cuda:0')


Training:  58%|█████▊    | 582/1000 [02:56<02:00,  3.47it/s, loss=5.08]

tensor([ 6737,    28,    28,   198,   198,  4501,    28, 34818,   198,    28],
       device='cuda:0')
tensor([ 2240,  7268,    28,   198,    57,  1643,   637, 15006,  2240,  7904],
       device='cuda:0')


Training:  58%|█████▊    | 583/1000 [02:56<02:01,  3.43it/s, loss=5.22]

tensor([ 325,  253,  198,   28,  198,  198,  260, 6737,  198,   28],
       device='cuda:0')
tensor([ 6824,   253, 26156,    28,   198,  2068,  7471,   623,  3506,   876],
       device='cuda:0')


Training:  58%|█████▊    | 584/1000 [02:57<02:00,  3.46it/s, loss=5.29]

tensor([ 30, 260,  28, 198, 198, 346,  28, 441, 198, 198], device='cuda:0')
tensor([  288, 10419,    28,   198,  1937, 10419,   523,  1188,   260,  1386],
       device='cuda:0')


Training:  58%|█████▊    | 585/1000 [02:57<02:00,  3.45it/s, loss=5.29]

tensor([ 28, 198, 260, 506, 100,  28, 198,  28, 198, 260], device='cuda:0')
tensor([   28,   327,  1839,   506, 15215,    28,  8913,    28,   284,  7241],
       device='cuda:0')


Training:  59%|█████▊    | 586/1000 [02:57<02:01,  3.40it/s, loss=5.2] 

tensor([41003,   198,   198,  6213,    42,   198,   462,   482,   198,   198],
       device='cuda:0')
tensor([   42,   198,    51, 10942,   441,   295,   672,   482,    28,  3506],
       device='cuda:0')


Training:  59%|█████▊    | 587/1000 [02:58<02:02,  3.37it/s, loss=5.14]

tensor([  28,  198,   28,   28,  260,   28,  198,  198,  198, 6426],
       device='cuda:0')
tensor([  28,  685, 1683,  351,  549,   30,  198,  198,   55, 6426],
       device='cuda:0')


Training:  59%|█████▉    | 588/1000 [02:58<02:00,  3.43it/s, loss=5.18]

tensor([  28,  198,  198,  198,  346,  457,  325, 2494,  198,  260],
       device='cuda:0')
tensor([1970,   30,  198, 8653,  339, 2526,  832, 2494,  284, 3287],
       device='cuda:0')


Training:  59%|█████▉    | 589/1000 [02:58<01:59,  3.45it/s, loss=5.16]

tensor([   0,   28,  198,  198,  198,  198, 2680, 2680,  198, 6426],
       device='cuda:0')
tensor([44228,    17,   423,   198,   198, 41074,  2680, 35135,    55,  2285],
       device='cuda:0')


Training:  59%|█████▉    | 590/1000 [02:58<01:58,  3.47it/s, loss=5.02]

tensor([   0,   28,  198,  198,   28,  198,   28, 2608,   28,  260],
       device='cuda:0')
tensor([  381,    28,   198,  3528,  1188,  2276,   635,  5585,   327, 13987],
       device='cuda:0')


Training:  59%|█████▉    | 591/1000 [02:59<01:55,  3.53it/s, loss=5.17]

tensor([   28,   260,    28,   198,   198,   198,  2680,  2680, 25630,    42],
       device='cuda:0')
tensor([  288,   874,    30,   198,   198, 41074,  2680, 19642, 25630,  5431],
       device='cuda:0')


Training:  59%|█████▉    | 592/1000 [02:59<01:54,  3.55it/s, loss=5.3] 

tensor([ 28,  28,  28, 260,  28, 198, 198,  28, 198,  28], device='cuda:0')
tensor([4888, 6737,  990, 1029,   43,  198,   63,   28,  965, 2585],
       device='cuda:0')


Training:  59%|█████▉    | 593/1000 [02:59<01:55,  3.54it/s, loss=5.3]

tensor([  198,    28,   549,    28,   198, 34818,   253,   198,     0,    28],
       device='cuda:0')
tensor([ 6836, 29820,  1272,    42,   637, 34818,   253,   754, 10934, 15786],
       device='cuda:0')


Training:  59%|█████▉    | 594/1000 [03:00<01:55,  3.51it/s, loss=4.88]

tensor([   28, 21723,    28,   198,    28,    28,    28,   198,   198,  4501],
       device='cuda:0')
tensor([  957,  5717,    28, 13735,   441,   549,    43,   198,    57,   744],
       device='cuda:0')


Training:  60%|█████▉    | 595/1000 [03:00<01:53,  3.56it/s, loss=5.07]

tensor([  198, 20055,    42,    42,   198,   198,    28,   198,   572,    28],
       device='cuda:0')
tensor([   71, 20055,  9536,    42,   198,  2696,    28,  4074,   572, 45927],
       device='cuda:0')


Training:  60%|█████▉    | 596/1000 [03:00<01:53,  3.56it/s, loss=5.13]

tensor([ 339,  198,   28,   28,   28,  198, 6737,   93,    0,  282],
       device='cuda:0')
tensor([ 469, 1076, 1191,  355,  469, 7003,   29,  102,  520,  781],
       device='cuda:0')


Training:  60%|█████▉    | 597/1000 [03:00<01:52,  3.57it/s, loss=5.03]

tensor([ 6737, 21723,    28,    28,    28,   198,   198,   198,    69,  9620],
       device='cuda:0')
tensor([  653, 14119, 21723, 48384,    17,   198,   198,    52,    69,  5229],
       device='cuda:0')


Training:  60%|█████▉    | 598/1000 [03:01<01:53,  3.53it/s, loss=5.03]

tensor([ 28,  28, 260,  28, 282,  28, 100,  28, 198, 198], device='cuda:0')
tensor([2205,  282, 1123, 3261, 9446,  506, 2112,   47,  198,  198],
       device='cuda:0')


Training:  60%|█████▉    | 599/1000 [03:01<01:55,  3.47it/s, loss=5.05]

tensor([ 198,  198,   69, 9620, 8772,  198,  718,   42,   42,  198],
       device='cuda:0')
tensor([  198,    52,    69,  5229,  9248,  6016,   718, 25089,    42,   198],
       device='cuda:0')


Training:  60%|██████    | 600/1000 [03:01<01:55,  3.47it/s, loss=5.05]

tensor([506, 198, 198,  28, 282, 346, 457, 260, 260,  28], device='cuda:0')
tensor([   30,   198,    63,   926,   457,   339,  3984,   282, 21683,  1800],
       device='cuda:0')


Training:  60%|██████    | 601/1000 [03:02<01:55,  3.46it/s, loss=4.9] 

tensor([21723,    28,    28,   260,    30,    28,   198,   198, 21723,    42],
       device='cuda:0')
tensor([22534, 48384,    23, 12575, 15534,    42,   198,  5965,  5295,   332],
       device='cuda:0')


Training:  60%|██████    | 602/1000 [03:02<01:54,  3.48it/s, loss=4.87]

tensor([ 198, 2097,   42,   42,  198,  198,  253,   28,  720,   28],
       device='cuda:0')
tensor([39813,  2097, 25466,    42,   198,  5315,  1123,  2552,  5612,   288],
       device='cuda:0')


Training:  60%|██████    | 603/1000 [03:02<01:53,  3.49it/s, loss=4.87]

tensor([ 6737,    28, 21723,    28,    28,   198,   198,   198, 41003,     0],
       device='cuda:0')
tensor([  258,   469, 31923,   974,    30,   198,   198,    66,  2062,  7430],
       device='cuda:0')


Training:  60%|██████    | 604/1000 [03:02<01:54,  3.44it/s, loss=4.85]

tensor([   28,   441,    28,    28, 21723,    28,   260,    28,  6737,   198],
       device='cuda:0')
tensor([  536, 14053,   609,   957,  2112,   351, 24478,   621, 13056,   198],
       device='cuda:0')


Training:  60%|██████    | 605/1000 [03:03<01:55,  3.43it/s, loss=5.21]

tensor([   28,    28,   198,   198,  4197,    30,   260,    28,   260, 21723],
       device='cuda:0')
tensor([1671,   28,  198,  504, 4033,  282, 3996,  284,  653, 6349],
       device='cuda:0')


Training:  61%|██████    | 606/1000 [03:03<01:52,  3.51it/s, loss=5.09]

tensor([  198,   260, 21723,   282,    28,   198,   198,   260,  4197,    28],
       device='cuda:0')
tensor([23487,   469,  1761, 12630,    28,   198, 16721,   260,  1048,  2359],
       device='cuda:0')


Training:  61%|██████    | 607/1000 [03:03<01:49,  3.57it/s, loss=5.33]

tensor([  42,  198,  198, 4501,  325,  441, 1992,  322,  346,   28],
       device='cuda:0')
tensor([   42,   198,    57,  2161,   536, 13168,  1992,   322,   346, 44228],
       device='cuda:0')


Training:  61%|██████    | 608/1000 [03:03<01:49,  3.58it/s, loss=5.33]

tensor([   28,   198,   198,   198, 15604,   198,  4728,   198,   679,    42],
       device='cuda:0')
tensor([   30,   198,   198,    50, 15604,    59,  4728,    56,  4153,    42],
       device='cuda:0')


Training:  61%|██████    | 609/1000 [03:04<01:51,  3.50it/s, loss=4.77]

tensor([  42,   28,   28,  260, 4197,   28,  198,  198,  279,   28],
       device='cuda:0')
tensor([ 2767, 20322,   418,   260, 29289,    43,   198,  1082,   105,  3379],
       device='cuda:0')


Training:  61%|██████    | 610/1000 [03:04<01:53,  3.45it/s, loss=4.96]

tensor([260,  28, 198,  28, 198, 198,  28,  93,   0,  28], device='cuda:0')
tensor([ 1771,   368,  3497,    42,   198, 12192,    29, 23724, 45927,  8759],
       device='cuda:0')


Training:  61%|██████    | 611/1000 [03:04<01:55,  3.37it/s, loss=5.29]

tensor([  28,  339,   28,  441,  339,  540,  198,  198,  198, 4728],
       device='cuda:0')
tensor([1991,  346,  523,  355,  787,   30,  198,  198,   59, 4728],
       device='cuda:0')


Training:  61%|██████    | 612/1000 [03:05<01:52,  3.46it/s, loss=5.31]

tensor([   28,   198,   198, 20055,    28,   100,    28,   198,    28,   100],
       device='cuda:0')
tensor([   28,   198,    71,  3592,   506,  6621,    28,   905,   506, 15786],
       device='cuda:0')


Training:  61%|██████▏   | 613/1000 [03:05<01:51,  3.48it/s, loss=5.31]

tensor([6737,    0,   28,  198,  198,   28,  260,   28,  339,  260],
       device='cuda:0')
tensor([   88,   552,    28,   198,  3528,  3727,   549,   638,   288, 27397],
       device='cuda:0')


Training:  61%|██████▏   | 614/1000 [03:05<01:52,  3.44it/s, loss=4.75]

tensor([   28,    28,   282,    28,   260,  4197, 21723,    28,   198,   339],
       device='cuda:0')
tensor([  787,   540, 11393,   282,   260,  9202,  1670,   198,  2193, 20322],
       device='cuda:0')


Training:  62%|██████▏   | 615/1000 [03:06<01:51,  3.47it/s, loss=5.04]

tensor([21723,   282,   282,    28,    28,    28,   260,    28,   198,   198],
       device='cuda:0')
tensor([16087,   358,  3075,   874,   441,   288,  2606,    28,   198, 29752],
       device='cuda:0')


Training:  62%|██████▏   | 616/1000 [03:06<01:51,  3.45it/s, loss=5.04]

tensor([   28,    28,   198,   260, 21723,    28,   198,   198,   198,  4728],
       device='cuda:0')
tensor([10278,    28,   281,   480, 14553,    30,   198,   198,    59,  4728],
       device='cuda:0')


Training:  62%|██████▏   | 617/1000 [03:06<01:51,  3.44it/s, loss=4.86]

tensor([  28,  198,  198,  198, 2097,   42,   42,  198,  198,   28],
       device='cuda:0')
tensor([   47,   198,   198, 39813,  2097, 25466,    42,   198,  5345,    28],
       device='cuda:0')


Training:  62%|██████▏   | 618/1000 [03:06<01:50,  3.47it/s, loss=5.36]

tensor([ 28, 198,  28, 260, 198,  28, 457, 441,  28, 260], device='cuda:0')
tensor([   28, 14067,   335,    28,  1675,   392,   359,   614,   281,  6548],
       device='cuda:0')


Training:  62%|██████▏   | 619/1000 [03:07<01:49,  3.47it/s, loss=5.36]

tensor([  549,   260,   260,   198, 21723, 21723,    28,   198,   198,   198],
       device='cuda:0')
tensor([  284,  1188,    28,   957, 47910, 21723,    30,   198,   198,    59],
       device='cuda:0')


Training:  62%|██████▏   | 620/1000 [03:07<01:50,  3.45it/s, loss=5.21]

tensor([  28,   28,  260, 4197,   28,   28,  198,   42,  260,   28],
       device='cuda:0')
tensor([18266,   284,   260, 27508,   655,   198,    51,  7380,   767,   260],
       device='cuda:0')


Training:  62%|██████▏   | 621/1000 [03:07<01:48,  3.48it/s, loss=5.06]

tensor([ 28, 198, 338, 339, 457,  28, 260, 198, 339, 325], device='cuda:0')
tensor([   28,   967,  5337,   392,  3568,  1523,   198, 39855,  2161,   457],
       device='cuda:0')


Training:  62%|██████▏   | 622/1000 [03:08<01:47,  3.51it/s, loss=5.37]

tensor([  28,    0,   28,  198,  339,  457, 2593, 6737,   28,   28],
       device='cuda:0')
tensor([  267, 30250,   198,  5519,   339,   475,  2593,  6737,   767,    28],
       device='cuda:0')


Training:  62%|██████▏   | 623/1000 [03:08<01:45,  3.57it/s, loss=5.47]

tensor([  260,   260,    28,   260,    28,   260, 21723,    28,    28,   198],
       device='cuda:0')
tensor([ 4145, 46160,   284, 42521,   288,   957,  1904,  4778,    43,   198],
       device='cuda:0')


Training:  62%|██████▏   | 624/1000 [03:08<01:43,  3.64it/s, loss=5.4] 

tensor([  28,   28,  555,   28,   28,  198,  288,  260, 4197,  282],
       device='cuda:0')
tensor([ 7576,   253, 10001, 29562,   198,  8155,   281,   260,  1450,  1670],
       device='cuda:0')


Training:  62%|██████▎   | 625/1000 [03:08<01:43,  3.64it/s, loss=5.37]

tensor([   0,  198,  198,    0,   28,   28,  339,  260, 6737,   28],
       device='cuda:0')
tensor([   28,   198, 25885, 26429, 12101,   638,   288, 47850, 20322,    42],
       device='cuda:0')


Training:  63%|██████▎   | 626/1000 [03:09<01:42,  3.66it/s, loss=5.52]

tensor([6737,   28,   28, 6737,   28,  260,   28,   28,  253,   28],
       device='cuda:0')
tensor([ 372,  323,   82, 1484,  284, 7576,  359,  702,  827, 1800],
       device='cuda:0')


Training:  63%|██████▎   | 627/1000 [03:09<01:42,  3.65it/s, loss=5.32]

tensor([ 28,  28,  28, 198, 260, 282, 260,  28, 198, 198], device='cuda:0')
tensor([3726, 4157,   28, 1980, 1556,  282, 1029,   28,  198,   68],
       device='cuda:0')


Training:  63%|██████▎   | 628/1000 [03:09<01:42,  3.62it/s, loss=5.18]

tensor([21723,    28,   325,   260, 21723,    28,   198,   198,   198,  4728],
       device='cuda:0')
tensor([10054,  2526,  4571,   957,  6943,    30,   198,   198,    59,  4728],
       device='cuda:0')


Training:  63%|██████▎   | 629/1000 [03:09<01:42,  3.61it/s, loss=5.47]

tensor([6737,   28,  339,   28,   28,   28,  260,   28,  198,  198],
       device='cuda:0')
tensor([20641,   645,   346,   599,   277,   351,  1272,    47,   198,   198],
       device='cuda:0')


Training:  63%|██████▎   | 630/1000 [03:10<01:43,  3.57it/s, loss=5.47]

tensor([6737,   28,   28,  789,  339,  260, 4197,   42,  198,  198],
       device='cuda:0')
tensor([ 7501,  1114,   287,   789, 43228,   260, 31049,    42,   198, 21553],
       device='cuda:0')


Training:  63%|██████▎   | 631/1000 [03:10<01:44,  3.53it/s, loss=5.2] 

tensor([  28,    0, 6737,   28,  282,  260,   28,   28,   28,   42],
       device='cuda:0')
tensor([  252,  1207,  6737,   578,   284, 43537,   318,  1076, 35216,    42],
       device='cuda:0')


Training:  63%|██████▎   | 632/1000 [03:10<01:44,  3.53it/s, loss=5.37]

tensor([ 42,  28,  28, 198, 325, 260,  28,  28, 198, 260], device='cuda:0')
tensor([ 318, 4699,  198, 2068,  963,  451, 1861,   30, 1626,   29],
       device='cuda:0')


Training:  63%|██████▎   | 633/1000 [03:11<01:43,  3.54it/s, loss=5.22]

tensor([  93,    0, 6737,  282,  282,  198,  339,  260,  555,   28],
       device='cuda:0')
tensor([   88,  9535,  2751,  1352,   198,  3528,  7471,   253, 40590, 27044],
       device='cuda:0')


Training:  63%|██████▎   | 634/1000 [03:11<01:42,  3.57it/s, loss=5.19]

tensor([ 93,  28, 198, 339, 457, 288, 260, 198, 457, 260], device='cuda:0')
tensor([1222,   28,  347,  339, 2422,  411,   28,  339, 1217,  665],
       device='cuda:0')


Training:  64%|██████▎   | 635/1000 [03:11<01:42,  3.56it/s, loss=5.05]

tensor([   42,    28,   198,    28,   198,   198,   198,  2680, 24625,   198],
       device='cuda:0')
tensor([  589,    28,  7706,    47,   198,   198,    56,  2680, 24625, 33702],
       device='cuda:0')


Training:  64%|██████▎   | 636/1000 [03:11<01:42,  3.56it/s, loss=5.21]

tensor([ 28, 198, 198, 457, 339, 555,  28, 198,  28, 198], device='cuda:0')
tensor([   30,   198,  2683,   457,   253,  4132,    28,   330, 13468,   290],
       device='cuda:0')


Training:  64%|██████▎   | 637/1000 [03:12<01:41,  3.59it/s, loss=5.42]

tensor([ 198,  198,  260, 4197,   28,  282,  260, 4314, 6737,   28],
       device='cuda:0')
tensor([  198, 28925,   260, 14568,  6956,   282,   278,  4314,   358,    43],
       device='cuda:0')


Training:  64%|██████▍   | 638/1000 [03:12<01:39,  3.64it/s, loss=5.43]

tensor([ 42,  28,  28,  28, 260,  28, 260,  28, 198, 198], device='cuda:0')
tensor([ 1319,   549,  3287,   282,  3878,   564, 20727,    30,   198,  2705],
       device='cuda:0')


Training:  64%|██████▍   | 639/1000 [03:12<01:38,  3.67it/s, loss=5.19]

tensor([6737,   28,  260,   28,  198,  339,  339,   28,  282,  260],
       device='cuda:0')
tensor([1038,  564, 2112,  198, 3528,  338, 1165, 1743,  282,  260],
       device='cuda:0')


Training:  64%|██████▍   | 640/1000 [03:13<01:38,  3.67it/s, loss=5.54]

tensor([  198,   198,   198,  2680, 24625,   198,  7113,  4728,   198,  4501],
       device='cuda:0')
tensor([  198,   198,    56,  2680, 24625,   389,  7113,  4728,    50,  3911],
       device='cuda:0')


Training:  64%|██████▍   | 641/1000 [03:13<01:38,  3.65it/s, loss=5.03]

tensor([   28,   198,   260, 21723,    28,   198,   325, 21723,   198,   260],
       device='cuda:0')
tensor([   28,   284,   650,  1911,   198,  2068, 17831,  2754,   282,   469],
       device='cuda:0')


Training:  64%|██████▍   | 642/1000 [03:13<01:38,  3.63it/s, loss=5.21]

tensor([198, 457, 325, 549, 198, 260,  28,  28, 198, 198], device='cuda:0')
tensor([ 339, 3060, 1928,   28,  284, 6819, 1147,   43,  198, 2193],
       device='cuda:0')


Training:  64%|██████▍   | 643/1000 [03:13<01:39,  3.61it/s, loss=5.08]

tensor([ 198,  198,  198,   28,   28,  198,  260,  260, 4197,   28],
       device='cuda:0')
tensor([   28,   198, 22204,  3930,    28,   359,   511,   260,  2321,  8770],
       device='cuda:0')


Training:  64%|██████▍   | 644/1000 [03:14<01:38,  3.60it/s, loss=5.26]

tensor([   0,   28,  260,   28,  198,  271,  339,   28, 4197,   28],
       device='cuda:0')
tensor([  524,   284, 23414,   198,    68,   388, 17072,   260,  8423,    29],
       device='cuda:0')


Training:  64%|██████▍   | 645/1000 [03:14<01:37,  3.62it/s, loss=5.14]

tensor([ 198,  381,   42,  198,   28,  339,   28,  260,  260, 4197],
       device='cuda:0')
tensor([  60, 4768,   28,  346,  338, 1535,  359,  656,  653, 8641],
       device='cuda:0')


Training:  65%|██████▍   | 646/1000 [03:14<01:39,  3.56it/s, loss=5.14]

tensor([   28, 21723,    28,   100,   198,   198,   325,    42,     0,   198],
       device='cuda:0')
tensor([ 957, 8664,  506, 9596,  198, 2068, 3423,   85,  936,  260],
       device='cuda:0')


Training:  65%|██████▍   | 647/1000 [03:15<01:43,  3.41it/s, loss=5.2] 

tensor([  282,   441,    28,   260,   198,   198, 21723,    28,   198,     0],
       device='cuda:0')
tensor([ 536,  441,  963,   28,  198, 5965, 3506, 8739,  267, 1878],
       device='cuda:0')


Training:  65%|██████▍   | 648/1000 [03:15<01:40,  3.52it/s, loss=5.2]

tensor([ 42, 198, 198,  28, 549, 457,  28, 198, 260,  28], device='cuda:0')
tensor([   43,   198, 14344,  1928,   339,  8949,    28,   284,  3310,  1188],
       device='cuda:0')


Training:  65%|██████▍   | 649/1000 [03:15<01:42,  3.42it/s, loss=4.79]

tensor([ 28, 198, 198,  28, 198, 260,  28,  28, 198, 260], device='cuda:0')
tensor([   30,   198,  2696,    28,   411,  8949, 19461,    28,   411,   957],
       device='cuda:0')


Training:  65%|██████▌   | 650/1000 [03:15<01:42,  3.40it/s, loss=5.01]

tensor([   28,   198,   339,    28,   198,   100,    28,   198, 21723,    28],
       device='cuda:0')
tensor([   28,   347, 17072,  1012,   506,   100,    28, 13987,  8135,  1301],
       device='cuda:0')


Training:  65%|██████▌   | 651/1000 [03:16<01:41,  3.44it/s, loss=5.25]

tensor([3911, 4728,  198, 3911, 9620,   42,  198,  198,  260,  260],
       device='cuda:0')
tensor([7113, 4728,   50, 3911, 9620,   42,  198, 3825,  511,  957],
       device='cuda:0')


Training:  65%|██████▌   | 652/1000 [03:16<01:40,  3.46it/s, loss=5.25]

tensor([314,  28,  28, 457, 260, 339, 339,  28,  28, 198], device='cuda:0')
tensor([ 787,  555, 2093, 2216,  564,  338, 8177, 2767,  198, 5195],
       device='cuda:0')


Training:  65%|██████▌   | 653/1000 [03:16<01:42,  3.40it/s, loss=5.07]

tensor([  0,  28, 339, 441,  28,  28, 198, 198, 339, 441], device='cuda:0')
tensor([ 2164,   338,   536,  8018,   737,    28,   198, 27737,   536,   339],
       device='cuda:0')


Training:  65%|██████▌   | 654/1000 [03:16<01:45,  3.28it/s, loss=5.21]

tensor([   28,   260,   260, 21723,    28,   198,   198,   260,   282,   260],
       device='cuda:0')
tensor([ 5852,   429,   469, 15062,    28,   198,  3280,  1556,   282, 19888],
       device='cuda:0')


Training:  66%|██████▌   | 655/1000 [03:17<01:44,  3.29it/s, loss=5.22]

tensor([282,  28, 198, 198,  42,  28, 260, 549, 198,  28], device='cuda:0')
tensor([ 2717,    17,   198, 11114,  1063,   282,  1022,    28,  5681,  7864],
       device='cuda:0')


Training:  66%|██████▌   | 656/1000 [03:17<01:43,  3.33it/s, loss=5.22]

tensor([ 28, 325,  28, 198, 198,  28, 260,  28, 282, 198], device='cuda:0')
tensor([ 3786,   325,    42,   198, 14413,   335,   540,  1737,    28,   282],
       device='cuda:0')


Training:  66%|██████▌   | 657/1000 [03:17<01:43,  3.33it/s, loss=5.25]

tensor([   42,    28,   198,   198,   198, 20055,   288,   260,  4197,    28],
       device='cuda:0')
tensor([17264,    17,   423,   198,    71,   518,   282,   354,  1850,    42],
       device='cuda:0')


Training:  66%|██████▌   | 658/1000 [03:18<01:42,  3.32it/s, loss=5.19]

tensor([  0, 253,   0,  42,  28, 198, 198, 457, 325,  28], device='cuda:0')
tensor([  715, 11415,  1088,   383,    42,   198,  1882,  3060,   457,   787],
       device='cuda:0')


Training:  66%|██████▌   | 659/1000 [03:18<01:42,  3.34it/s, loss=5.22]

tensor([ 28,  28, 260, 260, 198, 198, 260,  28,  28, 260], device='cuda:0')
tensor([  808,   288,  7012,    28,   198, 19671,   601,  1554,   282,  1123],
       device='cuda:0')


Training:  66%|██████▌   | 660/1000 [03:18<01:40,  3.38it/s, loss=5.22]

tensor([ 339,  346,   28,   28,  198,  198, 6737, 4197,   28,   28],
       device='cuda:0')
tensor([  585, 17072,  2265,    28,   198, 21175,   253,  3506,  4313,   284],
       device='cuda:0')


Training:  66%|██████▌   | 661/1000 [03:19<01:39,  3.40it/s, loss=5.31]

tensor([ 28,  42,  28,  28, 198, 198,  28,   0, 198,  42], device='cuda:0')
tensor([   90,   424,  1147,    30,   198, 46292,    95,    17,   948,  1970],
       device='cuda:0')


Training:  66%|██████▌   | 662/1000 [03:19<01:37,  3.47it/s, loss=4.96]

tensor([   28,   457,    28,  4197,    28,   198,   198, 17072,    28,    28],
       device='cuda:0')
tensor([  339,   699,   260,  2139,    42,   198,  9393, 17072,   441, 30493],
       device='cuda:0')


Training:  66%|██████▋   | 663/1000 [03:19<01:36,  3.50it/s, loss=4.96]

tensor([   28,    28, 21723,    28,   260,  4197,    28,   198,   198,   198],
       device='cuda:0')
tensor([20322,   957, 21723,  2126,   260,   905,    30,   198,   198,    62],
       device='cuda:0')


Training:  66%|██████▋   | 664/1000 [03:19<01:37,  3.44it/s, loss=5.29]

tensor([   28, 17072,    28,    28,   198,    28,   198,   198,   198,  9620],
       device='cuda:0')
tensor([21965, 17072,   719,    28,   965,    47,   198,   198,  3911,  3945],
       device='cuda:0')


Training:  66%|██████▋   | 665/1000 [03:20<01:39,  3.37it/s, loss=5.32]

tensor([  28,   28,  198,  346,    0, 4197,  541,   28,  198,  260],
       device='cuda:0')
tensor([  874,    43,  9725,    95,   253, 12724, 13285,    43, 41916,   253],
       device='cuda:0')


Training:  67%|██████▋   | 666/1000 [03:20<01:39,  3.35it/s, loss=5.18]

tensor([    0,   260, 21723,    42,  6737,    28,   198,   198,   198,  9620],
       device='cuda:0')
tensor([ 282,  650,  651,  456,  105,   47,  198,  198, 3911, 3945],
       device='cuda:0')


Training:  67%|██████▋   | 667/1000 [03:20<01:37,  3.40it/s, loss=5.06]

tensor([100,  28,  28,  28, 198, 198, 339,  28, 253,  28], device='cuda:0')
tensor([  100, 17072,  8177,    47,   198, 12908,  3661,   325,  8177,    28],
       device='cuda:0')


Training:  67%|██████▋   | 668/1000 [03:21<01:36,  3.44it/s, loss=5.19]

tensor([   28,    28,    42,   198,   198,   339, 21723,    28,    28,   260],
       device='cuda:0')
tensor([ 337, 4048,   43,  198, 2596,  957, 2629, 2606,  314, 5951],
       device='cuda:0')


Training:  67%|██████▋   | 669/1000 [03:21<01:36,  3.43it/s, loss=5.19]

tensor([2680, 4628, 8772,   42,  198,  198,   28,  198,   28,  198],
       device='cuda:0')
tensor([31540,  4628,  8772,    42,   198, 34056,    28, 38061,    28,   469],
       device='cuda:0')


Training:  67%|██████▋   | 670/1000 [03:21<01:37,  3.39it/s, loss=5.18]

tensor([  28,  260,  339,   28,  198,  198,  314,  260, 4197,   28],
       device='cuda:0')
tensor([1607,  284, 6917,   30,  198, 1348,  314,  260, 4569,   28],
       device='cuda:0')


Training:  67%|██████▋   | 671/1000 [03:22<01:37,  3.38it/s, loss=5.19]

tensor([ 28, 260,  28, 198, 198, 198, 341,  42, 198, 198], device='cuda:0')
tensor([  359,  7270,    47,   198,   198,    62, 10942,    42,   198, 35251],
       device='cuda:0')


Training:  67%|██████▋   | 672/1000 [03:22<01:36,  3.39it/s, loss=5.19]

tensor([4197,   28,  260,   30,  339,   28,  198,  198,  198, 3945],
       device='cuda:0')
tensor([ 905,  314, 2932,  284, 3599,   30,  198,  198, 3911, 3945],
       device='cuda:0')


Training:  67%|██████▋   | 673/1000 [03:22<01:36,  3.39it/s, loss=5.11]

tensor([  28,   28,   28,  282,  198,  198,  198,   42, 2285, 3438],
       device='cuda:0')
tensor([13735,   739, 34104,    30,   198,   198,    54,  5135,  2285, 18630],
       device='cuda:0')


Training:  67%|██████▋   | 674/1000 [03:22<01:35,  3.40it/s, loss=5.11]

tensor([  0,  28, 457, 260,  28,  28,  28, 198, 198, 253], device='cuda:0')
tensor([ 2275,   392,  3408,  1272,  1176,  8322,    28,   198, 16721,   653],
       device='cuda:0')


Training:  68%|██████▊   | 675/1000 [03:23<01:35,  3.41it/s, loss=4.93]

tensor([ 198,  198,  381,   73,   42, 6565,   57,   42,  198,  198],
       device='cuda:0')
tensor([  198,    60,  3907,    73, 29146,  6565,  3438,    42,   198,  5195],
       device='cuda:0')


Training:  68%|██████▊   | 676/1000 [03:23<01:34,  3.43it/s, loss=4.79]

tensor([6737,   28,  198,  198, 3060,   28,   28,   28,  198,  198],
       device='cuda:0')
tensor([  313,    42,   198,    57,  3287,   787, 33974,    30,   198,   198],
       device='cuda:0')


Training:  68%|██████▊   | 677/1000 [03:23<01:33,  3.46it/s, loss=5.07]

tensor([  57, 3060,   42,  198,  198,  325,   28,  314,  198, 3060],
       device='cuda:0')
tensor([  57, 3438,   42,  198, 2068, 2988,  338,   28,  339,  868],
       device='cuda:0')


Training:  68%|██████▊   | 678/1000 [03:24<01:32,  3.48it/s, loss=4.89]

tensor([260,  28, 325,  28, 198, 198, 449, 339,  28, 260], device='cuda:0')
tensor([17072, 36034, 22576,    28,   198,  4370,   449, 30493,   411,   957],
       device='cuda:0')


Training:  68%|██████▊   | 679/1000 [03:24<01:33,  3.45it/s, loss=4.89]

tensor([  28,   28,  198,  198,  449,  339, 3060,   28,  198,   28],
       device='cuda:0')
tensor([1225,   17,  198, 4370,  449,  339,  441,   28,  965,   28],
       device='cuda:0')


Training:  68%|██████▊   | 680/1000 [03:24<01:34,  3.39it/s, loss=5.31]

tensor([ 28, 260,  28, 198, 253,  28, 260,  28, 198, 198], device='cuda:0')
tensor([ 288,  685,   28,  564, 2093,  288, 1003,   30,  198,   63],
       device='cuda:0')


Training:  68%|██████▊   | 681/1000 [03:25<01:35,  3.35it/s, loss=5.41]

tensor([   28,    28,   253,   555,    28,    28,   198,   198,   198, 46634],
       device='cuda:0')
tensor([  331, 26365,   253,  6243,  2139,    30,   198,   198, 43029,  1754],
       device='cuda:0')


Training:  68%|██████▊   | 682/1000 [03:25<01:31,  3.48it/s, loss=5.23]

tensor([   0,   28,   28,   28,  198,  198,  198, 3945,   63,   42],
       device='cuda:0')
tensor([1503,  346, 5173,   30,  198,  198, 3911, 3945,   63,   42],
       device='cuda:0')


Training:  68%|██████▊   | 683/1000 [03:25<01:30,  3.51it/s, loss=5.23]

tensor([ 28,  28,  28, 198, 549,  28, 555,  28,  28, 198], device='cuda:0')
tensor([ 9154,  5569,    28, 13804,   441,   253, 22415,   555,    43,   198],
       device='cuda:0')


Training:  68%|██████▊   | 684/1000 [03:25<01:30,  3.48it/s, loss=5.1] 

tensor([ 28, 198,  42, 198,  28, 260, 314, 282, 198, 260], device='cuda:0')
tensor([   30, 14755,    28,  1690,   429,   338, 10172,   198,  6228,  2112],
       device='cuda:0')


Training:  68%|██████▊   | 685/1000 [03:26<01:29,  3.53it/s, loss=5.16]

tensor([ 28, 549,  28, 260,  28,  28,  28, 198, 198,  28], device='cuda:0')
tensor([7200,  874,  429,  451, 1796, 6724,   28,  198, 6882,  281],
       device='cuda:0')


Training:  69%|██████▊   | 686/1000 [03:26<01:28,  3.53it/s, loss=5.16]

tensor([  28,   28,   28,  198,  198, 3060,   28,  260,   28,  198],
       device='cuda:0')
tensor([  315,   851,    28,   198,    57, 14941,   411, 10419,   623,  3497],
       device='cuda:0')


Training:  69%|██████▊   | 687/1000 [03:26<01:29,  3.49it/s, loss=4.7] 

tensor([ 346,  198,  198, 3060,  253, 4197,   28,  260,   28, 4197],
       device='cuda:0')
tensor([   42,   198,    57,   744,   260,  4132,   282,  6661,   260, 25729],
       device='cuda:0')


Training:  69%|██████▉   | 688/1000 [03:26<01:30,  3.46it/s, loss=4.7]

tensor([  0,  28,  28,  28, 198, 198,  28, 253,  28, 339], device='cuda:0')
tensor([ 254,  349,  269,   30,  198, 2596,  325,  357,  347,  357],
       device='cuda:0')


Training:  69%|██████▉   | 689/1000 [03:27<01:30,  3.43it/s, loss=4.93]

tensor([ 28,  28, 198, 198, 198,  42,  42, 198, 198, 260], device='cuda:0')
tensor([ 1448,    30,   198,   198,    73, 25089,    42,   198,  9629,   732],
       device='cuda:0')


Training:  69%|██████▉   | 690/1000 [03:27<01:29,  3.46it/s, loss=5.29]

tensor([   28,   260,  6737,    28,  4197,   282,   339, 21723,    28,   325],
       device='cuda:0')
tensor([ 2853,    96,  6737,   260,  7429,   837, 13987,  1924,   868,  4137],
       device='cuda:0')


Training:  69%|██████▉   | 691/1000 [03:27<01:28,  3.50it/s, loss=5.22]

tensor([  42,  198,  198,  198, 2285,   54,   42,   42,  198,  198],
       device='cuda:0')
tensor([   30,   198,   198,  7430,  8107,    54, 10030,    42,   198, 45496],
       device='cuda:0')


Training:  69%|██████▉   | 692/1000 [03:28<01:26,  3.55it/s, loss=5.16]

tensor([   28,    28,   198,   339,   198,   302,  3060,    28,    28, 21723],
       device='cuda:0')
tensor([  588,   198,  5195, 12765,   416,   339,  1089,  3806,   957,  3497],
       device='cuda:0')


Training:  69%|██████▉   | 693/1000 [03:28<01:25,  3.60it/s, loss=5.32]

tensor([6737,  282,  314,  457,   28,  198,  198, 4197, 4197,   28],
       device='cuda:0')
tensor([ 4196,   384,   392,   437,    28,   198,   504, 42502, 15731,  4502],
       device='cuda:0')


Training:  69%|██████▉   | 694/1000 [03:28<01:23,  3.66it/s, loss=5.24]

tensor([ 6737,   339,    28, 21723,   282,   260,    28,   198,   198,   198],
       device='cuda:0')
tensor([10937,  1209,   650, 26467,   282, 12669,    30,   198,   198, 29719],
       device='cuda:0')


Training:  70%|██████▉   | 695/1000 [03:28<01:23,  3.67it/s, loss=4.82]

tensor([  28,  260, 4197,  282,  198,  198,   28,  260, 4197,   28],
       device='cuda:0')
tensor([15032,   480, 10172,    28,   198, 16654,   259,   480,  1038,  2397],
       device='cuda:0')


Training:  70%|██████▉   | 696/1000 [03:29<01:23,  3.64it/s, loss=5.13]

tensor([  260,   549,    28,    30,   198,   198,   198,  2680, 24625,   198],
       device='cuda:0')
tensor([ 4875,   549,  3287,    30,   198,   198, 41074,  2680, 35135,    55],
       device='cuda:0')


Training:  70%|██████▉   | 697/1000 [03:29<01:22,  3.66it/s, loss=5.28]

tensor([ 28, 260, 381,  28, 260,  42,   0, 282,  28, 198], device='cuda:0')
tensor([  281,   544,   381,   411,  4360,  4565,  1802, 11064,    47,   198],
       device='cuda:0')


Training:  70%|██████▉   | 698/1000 [03:29<01:21,  3.71it/s, loss=5.05]

tensor([ 28,  28, 198, 198, 339,  28, 198,  28, 198, 260], device='cuda:0')
tensor([ 4463,    28,   198,  2427,  1176,    28, 37494,    28,   284, 33974],
       device='cuda:0')


Training:  70%|██████▉   | 699/1000 [03:30<01:22,  3.64it/s, loss=5.22]

tensor([  260,   260, 21723,    28,   198,   198, 21723,    28,   260,    28],
       device='cuda:0')
tensor([  351,   957,  8664,    17,   198,  5965,  2606,   284,  3696, 36644],
       device='cuda:0')


Training:  70%|███████   | 700/1000 [03:30<01:23,  3.58it/s, loss=4.96]

tensor([   28,    28,    28,   198,   198,   339,   198,   260,    28, 21723],
       device='cuda:0')
tensor([ 2874,  1592,    43,   198,  3528,    28,  1953,  4649, 13987,  3289],
       device='cuda:0')


Training:  70%|███████   | 701/1000 [03:30<01:23,  3.58it/s, loss=4.96]

tensor([ 28, 325,  28,  42,  28, 198, 325,  28,  28, 260], device='cuda:0')
tensor([1251,  325, 2627,  277,  198, 2068,  685, 1683,  351,  468],
       device='cuda:0')


Training:  70%|███████   | 702/1000 [03:30<01:26,  3.43it/s, loss=4.31]

tensor([  198,  3907,    73,    42,   198,    42,   198,   198, 21723,    28],
       device='cuda:0')
tensor([   60,  3907,    73, 23675,    73,    42,   198,  5965,  2606, 10937],
       device='cuda:0')


Training:  70%|███████   | 703/1000 [03:31<01:26,  3.45it/s, loss=5.27]

tensor([  28,  253,   42,   28,   28,  260, 4197, 4314,   28,  198],
       device='cuda:0')
tensor([16568,  8767,   301,  2177,   351,   634,   278,  6842,    28,   198],
       device='cuda:0')


Training:  70%|███████   | 704/1000 [03:31<01:24,  3.50it/s, loss=5.2] 

tensor([   28,   198,   198,   198,    71, 20055,    42,   198,   198,   260],
       device='cuda:0')
tensor([   30,   198,   198, 29719,    71, 27163,    42,   198,  5212, 14942],
       device='cuda:0')


Training:  70%|███████   | 705/1000 [03:31<01:23,  3.54it/s, loss=5.02]

tensor([   28,   198,   198,   198,  4728,   198,    71, 20055,    42,    42],
       device='cuda:0')
tensor([   30,   198,   198,    59,  4728, 19436,    71,  2721, 38744,    42],
       device='cuda:0')


Training:  71%|███████   | 706/1000 [03:32<01:23,  3.54it/s, loss=4.91]

tensor([ 28, 198, 260, 549,  28,  28,  28, 198, 198,  28], device='cuda:0')
tensor([   28,   284, 18089,  7065,  9446,  5182,    43,   198,  3528,    28],
       device='cuda:0')


Training:  71%|███████   | 707/1000 [03:32<01:21,  3.60it/s, loss=4.83]

tensor([4197,  282,    0,   28,  260, 4083,   29,   28,  100,   28],
       device='cuda:0')
tensor([ 5151, 11948,  5559,   335,   469,   725,  6221,   506,  4132,    28],
       device='cuda:0')


Training:  71%|███████   | 708/1000 [03:32<01:20,  3.64it/s, loss=5]   

tensor([6737,  253, 3060,   28,  260,   28,  198,  198,   28,   28],
       device='cuda:0')
tensor([ 744,  339, 2090,  282, 9970,   30,  198, 2696, 3472, 1303],
       device='cuda:0')


Training:  71%|███████   | 709/1000 [03:32<01:21,  3.58it/s, loss=5]

tensor([   28,   198,   198,   198,  4728,   198,    71, 20055,    42,    42],
       device='cuda:0')
tensor([   30,   198,   198,    59,  4728, 15494,    71, 20055,  9536,    42],
       device='cuda:0')


Training:  71%|███████   | 710/1000 [03:33<01:21,  3.55it/s, loss=4.96]

tensor([  28,   28,  260,  282,   42,  198,  457,   28,   28, 4197],
       device='cuda:0')
tensor([4658,  282, 4929, 9686,  198, 4948, 3100, 6616,  480, 2184],
       device='cuda:0')


Training:  71%|███████   | 711/1000 [03:33<01:20,  3.59it/s, loss=4.84]

tensor([  28, 4197,   28,  198,  198,   28,   28,   28,   28,    0],
       device='cuda:0')
tensor([  260,  1532,    28,   198, 16075,  3310,  1869, 18598,   283,   494],
       device='cuda:0')


Training:  71%|███████   | 712/1000 [03:33<01:20,  3.57it/s, loss=4.84]

tensor([  198,   198,   198,    71, 20055,    42,   198,   198,    28,    28],
       device='cuda:0')
tensor([  198,   198, 29719,    71, 27163,    42,   198,  1780,  8296,    28],
       device='cuda:0')


Training:  71%|███████▏  | 713/1000 [03:34<01:21,  3.52it/s, loss=4.76]

tensor([17072,    28,    23,    28,  4197,    28,   282,   198,   198,   494],
       device='cuda:0')
tensor([17072,   263,   432,   260,  2240, 17816,    28,   198, 15024,   494],
       device='cuda:0')


Training:  71%|███████▏  | 714/1000 [03:34<01:20,  3.56it/s, loss=5.12]

tensor([ 325,   28, 4197,  282,  260,   28,  198,  198,   28,  325],
       device='cuda:0')
tensor([3934,  260, 3102,  284, 3568,   47,  198, 6882, 3786,  392],
       device='cuda:0')


Training:  72%|███████▏  | 715/1000 [03:34<01:19,  3.59it/s, loss=5.04]

tensor([  28, 6189,  260,  253, 4197,   28,   28,  198,  260,  260],
       device='cuda:0')
tensor([   93,   284,   702,   253, 27508, 18030,   198,  3825, 12885,   739],
       device='cuda:0')


Training:  72%|███████▏  | 716/1000 [03:34<01:19,  3.57it/s, loss=5.04]

tensor([ 28, 549,  28,  30, 198, 198,  28, 198, 302,  28], device='cuda:0')
tensor([4875, 1272, 3287,   30,  198, 1780,   17,  416,  588, 1805],
       device='cuda:0')


Training:  72%|███████▏  | 717/1000 [03:35<01:21,  3.46it/s, loss=4.99]

tensor([   28,   198,   198,    28,   260,   789, 17072,  4197,   198,    28],
       device='cuda:0')
tensor([  28,  198, 3528, 1062, 3005,  474, 2442,   43, 3472,   28],
       device='cuda:0')


Training:  72%|███████▏  | 718/1000 [03:35<01:20,  3.52it/s, loss=5.5] 

tensor([ 260,  100, 4197,   28,  198,  198,    0,   93, 6737,   28],
       device='cuda:0')
tensor([  506,  9202, 15308,    28,   198,  3733,    29,    96,  2467,   821],
       device='cuda:0')


Training:  72%|███████▏  | 719/1000 [03:35<01:18,  3.58it/s, loss=5.22]

tensor([  282,    28,   198,   339,   253,   260,   260,   198, 21723,    28],
       device='cuda:0')
tensor([ 1163,   198, 26001,   325,  1042,   614,    28,   957,  5717,    28],
       device='cuda:0')


Training:  72%|███████▏  | 720/1000 [03:35<01:18,  3.57it/s, loss=5.3] 

tensor([6737,  198,  198,  198,   42, 2097,   42,  198,  198,   28],
       device='cuda:0')
tensor([   47,   198,   198,  4796, 32598,  2097,    42,   198, 19525,    28],
       device='cuda:0')


Training:  72%|███████▏  | 721/1000 [03:36<01:18,  3.55it/s, loss=5.4]

tensor([ 4197,    28,   198,  4197,   282,   198,   346, 21723,    28,    28],
       device='cuda:0')
tensor([  599,    28,  3449,  2843,   198, 20314,   650,    99,   549,   288],
       device='cuda:0')


Training:  72%|███████▏  | 722/1000 [03:36<01:17,  3.57it/s, loss=5.39]

tensor([ 198,   28,   28,  260,  260,   28,  198,  198,  198, 4192],
       device='cuda:0')
tensor([ 1643,   982,   284, 16011,   982,    30,   198,   198, 39776,  4192],
       device='cuda:0')


Training:  72%|███████▏  | 723/1000 [03:36<01:17,  3.59it/s, loss=5.2] 

tensor([   28,   198,   198,   198,  4192,  6426, 15604,   198,   198,    28],
       device='cuda:0')
tensor([   47,   198,   198, 39776,  4192,  6426,    42,   198,  3482,  3861],
       device='cuda:0')


Training:  72%|███████▏  | 724/1000 [03:36<01:17,  3.55it/s, loss=5.2]

tensor([  198,  4197,    28,   253, 21723,    28,   198,   260,   260,   198],
       device='cuda:0')
tensor([14229, 12557,   325,   957,  3289,    28,   284,  4160,   198,   504],
       device='cuda:0')


Training:  72%|███████▎  | 725/1000 [03:37<01:19,  3.48it/s, loss=5.21]

tensor([549,  28, 325, 253, 198, 198, 314, 100,   0,  42], device='cuda:0')
tensor([ 357,  868,  325,   28,  198, 8113,  506,  354, 2490,   85],
       device='cuda:0')


Training:  73%|███████▎  | 726/1000 [03:37<01:17,  3.52it/s, loss=5.21]

tensor([346,  28, 198, 339,   0,  28,  28, 198, 198,  28], device='cuda:0')
tensor([  346,    28,   355,   328,   101,   432, 31886,   198,  6882, 43624],
       device='cuda:0')


Training:  73%|███████▎  | 727/1000 [03:37<01:17,  3.53it/s, loss=4.98]

tensor([ 28, 198,  42, 198, 260,  93,   0,  28, 198,  28], device='cuda:0')
tensor([  198,    54,  4985,   284,   430, 33659,  1718,    28,   441,   253],
       device='cuda:0')


Training:  73%|███████▎  | 728/1000 [03:38<01:16,  3.54it/s, loss=5]   

tensor([ 28, 198, 198, 198,  42,  42,  42,  42, 198, 198], device='cuda:0')
tensor([   17,   198,   198, 17321,  5819,  2154,  4501,    42,   198,    57],
       device='cuda:0')


Training:  73%|███████▎  | 729/1000 [03:38<01:15,  3.57it/s, loss=5.46]

tensor([  28,  198,  198,  100,   28,   28,  198,  260, 4197,   28],
       device='cuda:0')
tensor([   28,   963,   506,   100, 17072,    17,   327,   260,  2319,   198],
       device='cuda:0')


Training:  73%|███████▎  | 730/1000 [03:38<01:15,  3.59it/s, loss=5.16]

tensor([ 552,   28,  198,  339, 3060,  441,  260,  198,  260,   28],
       device='cuda:0')
tensor([15684,    28,   527,   339,   736, 20172,    42,   327, 19461,    28],
       device='cuda:0')


Training:  73%|███████▎  | 731/1000 [03:39<01:14,  3.60it/s, loss=5.27]

tensor([  0,  28,  28, 198, 198, 198, 403,  42, 198, 198], device='cuda:0')
tensor([  313,  2694,    30,   198,   198, 36169,   403,    42,   198,  5965],
       device='cuda:0')


Training:  73%|███████▎  | 732/1000 [03:39<01:14,  3.59it/s, loss=5.59]

tensor([21723,    28,    28,   198,   198,   314,   260,   198,    28,   198],
       device='cuda:0')
tensor([ 1038, 21723,    28,   198, 10576,   314,  3590,  1147,    42,  1188],
       device='cuda:0')


Training:  73%|███████▎  | 733/1000 [03:39<01:14,  3.59it/s, loss=5.46]

tensor([6737,  198,  198,   42,   28,  198,   28,    0,  198,  198],
       device='cuda:0')
tensor([   42,   198, 31000,  9677,    28,  3092,    81,    17,   198,   198],
       device='cuda:0')


Training:  73%|███████▎  | 734/1000 [03:39<01:14,  3.58it/s, loss=5.69]

tensor([   0,  198,  198,   28,  282,  198,  198, 4197,   28,  260],
       device='cuda:0')
tensor([17926, 24113, 20322,   750,  2775,    43,   253,  2112,   288,   198],
       device='cuda:0')


Training:  74%|███████▎  | 735/1000 [03:40<01:14,  3.55it/s, loss=5.69]

tensor([ 339,  339,  198, 3060,  260,  198,  260, 4197,   28,   93],
       device='cuda:0')
tensor([  744,   198,    57,   288,  4571,   327,   653, 11108,    29,  6691],
       device='cuda:0')


Training:  74%|███████▎  | 736/1000 [03:40<01:15,  3.51it/s, loss=5.39]

tensor([21723, 21723,    28,   198,   198,   325,     0,    28,   260,  4197],
       device='cuda:0')
tensor([47910, 21723,    28,   198,  2068,   410,   489,   418,   469, 23454],
       device='cuda:0')


Training:  74%|███████▎  | 737/1000 [03:40<01:13,  3.56it/s, loss=5.4] 

tensor([  198,  3060,   441,  6737,   198,   314,    28,   325,   198, 34818],
       device='cuda:0')
tensor([  339,   736,  6796,   198,  1348,  5569,   868,  1643,   637, 10372],
       device='cuda:0')


Training:  74%|███████▍  | 738/1000 [03:40<01:14,  3.54it/s, loss=5.4]

tensor([ 28,  28, 198, 198, 198, 403,  42, 198, 198, 314], device='cuda:0')
tensor([ 1756,    30,   198,   198, 36169,   403,    42,   198,  3681, 26365],
       device='cuda:0')


Training:  74%|███████▍  | 739/1000 [03:41<01:16,  3.43it/s, loss=5.65]

tensor([ 42, 555, 198,  28,  28, 198, 198, 198,  42, 198], device='cuda:0')
tensor([  253,  1035,  5740,   582,    30,   198,   198, 12850,  4545,    49],
       device='cuda:0')


Training:  74%|███████▍  | 740/1000 [03:41<01:17,  3.37it/s, loss=5]   

tensor([6737,   28,  198,  198,  198, 4192, 6426,   42,  198,  198],
       device='cuda:0')
tensor([ 6737,    30,   198,   198, 39776,  4192,  6426,    42,   198,  1348],
       device='cuda:0')


Training:  74%|███████▍  | 741/1000 [03:41<01:15,  3.41it/s, loss=5.11]

tensor([ 28, 555,   0,  28, 198,  28,  28, 198, 198,  28], device='cuda:0')
tensor([ 253, 4565,  477,   43, 7223, 6737,   28,  198, 2596, 3878],
       device='cuda:0')


Training:  74%|███████▍  | 742/1000 [03:42<01:14,  3.47it/s, loss=5.27]

tensor([ 28,  28,  28, 198,  42,  28, 260, 282, 339,  28], device='cuda:0')
tensor([ 550, 1075,  198,   54, 1738,  670, 2275,  355,  655,   30],
       device='cuda:0')


Training:  74%|███████▍  | 743/1000 [03:42<01:14,  3.47it/s, loss=5.27]

tensor([   0,   28,  198,  198,  198, 4192, 6426,   42,  198,  198],
       device='cuda:0')
tensor([ 2001,    30,   198,   198, 39776,  4192,  6426,    42,   198,    62],
       device='cuda:0')


Training:  74%|███████▍  | 744/1000 [03:42<01:15,  3.41it/s, loss=5.13]

tensor([  198,   314,    42,   198,   198, 21723,    28,   198,    28,   198],
       device='cuda:0')
tensor([ 8113, 21660,    42,   198,  5965,  1861,    28, 38061,    28,   314],
       device='cuda:0')


Training:  74%|███████▍  | 745/1000 [03:43<01:13,  3.45it/s, loss=5.15]

tensor([ 28, 339, 314, 549,  28, 198, 314, 253, 198, 260], device='cuda:0')
tensor([  347,   384, 48271,   468,    42,   384,   436,  2711,   288,   536],
       device='cuda:0')


Training:  75%|███████▍  | 746/1000 [03:43<01:12,  3.51it/s, loss=5.14]

tensor([  28,  198,  198,  198,   42, 2680, 2097,   42,  198,  198],
       device='cuda:0')
tensor([   30,   198,   198, 48902,  9232,  2680,  2097,    42,   198, 14229],
       device='cuda:0')


Training:  75%|███████▍  | 747/1000 [03:43<01:11,  3.54it/s, loss=5.17]

tensor([  924,   282,   198, 21723,    30,    28,   198,  3060,   198,   260],
       device='cuda:0')
tensor([ 924,   42,  957, 1450, 4445,  198,   57,  457,  429,  469],
       device='cuda:0')


Training:  75%|███████▍  | 748/1000 [03:43<01:09,  3.62it/s, loss=5.83]

tensor([ 339,  198,  339,   28,  198,  198,  260, 4197,  198,   42],
       device='cuda:0')
tensor([ 7697,   355, 25514,    43,   564,   281,   260,   198,  1527,  2832],
       device='cuda:0')


Training:  75%|███████▍  | 749/1000 [03:44<01:08,  3.66it/s, loss=5.83]

tensor([ 198,  260,   28,  260,  198, 6737,  198,  462,   30,  260],
       device='cuda:0')
tensor([  281,  3826,   282,   198, 11247,    42,   295,  1878,   351,   511],
       device='cuda:0')


Training:  75%|███████▌  | 750/1000 [03:44<01:09,  3.60it/s, loss=5.04]

tensor([ 42,  57,  42, 198, 198, 339, 314,  28,  28, 198], device='cuda:0')
tensor([ 6565, 38415,    42,   198,  1653,  1041,  4161,  8361,  1325,    28],
       device='cuda:0')


Training:  75%|███████▌  | 751/1000 [03:44<01:08,  3.63it/s, loss=5.2] 

tensor([ 28,  28, 198, 198,  28,  42, 198,  28, 198, 457], device='cuda:0')
tensor([ 1272,   198, 35097,  1029, 25865,  1687,   346,    30,  1206, 13461],
       device='cuda:0')


Training:  75%|███████▌  | 752/1000 [03:45<01:09,  3.58it/s, loss=5.04]

tensor([21723,    28,   198,   325,   198,   339,   198, 21723,    28,   260],
       device='cuda:0')
tensor([4778,  198, 2068,  582,  338,  416,  957,  599,  281, 1272],
       device='cuda:0')


Training:  75%|███████▌  | 753/1000 [03:45<01:08,  3.63it/s, loss=5.04]

tensor([  28,  198,  457,  253,   28,  198,  198,  198,  381, 8242],
       device='cuda:0')
tensor([   43,   339,   744,  2139,    30,   198,   198,    60, 15604,  8772],
       device='cuda:0')


Training:  75%|███████▌  | 754/1000 [03:45<01:09,  3.55it/s, loss=5.35]

tensor([ 28,  28, 198, 198, 260, 339, 457, 282,  28, 198], device='cuda:0')
tensor([2428,   28,  198, 5212, 5337,  392, 2275,  357, 2220,  288],
       device='cuda:0')


Training:  76%|███████▌  | 755/1000 [03:45<01:08,  3.58it/s, loss=5.23]

tensor([ 28,  28, 325, 457, 346,  28, 198, 198,  28, 198], device='cuda:0')
tensor([  424,  3786,   339,  8215,   346,    43,   198, 15017,    28,   451],
       device='cuda:0')


Training:  76%|███████▌  | 756/1000 [03:46<01:08,  3.55it/s, loss=5.23]

tensor([457,  42, 339, 346,  28, 198, 198,   0, 549,  28], device='cuda:0')
tensor([  948, 30437,  9984,   346,    42,   198,  9482,   486,   549,   288],
       device='cuda:0')


Training:  76%|███████▌  | 757/1000 [03:46<01:10,  3.45it/s, loss=5.1] 

tensor([ 28,  42,  28, 198, 339, 198, 198,  23,  28, 198], device='cuda:0')
tensor([ 3275,  8806,    28,   564,  1035,   198, 12018,  8806,    28,   423],
       device='cuda:0')


Training:  76%|███████▌  | 758/1000 [03:46<01:11,  3.37it/s, loss=4.99]

tensor([ 28,  28, 198,  28, 198, 339, 100, 288, 260,  28], device='cuda:0')
tensor([ 2016,    28,  1209,    28,   732,   506,  1690,  1980, 20322,    42],
       device='cuda:0')


Training:  76%|███████▌  | 759/1000 [03:46<01:12,  3.32it/s, loss=5.04]

tensor([  0, 198, 198,  30, 198, 325, 253, 314, 325,  28], device='cuda:0')
tensor([  198, 11952, 11737,    43,   654,   325,   384,   523, 23599,    30],
       device='cuda:0')


Training:  76%|███████▌  | 760/1000 [03:47<01:12,  3.32it/s, loss=4.72]

tensor([ 28, 282, 260, 339, 253,  28, 198, 198, 198, 253], device='cuda:0')
tensor([4109,  282,  732,  314, 2294,   43,  284,   28,  702,  253],
       device='cuda:0')


Training:  76%|███████▌  | 761/1000 [03:47<01:12,  3.32it/s, loss=5.12]

tensor([  28,   28,  198,  506,  260, 4197,   28,  198,  346,   28],
       device='cuda:0')
tensor([12687,   198,  4590,   281,   260,  6411,    30,  3315,   549,   260],
       device='cuda:0')


Training:  76%|███████▌  | 762/1000 [03:48<01:11,  3.33it/s, loss=4.91]

tensor([  28,   28,   28,  198,  339,  314,  339,  314,  441, 6737],
       device='cuda:0')
tensor([41063,   924,   198,  1653,  1041,   338,   384, 26365, 15270,  6737],
       device='cuda:0')


Training:  76%|███████▋  | 763/1000 [03:48<01:10,  3.37it/s, loss=4.91]

tensor([ 282,  198,  198,  198,   42, 4210,   42,   42,  198,  198],
       device='cuda:0')
tensor([   30,   198,   198,  2721,  5431,  4210,  5963,    42,   198, 34177],
       device='cuda:0')


Training:  76%|███████▋  | 764/1000 [03:48<01:10,  3.33it/s, loss=5.16]

tensor([198, 198, 549,  28, 260,  28,  28,  30, 198, 339], device='cuda:0')
tensor([  198, 48111,   549,   288,  4875,   601,  3287,    28,   837,   339],
       device='cuda:0')


Training:  76%|███████▋  | 765/1000 [03:48<01:10,  3.32it/s, loss=5.16]

tensor([  42,   28,  198,   28,  325,   28,  198,  198, 3060,  325],
       device='cuda:0')
tensor([11424,    28,   357,   868,  7219,    42,   198,    57,  3060,  5774],
       device='cuda:0')


Training:  77%|███████▋  | 766/1000 [03:49<01:09,  3.35it/s, loss=5.29]

tensor([ 28, 198, 339,  28,  28,  28, 198, 198, 198,  42], device='cuda:0')
tensor([  28,  564,  716,  547,  441,   30,  198,  198, 2721, 5431],
       device='cuda:0')


Training:  77%|███████▋  | 767/1000 [03:49<01:09,  3.36it/s, loss=5.29]

tensor([ 28,  28, 260,  28,  28, 198,  28, 260,  28, 198], device='cuda:0')
tensor([17072,   288,   451, 17324,    28,  2631,   284,  1341,    47,  1431],
       device='cuda:0')


Training:  77%|███████▋  | 768/1000 [03:49<01:09,  3.34it/s, loss=5.05]

tensor([  28, 4197,   28,  260,   28,  198,   28,  198,  339,    0],
       device='cuda:0')
tensor([  260, 35278,   281,   346,    28, 48275,    28,   355,  4662,   894],
       device='cuda:0')


Training:  77%|███████▋  | 769/1000 [03:50<01:07,  3.40it/s, loss=5.11]

tensor([  28,   28,  253, 4197,   28,  198,  198,  198,   42, 4501],
       device='cuda:0')
tensor([27044,   436,   260, 48945,    47,   198,   198, 23110,  3206,  4501],
       device='cuda:0')


Training:  77%|███████▋  | 770/1000 [03:50<01:05,  3.49it/s, loss=4.96]

tensor([  28,  253,  198,  198,  198,   69, 9620,   42, 2113,  198],
       device='cuda:0')
tensor([ 702,   30,  198,  198,   52,   69, 9620,  717, 2113,   51],
       device='cuda:0')


Training:  77%|███████▋  | 771/1000 [03:50<01:06,  3.46it/s, loss=4.96]

tensor([ 23, 260,  28, 198, 198,  28, 198, 288, 198, 198], device='cuda:0')
tensor([  335,  1272,    47,   198, 10768,    28,   685,    28,  5697,  2747],
       device='cuda:0')


Training:  77%|███████▋  | 772/1000 [03:50<01:06,  3.40it/s, loss=4.82]

tensor([  28,  441,   28, 4197,   28,  198,  198,  198,   69, 9620],
       device='cuda:0')
tensor([  523,  3694,   260, 11611,    30,   198,   198,    52,    69,  9620],
       device='cuda:0')


Training:  77%|███████▋  | 773/1000 [03:51<01:07,  3.36it/s, loss=5.25]

tensor([  339,   260,   282, 21723,    28,   260,     0,     0,    28,   198],
       device='cuda:0')
tensor([9713,  578,  957, 9983,  351,  278, 3285,  849,   42,  339],
       device='cuda:0')


Training:  77%|███████▋  | 774/1000 [03:51<01:06,  3.38it/s, loss=4.95]

tensor([288, 198, 346,  28, 260,  28,  28, 198, 198, 288], device='cuda:0')
tensor([   30, 30079,   346,   351,   451,  4386,    42,   198, 17777,   623],
       device='cuda:0')


Training:  78%|███████▊  | 775/1000 [03:51<01:05,  3.41it/s, loss=4.81]

tensor([ 325,  280,   28,   28,  198,  198,  198, 5431, 4210,   63],
       device='cuda:0')
tensor([35532,   280,  7288,   423,   198,   198,  2721,  5431,  4210,  5963],
       device='cuda:0')


Training:  78%|███████▊  | 776/1000 [03:52<01:05,  3.41it/s, loss=4.81]

tensor([   28, 21723,    28,   198,   198,   198,    69,  9620,   717,  2113],
       device='cuda:0')
tensor([ 957, 4033,   30,  198,  198,   52,   69, 9620,  717, 2113],
       device='cuda:0')


Training:  78%|███████▊  | 777/1000 [03:52<01:06,  3.33it/s, loss=4.83]

tensor([  0, 198, 253,  28,  28, 198, 260, 198, 198, 288], device='cuda:0')
tensor([  43,  325,  346, 8048,  198, 6228,  469, 1038, 1728,   30],
       device='cuda:0')


Training:  78%|███████▊  | 778/1000 [03:52<01:06,  3.33it/s, loss=4.59]

tensor([  198,    28,    28,   198,   198,   198, 15604,  8772,    42,   198],
       device='cuda:0')
tensor([41288,  1055,    47,   198,   198,    60, 15604,  8772,    42,   198],
       device='cuda:0')


Training:  78%|███████▊  | 779/1000 [03:53<01:06,  3.33it/s, loss=4.75]

tensor([  0,  28, 339, 257, 259,  28, 332, 288, 198,  28], device='cuda:0')
tensor([ 1134,   338,   297,    23,   259, 15480,   381,   253, 48945,    30],
       device='cuda:0')


Training:  78%|███████▊  | 780/1000 [03:53<01:04,  3.39it/s, loss=4.75]

tensor([339, 314, 441, 198,  28, 260, 339, 314,  28, 198], device='cuda:0')
tensor([ 384, 1250,  260, 2775,  327,  527,  384, 4242,   42,  198],
       device='cuda:0')


Training:  78%|███████▊  | 781/1000 [03:53<01:03,  3.44it/s, loss=5.17]

tensor([ 28, 198, 549,  28, 288, 198, 198, 260,  30, 198], device='cuda:0')
tensor([   42,  1303,  1272,  1690,    28,   198,   397, 25128,    30,   198],
       device='cuda:0')


Training:  78%|███████▊  | 782/1000 [03:53<01:02,  3.50it/s, loss=5.7] 

tensor([  0,  28, 198, 198, 339,  28, 198,  28,  28, 260], device='cuda:0')
tensor([8263,   42,  198, 5195, 2294,   28, 2662, 1272,  288,  260],
       device='cuda:0')


Training:  78%|███████▊  | 783/1000 [03:54<01:01,  3.55it/s, loss=5.01]

tensor([ 28,  28, 198, 198, 339, 260, 339, 282, 325,  28], device='cuda:0')
tensor([ 2223,    28,   198,  3528,   418,   338,  6968,  3786,  8177, 17600],
       device='cuda:0')


Training:  78%|███████▊  | 784/1000 [03:54<01:00,  3.56it/s, loss=5.29]

tensor([   28,   198,  6426,    28,    28, 21723,   253,   339, 21723,    28],
       device='cuda:0')
tensor([ 198,   55, 1134,  549,  957, 1036,  284,  957, 4083,  808],
       device='cuda:0')


Training:  78%|███████▊  | 785/1000 [03:54<01:00,  3.57it/s, loss=5.1] 

tensor([   28,    28,   282,   198, 21723,   282,    30,   260,   198,   260],
       device='cuda:0')
tensor([  318,   578,    42,   653, 24641, 14517,   335,   198, 15135,  5882],
       device='cuda:0')


Training:  79%|███████▊  | 786/1000 [03:54<01:00,  3.51it/s, loss=5.1]

tensor([  95,   28,   28,  198,  260, 3060,  253,  314,  282,  198],
       device='cuda:0')
tensor([  293, 20322,    28,   327,   339,   457,   357,  2073,    30,   198],
       device='cuda:0')


Training:  79%|███████▊  | 787/1000 [03:55<01:01,  3.49it/s, loss=5.33]

tensor([549, 198,   0,  28,  28, 260, 198,  28, 198, 339], device='cuda:0')
tensor([  260,   412, 34315, 16592,  1980,   469, 10166,   198,  3528,   856],
       device='cuda:0')


Training:  79%|███████▉  | 788/1000 [03:55<01:00,  3.51it/s, loss=5.33]

tensor([17072,   198,   198,   198,   679,     0,  8772,    42,   198,   198],
       device='cuda:0')
tensor([   30,   198,   198,    56, 13110, 22239,  8772,    42,   198,  2696],
       device='cuda:0')


Training:  79%|███████▉  | 789/1000 [03:55<01:01,  3.45it/s, loss=4.97]

tensor([ 984,  288,  253,   28,  260,  198,  198,  198, 2810, 4501],
       device='cuda:0')
tensor([ 1441,   441,   874,   288,   423,   198,   198, 10895,  2810,  8772],
       device='cuda:0')


Training:  79%|███████▉  | 790/1000 [03:56<01:01,  3.39it/s, loss=4.86]

tensor([  198,   198,  4728, 33256,  4210,    42,   198,   198,    28,   198],
       device='cuda:0')
tensor([  198,    59, 12455, 33256,  1933,    42,   198,  1780,    28,   523],
       device='cuda:0')


Training:  79%|███████▉  | 791/1000 [03:56<01:01,  3.40it/s, loss=4.86]

tensor([253, 314, 339, 314, 549,  28, 198, 198, 260, 260], device='cuda:0')
tensor([ 357,  338, 1041, 6753,  549,   28,  198,  788,  511,  957],
       device='cuda:0')


Training:  79%|███████▉  | 792/1000 [03:56<01:02,  3.35it/s, loss=4.98]

tensor([   28,    28,   198,   198,   198,  4728, 33256,  4210,    42,   198],
       device='cuda:0')
tensor([12575,    30,   198,   198,    59, 12455, 33256,  1933,    42,   198],
       device='cuda:0')


Training:  79%|███████▉  | 793/1000 [03:57<01:02,  3.32it/s, loss=5.05]

tensor([ 6737,  6737,   198,    42, 44004,    28,   198,   314,    28,   314],
       device='cuda:0')
tensor([   91,    28,  9133,   490, 44004,    43,  1041,  1921,  1041,  3060],
       device='cuda:0')


Training:  79%|███████▉  | 794/1000 [03:57<01:01,  3.34it/s, loss=5.05]

tensor([ 28, 339, 555,  42,  28, 198, 314, 253, 198, 198], device='cuda:0')
tensor([ 564,  253, 8150,  301,   42,  384,  314, 1573,   28,  339],
       device='cuda:0')


Training:  80%|███████▉  | 795/1000 [03:57<01:02,  3.27it/s, loss=5.2] 

tensor([  28,   28, 1119,   28,  198,  198,  198, 2810, 8772,   42],
       device='cuda:0')
tensor([27415,   581,  1119,    30,   198,   198, 11718,  2810,  8970,    42],
       device='cuda:0')


Training:  80%|███████▉  | 796/1000 [03:57<01:03,  3.22it/s, loss=5.1]

tensor([ 6737,    28,   339,   198,   198,   198, 46634,  5229,  8772,    42],
       device='cuda:0')
tensor([  346,   592,    30,   198,   198, 43029, 46634,  5229,  8772,    42],
       device='cuda:0')


Training:  80%|███████▉  | 797/1000 [03:58<01:02,  3.26it/s, loss=5.03]

tensor([ 314,   28, 4197,  302,  441,   28,   28,  198,  198,  198],
       device='cuda:0')
tensor([  280,   260,  1079,   302, 12471,  1238,    30,   198,   198, 43029],
       device='cuda:0')


Training:  80%|███████▉  | 798/1000 [03:58<01:02,  3.25it/s, loss=4.76]

tensor([ 552,  549,  339,  314,  100,   28,  198,  198,  198, 8292],
       device='cuda:0')
tensor([ 2945,   347,  1041,   506,  3213,    47,   198,   198, 13087,  8292],
       device='cuda:0')


Training:  80%|███████▉  | 799/1000 [03:58<01:01,  3.26it/s, loss=5.08]

tensor([ 28, 198, 540,  28, 198, 540,  28, 198, 540,  28], device='cuda:0')
tensor([   28,   787, 16981,    47,   787,  2234,    47,   787,  9060,    47],
       device='cuda:0')


Training:  80%|████████  | 800/1000 [03:59<01:00,  3.31it/s, loss=5.08]

tensor([   28,    28,  4197,    28,   260, 21723,   282,   198,   198,   198],
       device='cuda:0')
tensor([1489,  260, 5710,  282,  650, 3229,   30,  198,  198,   60],
       device='cuda:0')


Training:  80%|████████  | 801/1000 [03:59<01:00,  3.31it/s, loss=4.63]

tensor([198, 198,   0,  28,  28, 260,  28,  28, 198, 198], device='cuda:0')
tensor([  198,    64, 14765, 14104,   327, 14553,  4240,    30,   198,   198],
       device='cuda:0')


Training:  80%|████████  | 802/1000 [03:59<01:00,  3.28it/s, loss=4.69]

tensor([325, 260, 198, 198, 339,  28, 253,  28, 325, 198], device='cuda:0')
tensor([2373,   28,  198, 3528, 2161,  670,  357, 3786,   28,  339],
       device='cuda:0')


Training:  80%|████████  | 803/1000 [04:00<01:00,  3.26it/s, loss=4.8] 

tensor([  95,   28,  506,   42, 4197,   28,  198,  198,   42,  198],
       device='cuda:0')
tensor([ 277, 2574, 9289,  260, 3323,   47,  198,   63,   28,  787],
       device='cuda:0')


Training:  80%|████████  | 804/1000 [04:00<00:59,  3.29it/s, loss=4.77]

tensor([  325,   198,    42,    28,  4197,    28,   339,    28,   260, 21723],
       device='cuda:0')
tensor([ 198, 5812,  620,  260, 2455,  355, 5842,  282,  650, 4120],
       device='cuda:0')


Training:  80%|████████  | 805/1000 [04:00<00:58,  3.34it/s, loss=4.77]

tensor([  198,  4197,    28,   260, 21723,    28,   198,   314,    28,    42],
       device='cuda:0')
tensor([  504, 10243,   288,   957,  6221,    28,   451,  9154, 24626,    28],
       device='cuda:0')


Training:  81%|████████  | 806/1000 [04:00<00:58,  3.33it/s, loss=4.7] 

tensor([  95,   28,  198,  198,  198, 7113,   51, 6213, 8772,   42],
       device='cuda:0')
tensor([1119,   30,  198,  198,   70, 2113,   51, 6213, 8772,   42],
       device='cuda:0')


Training:  81%|████████  | 807/1000 [04:01<00:58,  3.28it/s, loss=4.55]

tensor([ 28,  28, 260, 339, 198, 198, 198,  28,  28, 198], device='cuda:0')
tensor([4972,  411,  338,   30,  198,  198, 9852,  323,   42,  198],
       device='cuda:0')


Training:  81%|████████  | 808/1000 [04:01<00:59,  3.23it/s, loss=4.34]

tensor([ 95,  28,  28,  30, 198, 198,  28, 541,  28, 260], device='cuda:0')
tensor([  105,  7557, 28823,    43,   198,  4809, 12724,   541,   288,  1372],
       device='cuda:0')


Training:  81%|████████  | 809/1000 [04:01<00:58,  3.29it/s, loss=5.45]

tensor([ 198,  198,   28,   30, 6737,   28,  198,  198,  339, 4197],
       device='cuda:0')
tensor([  198, 23641,  1457,   103,   402,    42,   198,  2427,   260,  3426],
       device='cuda:0')


Training:  81%|████████  | 810/1000 [04:02<00:56,  3.36it/s, loss=5.41]

tensor([  0,  28,  28, 198, 282,  29,  28,  28, 198, 198], device='cuda:0')
tensor([ 500,  441,  198, 9142, 1296,  929, 1573,   30,  198,  198],
       device='cuda:0')


Training:  81%|████████  | 811/1000 [04:02<00:55,  3.41it/s, loss=5.55]

tensor([  30,  302,  506,   28,  198, 4197,   28,  260,   28,  198],
       device='cuda:0')
tensor([ 1250, 19002,  1440,   198,   504, 17816,   282, 28676,    28,   284],
       device='cuda:0')


Training:  81%|████████  | 812/1000 [04:02<00:53,  3.50it/s, loss=5.44]

tensor([ 28,  30,  28, 198, 198, 260, 198,   0,  28,  28], device='cuda:0')
tensor([35853,  6737,    43,   198,  3280,   480, 24276, 22566,   441,   253],
       device='cuda:0')


Training:  81%|████████▏ | 813/1000 [04:02<00:53,  3.50it/s, loss=5.44]

tensor([ 28,  30, 198, 288, 198,  28, 260,  28, 198, 198], device='cuda:0')
tensor([  982,    42,   685,    28,  8913,   351, 33327,    17,   198,    49],
       device='cuda:0')


Training:  81%|████████▏ | 814/1000 [04:03<00:53,  3.49it/s, loss=5.15]

tensor([  28,   28,   30, 4083,    0, 6737,  198,  679,   28,  198],
       device='cuda:0')
tensor([ 318, 8233,  650, 5334,  478,  198,   56,  792,   17, 1209],
       device='cuda:0')


Training:  82%|████████▏ | 815/1000 [04:03<00:53,  3.45it/s, loss=5.15]

tensor([6737,   28,  260,   30,  198,  314,  260,    0,   28,  198],
       device='cuda:0')
tensor([5674,  411, 4778,   42,  451,  314, 7468,  258, 1132,  198],
       device='cuda:0')


Training:  82%|████████▏ | 816/1000 [04:03<00:55,  3.35it/s, loss=5.22]

tensor([ 42, 346,  28, 198,  28, 198,  28, 346,  28, 198], device='cuda:0')
tensor([ 9725,    95,    17, 38467,    28, 23632,  9725,    95,    30,   198],
       device='cuda:0')


Training:  82%|████████▏ | 817/1000 [04:04<00:54,  3.35it/s, loss=5.2] 

tensor([ 28,  42, 198, 198, 346, 457,  30, 555,  30, 198], device='cuda:0')
tensor([32062,    42,   198,  6121,   392,  7219,   750,  2030,    28,  4875],
       device='cuda:0')


Training:  82%|████████▏ | 818/1000 [04:04<00:52,  3.46it/s, loss=5.2]

tensor([ 4197,   198,   100,   198,   198, 41003,     0,  6737,   198,   260],
       device='cuda:0')
tensor([1248,  506, 2249,  198,   66, 2713,  478, 6737, 1523,  260],
       device='cuda:0')


Training:  82%|████████▏ | 819/1000 [04:04<00:52,  3.46it/s, loss=5.2]

tensor([ 30,  28,  28, 346, 260, 555,  30, 198, 198,  28], device='cuda:0')
tensor([8427,  346,  536, 1363,  253, 1945,   28,  198, 3528,  946],
       device='cuda:0')


Training:  82%|████████▏ | 820/1000 [04:05<00:53,  3.39it/s, loss=5.27]

tensor([  30,  314, 6737,  198,  325, 4197,  282,  260,  555,  198],
       device='cuda:0')
tensor([  384,  1477,   198,  2068,   260, 28382,   282,   253,   555,    28],
       device='cuda:0')


Training:  82%|████████▏ | 821/1000 [04:05<00:52,  3.43it/s, loss=5.27]

tensor([2810, 6426, 6426,   42,  198,  198, 4083,   28,   28,  198],
       device='cuda:0')
tensor([   55,  4192,  7854,    42,   198, 11952, 23271,  8710,    17,   492],
       device='cuda:0')


Training:  82%|████████▏ | 822/1000 [04:05<00:52,  3.41it/s, loss=5.16]

tensor([198, 198, 198, 198,  28,  42, 198, 198,  28, 198], device='cuda:0')
tensor([   47,   198,   198,  5345, 22152,    42,   198,  5230,    28,  5277],
       device='cuda:0')


Training:  82%|████████▏ | 823/1000 [04:06<00:51,  3.46it/s, loss=4.96]

tensor([198, 260,  28, 198,   0,  42,  28, 198, 198, 325], device='cuda:0')
tensor([  351, 30291,   198,    86, 27045, 26927,    28,   198,  9389,  1928],
       device='cuda:0')


Training:  82%|████████▏ | 824/1000 [04:06<00:50,  3.50it/s, loss=5.1] 

tensor([   28,   198,   260,   457,   346,    28,    28,   198, 20055,    28],
       device='cuda:0')
tensor([   43,   327,   339,   536, 13735, 20322,   198,    71, 13231,   670],
       device='cuda:0')


Training:  82%|████████▎ | 825/1000 [04:06<00:50,  3.48it/s, loss=5.1]

tensor([ 28,  30, 198, 260, 339, 457, 325,  28, 198, 198], device='cuda:0')
tensor([ 1450,    28,   351,  5337,   392,   654, 24575,    28,   198,  2193],
       device='cuda:0')


Training:  83%|████████▎ | 826/1000 [04:06<00:51,  3.40it/s, loss=4.72]

tensor([29086,    28,   198,   198,  6737,    28,   198,    28,   198,   198],
       device='cuda:0')
tensor([ 274,   28,  198,   96, 8854,   28, 9869,   28, 1028,  105],
       device='cuda:0')


Training:  83%|████████▎ | 827/1000 [04:07<00:51,  3.35it/s, loss=5.39]

tensor([ 28, 253,  28,  28, 198,   0,  42,  28, 260, 260], device='cuda:0')
tensor([  457,   719,   588,   198,    86, 21482,  3447,   327,   511,   260],
       device='cuda:0')


Training:  83%|████████▎ | 828/1000 [04:07<00:50,  3.38it/s, loss=5.16]

tensor([  325,   198, 15604,   198,   338,   314,   253,    28,   198,   198],
       device='cuda:0')
tensor([  198,    60,   602,   967,   384, 26365,  3763,    30,   198,   198],
       device='cuda:0')


Training:  83%|████████▎ | 829/1000 [04:07<00:49,  3.44it/s, loss=5.16]

tensor([ 28, 198, 198, 260,  28, 198, 198, 198,  42, 268], device='cuda:0')
tensor([   28,   198, 38984,   346,    28,   198,  6101,   298,  2679,   268],
       device='cuda:0')


Training:  83%|████████▎ | 830/1000 [04:08<00:48,  3.48it/s, loss=4.83]

tensor([   42,    42,   198,   198,   549,  2286, 23944,    28,   198,   198],
       device='cuda:0')
tensor([ 1508,    42,   198, 12800,  4618,  2286, 23944,    30,   198,   198],
       device='cuda:0')


Training:  83%|████████▎ | 831/1000 [04:08<00:48,  3.51it/s, loss=4.71]

tensor([4501, 2810, 8772,   42,  198,  198,  346,  260,   28,  198],
       device='cuda:0')
tensor([ 7113,  2810,  4501,    42,   198, 16817,  1980,   549,    17, 12193],
       device='cuda:0')


Training:  83%|████████▎ | 832/1000 [04:08<00:47,  3.52it/s, loss=4.93]

tensor([  42,   28,   30,  198,  198,   28, 6737,  260, 4197,   28],
       device='cuda:0')
tensor([  482,  1163,    28,   198,  9302,  1185,   288,   260, 44890,    29],
       device='cuda:0')


Training:  83%|████████▎ | 833/1000 [04:08<00:47,  3.50it/s, loss=4.93]

tensor([4628, 4501,   42,  198,  198,   28,  198,  260,   30,   28],
       device='cuda:0')
tensor([ 4628,  4501,    42,   198, 45496,    28, 20172,   468,   441,    30],
       device='cuda:0')


Training:  83%|████████▎ | 834/1000 [04:09<00:48,  3.45it/s, loss=4.63]

tensor([2680,   57, 4501,   42,  198,  198, 4197,   28,  441,   28],
       device='cuda:0')
tensor([ 2113,    57,  4501,    42,   198,   504,   701,   359, 23909,    43],
       device='cuda:0')


Training:  84%|████████▎ | 835/1000 [04:09<00:48,  3.39it/s, loss=4.88]

tensor([  28,   28,    0, 6737,  260,  288,  198, 4197,   28,  198],
       device='cuda:0')
tensor([ 1072,   510, 46823,   351,  1728,    28,   260,   550,   198, 20181],
       device='cuda:0')


Training:  84%|████████▎ | 836/1000 [04:09<00:49,  3.31it/s, loss=4.54]

tensor([   28,    28,    30,   198,   198,   198, 23317,    57,  4501,    42],
       device='cuda:0')
tensor([  346,  7270,    30,   198,   198,  4105, 23317,    57,  4501,    42],
       device='cuda:0')


Training:  84%|████████▎ | 837/1000 [04:10<00:49,  3.30it/s, loss=4.61]

tensor([ 198,  198,  198,  198, 3020, 2113,   57, 4501,   42,  198],
       device='cuda:0')
tensor([ 423,  198,  198,   67, 3020, 2113,   57, 4501,   42,  198],
       device='cuda:0')


Training:  84%|████████▍ | 838/1000 [04:10<00:47,  3.38it/s, loss=4.8] 

tensor([   28,   260,   198,     0,    28,   260, 34818,    30,   198,   198],
       device='cuda:0')
tensor([5344,  253, 3561,  839, 1980,  637,  391,   28,  198, 2193],
       device='cuda:0')


Training:  84%|████████▍ | 839/1000 [04:10<00:47,  3.39it/s, loss=4.8]

tensor([ 28, 339, 457, 253,  42,  28, 198, 549,  28, 260], device='cuda:0')
tensor([  338,   392,   457, 13784,  1915,   198,  7143,  1187,   411,   260],
       device='cuda:0')


Training:  84%|████████▍ | 840/1000 [04:11<00:46,  3.42it/s, loss=4.9] 

tensor([ 28, 260,  28,  28, 198, 260, 198, 260, 198, 198], device='cuda:0')
tensor([  281,  1272,  5721,    28,   429,   655,   288,   655,   198, 38110],
       device='cuda:0')


Training:  84%|████████▍ | 841/1000 [04:11<00:46,  3.41it/s, loss=4.9]

tensor([ 325,   28,  260,  198,   42,  260,   28, 4197,  282,   28],
       device='cuda:0')
tensor([ 441, 5091,  198,   63,   23,  259,  260, 4749,  905,  288],
       device='cuda:0')


Training:  84%|████████▍ | 842/1000 [04:11<00:46,  3.36it/s, loss=4.46]

tensor([  42,  198,  198, 4197,   28,  381,  198,  260,  314,  260],
       device='cuda:0')
tensor([  42,  198,  504, 1085,  754, 9699,  282,  357,  314, 1896],
       device='cuda:0')


Training:  84%|████████▍ | 843/1000 [04:11<00:46,  3.36it/s, loss=4.81]

tensor([   28,   198,    28,   325,    28,   198,     0, 29086,    28,   198],
       device='cuda:0')
tensor([   28,   346,   523,   441,    47,   377,   608, 29086,    28,  2505],
       device='cuda:0')


Training:  84%|████████▍ | 844/1000 [04:12<00:46,  3.37it/s, loss=4.89]

tensor([  346,    28,    28,   260,   198,   198,  4192,   198, 37184,    28],
       device='cuda:0')
tensor([ 2745,  3895,   351,  3878,    30,   408, 19550,  2828, 37184,    28],
       device='cuda:0')


Training:  84%|████████▍ | 845/1000 [04:12<00:45,  3.39it/s, loss=4.89]

tensor([198, 198,  28,  28, 198, 253,   0,  28,  28,  28], device='cuda:0')
tensor([ 198, 7993, 5612,   28,  702,  345,  370,  990, 5249,   28],
       device='cuda:0')


Training:  85%|████████▍ | 846/1000 [04:12<00:45,  3.38it/s, loss=4.63]

tensor([  28,  198,  198,  198,  198, 9333,   42,  198,  198,   28],
       device='cuda:0')
tensor([ 1378,    30,   198,   198, 38371,  9333,    42,   198, 10539,    28],
       device='cuda:0')


Training:  85%|████████▍ | 847/1000 [04:12<00:44,  3.41it/s, loss=4.63]

tensor([  28,  198,   28,   28,  198,  198,  198, 3020, 2113,   57],
       device='cuda:0')
tensor([  28,  732, 1745,   47,  198,  198,   67, 3020, 2113,   57],
       device='cuda:0')


Training:  85%|████████▍ | 848/1000 [04:13<00:44,  3.41it/s, loss=4.74]

tensor([ 288,   28,  198,  198, 6737,  260,   28,  253,  198,  198],
       device='cuda:0')
tensor([2428,   42,  650, 2988,  288,  549,  436,   28,  198, 3681],
       device='cuda:0')


Training:  85%|████████▍ | 849/1000 [04:13<00:44,  3.42it/s, loss=4.96]

tensor([198, 198, 457, 441, 198,  42, 198, 441,  28, 198], device='cuda:0')
tensor([ 198, 2683,  359,  253, 4469,   28,  359,  346,   47,  198],
       device='cuda:0')


Training:  85%|████████▌ | 850/1000 [04:13<00:42,  3.50it/s, loss=5.11]

tensor([ 28, 325,  28, 282, 198, 260,  28,  28,  28, 198], device='cuda:0')
tensor([ 2526,  1209,  2630,    43,   288, 10297,  1272,   805,   198,  5195],
       device='cuda:0')


Training:  85%|████████▌ | 851/1000 [04:14<00:42,  3.54it/s, loss=5.11]

tensor([  28,   28,  198,   28,  260,  198,  457, 6737,  198,  260],
       device='cuda:0')
tensor([3032,  198, 5195,  511,  564,  392, 2815,   43,  327,  638],
       device='cuda:0')


Training:  85%|████████▌ | 852/1000 [04:14<00:41,  3.58it/s, loss=4.85]

tensor([ 198,  198, 2680,   28,   28,  260,   28,  198,  198,   28],
       device='cuda:0')
tensor([  198,    61,   462,  3497,   288, 16945,  8047,    30,  1249,    28],
       device='cuda:0')


Training:  85%|████████▌ | 853/1000 [04:14<00:41,  3.58it/s, loss=4.85]

tensor([6737,   28,  198,  198,   95,   28,  260,   28,  198,  198],
       device='cuda:0')
tensor([6737,  284,  198, 4038, 1390,  288, 1972, 1092,  260,  701],
       device='cuda:0')


Training:  85%|████████▌ | 854/1000 [04:14<00:42,  3.44it/s, loss=4.87]

tensor([ 6737,    28,    28,   198,   198,  4501,    28, 34818,   198,    28],
       device='cuda:0')
tensor([ 2240,  7268,    28,   198,    57,  1643,   637, 15006,  2240,  7904],
       device='cuda:0')


Training:  86%|████████▌ | 855/1000 [04:15<00:41,  3.49it/s, loss=4.95]

tensor([ 325,  253,  198,   28,  198,  198,  260, 6737,  198,   28],
       device='cuda:0')
tensor([ 6824,   253, 26156,    28,   198,  2068,  7471,   623,  3506,   876],
       device='cuda:0')


Training:  86%|████████▌ | 856/1000 [04:15<00:40,  3.52it/s, loss=5.04]

tensor([ 30, 260,  28, 198, 198, 346,  28, 441, 198, 198], device='cuda:0')
tensor([  288, 10419,    28,   198,  1937, 10419,   523,  1188,   260,  1386],
       device='cuda:0')


Training:  86%|████████▌ | 857/1000 [04:15<00:40,  3.51it/s, loss=5.04]

tensor([ 28, 198, 260, 506, 100,  28, 198,  28, 198, 198], device='cuda:0')
tensor([   28,   327,  1839,   506, 15215,    28,  8913,    28,   284,  7241],
       device='cuda:0')


Training:  86%|████████▌ | 858/1000 [04:16<00:41,  3.45it/s, loss=4.89]

tensor([41003,   198,   198,  6213,    42,    28,   462,   482,    28,   198],
       device='cuda:0')
tensor([   42,   198,    51, 10942,   441,   295,   672,   482,    28,  3506],
       device='cuda:0')


Training:  86%|████████▌ | 859/1000 [04:16<00:41,  3.41it/s, loss=4.87]

tensor([  28,  198,  288,   30,  260,   28,  198,  198,  198, 6426],
       device='cuda:0')
tensor([  28,  685, 1683,  351,  549,   30,  198,  198,   55, 6426],
       device='cuda:0')


Training:  86%|████████▌ | 860/1000 [04:16<00:40,  3.47it/s, loss=4.91]

tensor([6189,  198,  198,  198,  339,  457,  325, 2494, 6737,  260],
       device='cuda:0')
tensor([1970,   30,  198, 8653,  339, 2526,  832, 2494,  284, 3287],
       device='cuda:0')


Training:  86%|████████▌ | 861/1000 [04:17<00:39,  3.53it/s, loss=4.92]

tensor([   0,  549,  198,  198,  198,  198, 2680, 2680,   55, 6426],
       device='cuda:0')
tensor([44228,    17,   423,   198,   198, 41074,  2680, 35135,    55,  2285],
       device='cuda:0')


Training:  86%|████████▌ | 862/1000 [04:17<00:38,  3.57it/s, loss=4.76]

tensor([ 381,   28,  198,  198,   28,  198,   28, 2608,   28,  260],
       device='cuda:0')
tensor([  381,    28,   198,  3528,  1188,  2276,   635,  5585,   327, 13987],
       device='cuda:0')


Training:  86%|████████▋ | 863/1000 [04:17<00:38,  3.60it/s, loss=4.94]

tensor([   28,   260,    28,   198,   198,   198,  2680,  2680, 25630,  4210],
       device='cuda:0')
tensor([  288,   874,    30,   198,   198, 41074,  2680, 19642, 25630,  5431],
       device='cuda:0')


Training:  86%|████████▋ | 864/1000 [04:17<00:37,  3.61it/s, loss=5.08]

tensor([198, 288,  28, 260,  28, 198, 198,  28, 198,  28], device='cuda:0')
tensor([4888, 6737,  990, 1029,   43,  198,   63,   28,  965, 2585],
       device='cuda:0')


Training:  86%|████████▋ | 865/1000 [04:18<00:37,  3.60it/s, loss=5.08]

tensor([  198,    28,   549,    28,   198, 34818,   253,   555,   381,    28],
       device='cuda:0')
tensor([ 6836, 29820,  1272,    42,   637, 34818,   253,   754, 10934, 15786],
       device='cuda:0')


Training:  87%|████████▋ | 866/1000 [04:18<00:37,  3.57it/s, loss=4.66]

tensor([   28, 21723,    28,   198,    28,    28,    28,   198,   198,  4501],
       device='cuda:0')
tensor([  957,  5717,    28, 13735,   441,   549,    43,   198,    57,   744],
       device='cuda:0')


Training:  87%|████████▋ | 867/1000 [04:18<00:36,  3.61it/s, loss=4.83]

tensor([   71, 20055,    42,    42,   198,   198,    28,   198,   572,    28],
       device='cuda:0')
tensor([   71, 20055,  9536,    42,   198,  2696,    28,  4074,   572, 45927],
       device='cuda:0')


Training:  87%|████████▋ | 868/1000 [04:18<00:36,  3.62it/s, loss=4.89]

tensor([ 339, 4083,   28,  282,  260, 4083, 6737,   93,    0,  282],
       device='cuda:0')
tensor([ 469, 1076, 1191,  355,  469, 7003,   29,  102,  520,  781],
       device='cuda:0')


Training:  87%|████████▋ | 869/1000 [04:19<00:36,  3.63it/s, loss=4.79]

tensor([6737, 4083,   28,   28,   28,  198,  198,  198,   69, 9620],
       device='cuda:0')
tensor([  653, 14119, 21723, 48384,    17,   198,   198,    52,    69,  5229],
       device='cuda:0')


Training:  87%|████████▋ | 870/1000 [04:19<00:36,  3.59it/s, loss=4.79]

tensor([ 28,  28, 260,  28, 282, 506, 100,  28, 198, 198], device='cuda:0')
tensor([2205,  282, 1123, 3261, 9446,  506, 2112,   47,  198,  198],
       device='cuda:0')


Training:  87%|████████▋ | 871/1000 [04:19<00:36,  3.52it/s, loss=4.81]

tensor([  198,   198,    69,  9620,  8772,   198,   718, 25089,    42,   198],
       device='cuda:0')
tensor([  198,    52,    69,  5229,  9248,  6016,   718, 25089,    42,   198],
       device='cuda:0')


Training:  87%|████████▋ | 872/1000 [04:20<00:36,  3.53it/s, loss=4.81]

tensor([506, 198, 198,  28, 282, 346, 457, 260, 260,  28], device='cuda:0')
tensor([   30,   198,    63,   926,   457,   339,  3984,   282, 21683,  1800],
       device='cuda:0')


Training:  87%|████████▋ | 873/1000 [04:20<00:36,  3.51it/s, loss=4.67]

tensor([21723,    28,    28,   260,    30,    28,   198,   198, 21723,    28],
       device='cuda:0')
tensor([22534, 48384,    23, 12575, 15534,    42,   198,  5965,  5295,   332],
       device='cuda:0')


Training:  87%|████████▋ | 874/1000 [04:20<00:35,  3.53it/s, loss=4.67]

tensor([ 198, 2097,   42,   42,  198,  198,  253,   28,  720,   28],
       device='cuda:0')
tensor([39813,  2097, 25466,    42,   198,  5315,  1123,  2552,  5612,   288],
       device='cuda:0')


Training:  88%|████████▊ | 875/1000 [04:20<00:35,  3.53it/s, loss=4.67]

tensor([ 6737,    28, 21723,    28,    28,   198,   198,   198, 41003,     0],
       device='cuda:0')
tensor([  258,   469, 31923,   974,    30,   198,   198,    66,  2062,  7430],
       device='cuda:0')


Training:  88%|████████▊ | 876/1000 [04:21<00:35,  3.49it/s, loss=4.62]

tensor([   28,   441,     0,    28, 21723,    28,   260,    28,  6737,   198],
       device='cuda:0')
tensor([  536, 14053,   609,   957,  2112,   351, 24478,   621, 13056,   198],
       device='cuda:0')


Training:  88%|████████▊ | 877/1000 [04:21<00:35,  3.46it/s, loss=5]   

tensor([   28,    28,   198,   198,  4197,    28,   260,   506,   260, 21723],
       device='cuda:0')
tensor([1671,   28,  198,  504, 4033,  282, 3996,  284,  653, 6349],
       device='cuda:0')


Training:  88%|████████▊ | 878/1000 [04:21<00:34,  3.56it/s, loss=4.88]

tensor([ 3609,   260, 21723,   282,    28,   198,   198,   253,  4197,    28],
       device='cuda:0')
tensor([23487,   469,  1761, 12630,    28,   198, 16721,   260,  1048,  2359],
       device='cuda:0')


Training:  88%|████████▊ | 879/1000 [04:22<00:33,  3.60it/s, loss=5.15]

tensor([  42,  198,  198, 4501,  325,  441, 1992,  322,  346,   28],
       device='cuda:0')
tensor([   42,   198,    57,  2161,   536, 13168,  1992,   322,   346, 44228],
       device='cuda:0')


Training:  88%|████████▊ | 880/1000 [04:22<00:33,  3.62it/s, loss=5.15]

tensor([  984,   198,   198,   198, 15604,  8242,  4728,   198,   679,    42],
       device='cuda:0')
tensor([   30,   198,   198,    50, 15604,    59,  4728,    56,  4153,    42],
       device='cuda:0')


Training:  88%|████████▊ | 881/1000 [04:22<00:33,  3.54it/s, loss=4.54]

tensor([  42,   28,   28,  260, 4197,   28,  198,  198,  279,   28],
       device='cuda:0')
tensor([ 2767, 20322,   418,   260, 29289,    43,   198,  1082,   105,  3379],
       device='cuda:0')


Training:  88%|████████▊ | 882/1000 [04:22<00:33,  3.49it/s, loss=4.71]

tensor([549,  28,  28,  28, 198, 198, 325,  93,   0,  28], device='cuda:0')
tensor([ 1771,   368,  3497,    42,   198, 12192,    29, 23724, 45927,  8759],
       device='cuda:0')


Training:  88%|████████▊ | 883/1000 [04:23<00:33,  3.45it/s, loss=5.05]

tensor([  28,  339,   28,  441,  339,  540,  198,  198,  198, 4728],
       device='cuda:0')
tensor([1991,  346,  523,  355,  787,   30,  198,  198,   59, 4728],
       device='cuda:0')


Training:  88%|████████▊ | 884/1000 [04:23<00:32,  3.53it/s, loss=5.09]

tensor([   28,   198,   198, 20055,    28,   100,    28,   198,    28,   100],
       device='cuda:0')
tensor([   28,   198,    71,  3592,   506,  6621,    28,   905,   506, 15786],
       device='cuda:0')


Training:  88%|████████▊ | 885/1000 [04:23<00:32,  3.54it/s, loss=5.09]

tensor([6737,  552,   42,  198,  198,   28,  549,   28,  339,  260],
       device='cuda:0')
tensor([   88,   552,    28,   198,  3528,  3727,   549,   638,   288, 27397],
       device='cuda:0')


Training:  89%|████████▊ | 886/1000 [04:24<00:32,  3.50it/s, loss=4.47]

tensor([  288,   540,    28,    28,   260,  4197, 21723,    28,   198,   339],
       device='cuda:0')
tensor([  787,   540, 11393,   282,   260,  9202,  1670,   198,  2193, 20322],
       device='cuda:0')


Training:  89%|████████▊ | 887/1000 [04:24<00:32,  3.51it/s, loss=4.8] 

tensor([21723,   282,   282,    28,    28,    28,   260,    28,   198,   198],
       device='cuda:0')
tensor([16087,   358,  3075,   874,   441,   288,  2606,    28,   198, 29752],
       device='cuda:0')


Training:  89%|████████▉ | 888/1000 [04:24<00:32,  3.49it/s, loss=4.8]

tensor([   42,    28,   198,   260, 21723,    28,   198,   198,   198,  4728],
       device='cuda:0')
tensor([10278,    28,   281,   480, 14553,    30,   198,   198,    59,  4728],
       device='cuda:0')


Training:  89%|████████▉ | 889/1000 [04:24<00:31,  3.48it/s, loss=4.62]

tensor([  28,  198,  198,  198, 2097,   42,   42,  198,  198,   28],
       device='cuda:0')
tensor([   47,   198,   198, 39813,  2097, 25466,    42,   198,  5345,    28],
       device='cuda:0')


Training:  89%|████████▉ | 890/1000 [04:25<00:31,  3.51it/s, loss=5.14]

tensor([ 28, 198,  28, 260, 198,  28, 457, 441,  28, 260], device='cuda:0')
tensor([   28, 14067,   335,    28,  1675,   392,   359,   614,   281,  6548],
       device='cuda:0')


Training:  89%|████████▉ | 891/1000 [04:25<00:31,  3.50it/s, loss=5.14]

tensor([  549,   260,   260,   198, 21723, 21723,    28,   198,   198,   198],
       device='cuda:0')
tensor([  284,  1188,    28,   957, 47910, 21723,    30,   198,   198,    59],
       device='cuda:0')


Training:  89%|████████▉ | 892/1000 [04:25<00:30,  3.49it/s, loss=5]   

tensor([   0,  288,  260, 4197,   28,   28,  198, 6213,  260,   28],
       device='cuda:0')
tensor([18266,   284,   260, 27508,   655,   198,    51,  7380,   767,   260],
       device='cuda:0')


Training:  89%|████████▉ | 893/1000 [04:26<00:30,  3.51it/s, loss=4.84]

tensor([ 28, 198, 338, 339, 457,  30, 260, 198, 339, 325], device='cuda:0')
tensor([   28,   967,  5337,   392,  3568,  1523,   198, 39855,  2161,   457],
       device='cuda:0')


Training:  89%|████████▉ | 894/1000 [04:26<00:29,  3.56it/s, loss=5.15]

tensor([  28,    0,   28,  198,  339,  457, 2593, 6737,   28,   28],
       device='cuda:0')
tensor([  267, 30250,   198,  5519,   339,   475,  2593,  6737,   767,    28],
       device='cuda:0')


Training:  90%|████████▉ | 895/1000 [04:26<00:28,  3.62it/s, loss=5.25]

tensor([  260,   260,    28,   260,   288,   260, 21723,    28,    28,   198],
       device='cuda:0')
tensor([ 4145, 46160,   284, 42521,   288,   957,  1904,  4778,    43,   198],
       device='cuda:0')


Training:  90%|████████▉ | 896/1000 [04:26<00:28,  3.68it/s, loss=5.17]

tensor([   0,   28,  555,   28,   28,  198,  288,  260, 4197,  282],
       device='cuda:0')
tensor([ 7576,   253, 10001, 29562,   198,  8155,   281,   260,  1450,  1670],
       device='cuda:0')


Training:  90%|████████▉ | 897/1000 [04:27<00:27,  3.69it/s, loss=5.15]

tensor([   0,  198,  198,   28,   28, 6737,  339,  260, 6737,   28],
       device='cuda:0')
tensor([   28,   198, 25885, 26429, 12101,   638,   288, 47850, 20322,    42],
       device='cuda:0')


Training:  90%|████████▉ | 898/1000 [04:27<00:27,  3.71it/s, loss=5.3] 

tensor([6737,   28,  198, 6737,   28,  260,   28,  441,  253,   28],
       device='cuda:0')
tensor([ 372,  323,   82, 1484,  284, 7576,  359,  702,  827, 1800],
       device='cuda:0')


Training:  90%|████████▉ | 899/1000 [04:27<00:27,  3.70it/s, loss=5.1]

tensor([ 28,  28,  28, 198, 260, 282, 260,  28, 198, 198], device='cuda:0')
tensor([3726, 4157,   28, 1980, 1556,  282, 1029,   28,  198,   68],
       device='cuda:0')


Training:  90%|█████████ | 900/1000 [04:27<00:27,  3.67it/s, loss=4.97]

tensor([21723,   506,   325,   260, 21723,    28,   198,   198,   198,  4728],
       device='cuda:0')
tensor([10054,  2526,  4571,   957,  6943,    30,   198,   198,    59,  4728],
       device='cuda:0')


Training:  90%|█████████ | 901/1000 [04:28<00:27,  3.65it/s, loss=5.27]

tensor([6737,   28,  339,   28,   28,   28,  260,   28,  198,  198],
       device='cuda:0')
tensor([20641,   645,   346,   599,   277,   351,  1272,    47,   198,   198],
       device='cuda:0')


Training:  90%|█████████ | 902/1000 [04:28<00:27,  3.62it/s, loss=5.27]

tensor([6737,   28,  288,  789,  339,  260, 4197,   42,  198,  198],
       device='cuda:0')
tensor([ 7501,  1114,   287,   789, 43228,   260, 31049,    42,   198, 21553],
       device='cuda:0')


Training:  90%|█████████ | 903/1000 [04:28<00:27,  3.57it/s, loss=4.98]

tensor([ 984,    0, 6737,   28,  282,  260,   28,   28,   28,   42],
       device='cuda:0')
tensor([  252,  1207,  6737,   578,   284, 43537,   318,  1076, 35216,    42],
       device='cuda:0')


Training:  90%|█████████ | 904/1000 [04:29<00:26,  3.57it/s, loss=5.15]

tensor([257,  28, 198, 198, 325, 260,  28,  28, 198, 260], device='cuda:0')
tensor([ 318, 4699,  198, 2068,  963,  451, 1861,   30, 1626,   29],
       device='cuda:0')


Training:  90%|█████████ | 905/1000 [04:29<00:26,  3.60it/s, loss=5.03]

tensor([ 93, 552,  28, 282, 282, 198, 339, 260, 555,  28], device='cuda:0')
tensor([   88,  9535,  2751,  1352,   198,  3528,  7471,   253, 40590, 27044],
       device='cuda:0')


Training:  91%|█████████ | 906/1000 [04:29<00:25,  3.62it/s, loss=4.98]

tensor([ 93,  28, 198, 339, 457, 288, 260, 198, 457, 253], device='cuda:0')
tensor([1222,   28,  347,  339, 2422,  411,   28,  339, 1217,  665],
       device='cuda:0')


Training:  91%|█████████ | 907/1000 [04:29<00:25,  3.61it/s, loss=4.84]

tensor([   42,    28,   198,    28,   198,   198,   198,  2680, 24625, 17376],
       device='cuda:0')
tensor([  589,    28,  7706,    47,   198,   198,    56,  2680, 24625, 33702],
       device='cuda:0')


Training:  91%|█████████ | 908/1000 [04:30<00:25,  3.61it/s, loss=5.02]

tensor([  28,  198,  198,  457,  441,  555,   28,  198, 8292,  290],
       device='cuda:0')
tensor([   30,   198,  2683,   457,   253,  4132,    28,   330, 13468,   290],
       device='cuda:0')


Training:  91%|█████████ | 909/1000 [04:30<00:24,  3.65it/s, loss=5.2] 

tensor([ 198,  198,  260, 4197,   28,  282,  260, 4314, 6737,   28],
       device='cuda:0')
tensor([  198, 28925,   260, 14568,  6956,   282,   278,  4314,   358,    43],
       device='cuda:0')


Training:  91%|█████████ | 910/1000 [04:30<00:24,  3.69it/s, loss=5.2]

tensor([  0,  28,  28,  28, 260,  28, 260,  28, 198, 198], device='cuda:0')
tensor([ 1319,   549,  3287,   282,  3878,   564, 20727,    30,   198,  2705],
       device='cuda:0')


Training:  91%|█████████ | 911/1000 [04:31<00:23,  3.73it/s, loss=4.98]

tensor([6737,   28,  260,   28,  198,  339,  339,   28,  282,  260],
       device='cuda:0')
tensor([1038,  564, 2112,  198, 3528,  338, 1165, 1743,  282,  260],
       device='cuda:0')


Training:  91%|█████████ | 912/1000 [04:31<00:23,  3.72it/s, loss=5.33]

tensor([  198,   198,   198,  2680, 24625,   389,  7113,  4728,   428,  4501],
       device='cuda:0')
tensor([  198,   198,    56,  2680, 24625,   389,  7113,  4728,    50,  3911],
       device='cuda:0')


Training:  91%|█████████▏| 913/1000 [04:31<00:23,  3.70it/s, loss=4.83]

tensor([   28,   198,   260, 21723,    28,   198,   325, 21723,   198,   260],
       device='cuda:0')
tensor([   28,   284,   650,  1911,   198,  2068, 17831,  2754,   282,   469],
       device='cuda:0')


Training:  91%|█████████▏| 914/1000 [04:31<00:23,  3.70it/s, loss=5.02]

tensor([198, 457, 325, 549, 198, 260, 288,  28, 198, 198], device='cuda:0')
tensor([ 339, 3060, 1928,   28,  284, 6819, 1147,   43,  198, 2193],
       device='cuda:0')


Training:  92%|█████████▏| 915/1000 [04:32<00:23,  3.66it/s, loss=4.87]

tensor([ 198,  198,  198,   28,   28,  198,  441,  260, 4197,   28],
       device='cuda:0')
tensor([   28,   198, 22204,  3930,    28,   359,   511,   260,  2321,  8770],
       device='cuda:0')


Training:  92%|█████████▏| 916/1000 [04:32<00:22,  3.65it/s, loss=5.07]

tensor([   0,   28,  260,   28,  198,  271,  339,   28, 4197,  361],
       device='cuda:0')
tensor([  524,   284, 23414,   198,    68,   388, 17072,   260,  8423,    29],
       device='cuda:0')


Training:  92%|█████████▏| 917/1000 [04:32<00:22,  3.68it/s, loss=4.94]

tensor([ 198,  381,   42,  198,   28,  339,   28,  441,  260, 4197],
       device='cuda:0')
tensor([  60, 4768,   28,  346,  338, 1535,  359,  656,  653, 8641],
       device='cuda:0')


Training:  92%|█████████▏| 918/1000 [04:32<00:22,  3.61it/s, loss=4.94]

tensor([   28, 21723,    28,   100,   198,   198,   325,    42,     0,   198],
       device='cuda:0')
tensor([ 957, 8664,  506, 9596,  198, 2068, 3423,   85,  936,  260],
       device='cuda:0')


Training:  92%|█████████▏| 919/1000 [04:33<00:23,  3.51it/s, loss=5.02]

tensor([  282,   441,    28,   260,   198,   198, 21723,    28,   198,     0],
       device='cuda:0')
tensor([ 536,  441,  963,   28,  198, 5965, 3506, 8739,  267, 1878],
       device='cuda:0')


Training:  92%|█████████▏| 920/1000 [04:33<00:22,  3.49it/s, loss=5.02]

tensor([ 42, 198, 198,  28, 549, 457,  28, 198, 260,  28], device='cuda:0')
tensor([   43,   198, 14344,  1928,   339,  8949,    28,   284,  3310,  1188],
       device='cuda:0')


Training:  92%|█████████▏| 921/1000 [04:33<00:23,  3.43it/s, loss=4.57]

tensor([ 28, 198, 198,  28, 198, 260,  28,  28, 198, 260], device='cuda:0')
tensor([   30,   198,  2696,    28,   411,  8949, 19461,    28,   411,   957],
       device='cuda:0')


Training:  92%|█████████▏| 922/1000 [04:34<00:22,  3.41it/s, loss=4.79]

tensor([   28,   198,   339,    28,   198,   100,    28,   198, 21723,    28],
       device='cuda:0')
tensor([   28,   347, 17072,  1012,   506,   100,    28, 13987,  8135,  1301],
       device='cuda:0')


Training:  92%|█████████▏| 923/1000 [04:34<00:22,  3.49it/s, loss=5.08]

tensor([3911, 4728,  428, 3911, 9620,   42,  198,  198,  260,  260],
       device='cuda:0')
tensor([7113, 4728,   50, 3911, 9620,   42,  198, 3825,  511,  957],
       device='cuda:0')


Training:  92%|█████████▏| 924/1000 [04:34<00:21,  3.51it/s, loss=5.08]

tensor([314, 540,  28, 325, 260, 339, 339,  28,  28, 198], device='cuda:0')
tensor([ 787,  555, 2093, 2216,  564,  338, 8177, 2767,  198, 5195],
       device='cuda:0')


Training:  92%|█████████▎| 925/1000 [04:34<00:21,  3.44it/s, loss=4.87]

tensor([   0, 6737,  339,  441,   28,  282,  198,  198,  339,  441],
       device='cuda:0')
tensor([ 2164,   338,   536,  8018,   737,    28,   198, 27737,   536,   339],
       device='cuda:0')


Training:  93%|█████████▎| 926/1000 [04:35<00:22,  3.32it/s, loss=5.02]

tensor([   28,   549,   260, 21723,    28,   198,   198,   260,   282,   260],
       device='cuda:0')
tensor([ 5852,   429,   469, 15062,    28,   198,  3280,  1556,   282, 19888],
       device='cuda:0')


Training:  93%|█████████▎| 927/1000 [04:35<00:22,  3.31it/s, loss=5.02]

tensor([282,  28, 198, 198,  42,  30, 260, 549, 198, 284], device='cuda:0')
tensor([ 2717,    17,   198, 11114,  1063,   282,  1022,    28,  5681,  7864],
       device='cuda:0')


Training:  93%|█████████▎| 928/1000 [04:35<00:21,  3.33it/s, loss=5.02]

tensor([ 28, 325, 441, 198, 198,  28, 260,  28, 282, 198], device='cuda:0')
tensor([ 3786,   325,    42,   198, 14413,   335,   540,  1737,    28,   282],
       device='cuda:0')


Training:  93%|█████████▎| 929/1000 [04:36<00:21,  3.33it/s, loss=5.01]

tensor([   42,    28,   198,   198,   198, 20055,   288,   260,  4197,    28],
       device='cuda:0')
tensor([17264,    17,   423,   198,    71,   518,   282,   354,  1850,    42],
       device='cuda:0')


Training:  93%|█████████▎| 930/1000 [04:36<00:20,  3.34it/s, loss=5]   

tensor([  0, 253, 282,   0,  28, 198, 198, 457, 325, 339], device='cuda:0')
tensor([  715, 11415,  1088,   383,    42,   198,  1882,  3060,   457,   787],
       device='cuda:0')


Training:  93%|█████████▎| 931/1000 [04:36<00:20,  3.36it/s, loss=5.01]

tensor([ 28,  28, 260, 260, 198, 198, 260,  28,  28, 260], device='cuda:0')
tensor([  808,   288,  7012,    28,   198, 19671,   601,  1554,   282,  1123],
       device='cuda:0')


Training:  93%|█████████▎| 932/1000 [04:36<00:19,  3.40it/s, loss=5.01]

tensor([ 339,  346,   28,   28,  198,  198, 6737, 4197,   28,   28],
       device='cuda:0')
tensor([  585, 17072,  2265,    28,   198, 21175,   253,  3506,  4313,   284],
       device='cuda:0')


Training:  93%|█████████▎| 933/1000 [04:37<00:19,  3.42it/s, loss=5.11]

tensor([ 28,  42,  28,  28, 198, 198,  95,   0, 198,  42], device='cuda:0')
tensor([   90,   424,  1147,    30,   198, 46292,    95,    17,   948,  1970],
       device='cuda:0')


Training:  93%|█████████▎| 934/1000 [04:37<00:18,  3.51it/s, loss=4.77]

tensor([   28,   457,    28,  4197,    28,   198,   198, 17072,    28,    28],
       device='cuda:0')
tensor([  339,   699,   260,  2139,    42,   198,  9393, 17072,   441, 30493],
       device='cuda:0')


Training:  94%|█████████▎| 935/1000 [04:37<00:18,  3.52it/s, loss=4.77]

tensor([  346,    28, 21723,    28,   260,  4197,    28,   198,   198,   198],
       device='cuda:0')
tensor([20322,   957, 21723,  2126,   260,   905,    30,   198,   198,    62],
       device='cuda:0')


Training:  94%|█████████▎| 936/1000 [04:38<00:18,  3.48it/s, loss=5.1] 

tensor([    0, 17072,    28,    28,   198,    28,   198,   198,   198,  9620],
       device='cuda:0')
tensor([21965, 17072,   719,    28,   965,    47,   198,   198,  3911,  3945],
       device='cuda:0')


Training:  94%|█████████▎| 937/1000 [04:38<00:18,  3.42it/s, loss=5.12]

tensor([  28,   28,  198,   28,    0, 4197,  541,   28,  198,  260],
       device='cuda:0')
tensor([  874,    43,  9725,    95,   253, 12724, 13285,    43, 41916,   253],
       device='cuda:0')


Training:  94%|█████████▍| 938/1000 [04:38<00:18,  3.39it/s, loss=5.01]

tensor([   0,  260, 4197,   42, 6737,   28,  198,  198,  198, 9620],
       device='cuda:0')
tensor([ 282,  650,  651,  456,  105,   47,  198,  198, 3911, 3945],
       device='cuda:0')


Training:  94%|█████████▍| 939/1000 [04:39<00:17,  3.45it/s, loss=4.89]

tensor([100,  28,  28,  28, 198, 198, 339,  28, 253,  28], device='cuda:0')
tensor([  100, 17072,  8177,    47,   198, 12908,  3661,   325,  8177,    28],
       device='cuda:0')


Training:  94%|█████████▍| 940/1000 [04:39<00:17,  3.48it/s, loss=5.01]

tensor([   28,    28,   282,   198,   198,   339, 21723,    28,    28,   260],
       device='cuda:0')
tensor([ 337, 4048,   43,  198, 2596,  957, 2629, 2606,  314, 5951],
       device='cuda:0')


Training:  94%|█████████▍| 941/1000 [04:39<00:16,  3.47it/s, loss=5.01]

tensor([2680, 4628, 4501,   42,  198,  198,   28,  198,   28,  198],
       device='cuda:0')
tensor([31540,  4628,  8772,    42,   198, 34056,    28, 38061,    28,   469],
       device='cuda:0')


Training:  94%|█████████▍| 942/1000 [04:39<00:16,  3.42it/s, loss=4.95]

tensor([  28,  260,  339,   30,  198,  198,  314,  260, 4197,   28],
       device='cuda:0')
tensor([1607,  284, 6917,   30,  198, 1348,  314,  260, 4569,   28],
       device='cuda:0')


Training:  94%|█████████▍| 943/1000 [04:40<00:16,  3.42it/s, loss=4.98]

tensor([ 28, 260,  28, 198, 198, 198, 341,  42, 198, 198], device='cuda:0')
tensor([  359,  7270,    47,   198,   198,    62, 10942,    42,   198, 35251],
       device='cuda:0')


Training:  94%|█████████▍| 944/1000 [04:40<00:16,  3.44it/s, loss=4.98]

tensor([4197,   28,  260,   30,  339,   28,  198,  198,  198, 9620],
       device='cuda:0')
tensor([ 905,  314, 2932,  284, 3599,   30,  198,  198, 3911, 3945],
       device='cuda:0')


Training:  94%|█████████▍| 945/1000 [04:40<00:16,  3.44it/s, loss=4.91]

tensor([  28,   28,   28,  282,  198,  198,  198,   42, 2285, 3438],
       device='cuda:0')
tensor([13735,   739, 34104,    30,   198,   198,    54,  5135,  2285, 18630],
       device='cuda:0')


Training:  95%|█████████▍| 946/1000 [04:41<00:15,  3.45it/s, loss=4.91]

tensor([  0,  28, 457, 260,  28,  28,  28, 198, 198, 253], device='cuda:0')
tensor([ 2275,   392,  3408,  1272,  1176,  8322,    28,   198, 16721,   653],
       device='cuda:0')


Training:  95%|█████████▍| 947/1000 [04:41<00:15,  3.46it/s, loss=4.72]

tensor([ 198,  198,  381,   73,   42, 6565,   57,   42,  198,  198],
       device='cuda:0')
tensor([  198,    60,  3907,    73, 29146,  6565,  3438,    42,   198,  5195],
       device='cuda:0')


Training:  95%|█████████▍| 948/1000 [04:41<00:14,  3.49it/s, loss=4.61]

tensor([6737,   28,  198,  198, 3060,   28,   28,   42,  198,  198],
       device='cuda:0')
tensor([  313,    42,   198,    57,  3287,   787, 33974,    30,   198,   198],
       device='cuda:0')


Training:  95%|█████████▍| 949/1000 [04:41<00:14,  3.50it/s, loss=4.9] 

tensor([  57, 3060,   42,  198,  198,  325,   28,  339,  198, 3060],
       device='cuda:0')
tensor([  57, 3438,   42,  198, 2068, 2988,  338,   28,  339,  868],
       device='cuda:0')


Training:  95%|█████████▌| 950/1000 [04:42<00:14,  3.52it/s, loss=4.7]

tensor([260,  28, 325,  28, 198, 198, 449, 339,  28, 260], device='cuda:0')
tensor([17072, 36034, 22576,    28,   198,  4370,   449, 30493,   411,   957],
       device='cuda:0')


Training:  95%|█████████▌| 951/1000 [04:42<00:14,  3.49it/s, loss=4.7]

tensor([   0,   28,  198,  198,  449,  339, 3060,   28,  198,   28],
       device='cuda:0')
tensor([1225,   17,  198, 4370,  449,  339,  441,   28,  965,   28],
       device='cuda:0')


Training:  95%|█████████▌| 952/1000 [04:42<00:14,  3.43it/s, loss=5.12]

tensor([ 42, 260,  28, 198, 253,  28, 260,  28, 198, 198], device='cuda:0')
tensor([ 288,  685,   28,  564, 2093,  288, 1003,   30,  198,   63],
       device='cuda:0')


Training:  95%|█████████▌| 953/1000 [04:43<00:13,  3.46it/s, loss=5.22]

tensor([   28,    28,   253,   555,    28,    28,   198,   198,   198, 46634],
       device='cuda:0')
tensor([  331, 26365,   253,  6243,  2139,    30,   198,   198, 43029,  1754],
       device='cuda:0')


Training:  95%|█████████▌| 954/1000 [04:43<00:13,  3.51it/s, loss=5.05]

tensor([   0,   28,   28,   28,  198,  198,  198, 3945,   63,   42],
       device='cuda:0')
tensor([1503,  346, 5173,   30,  198,  198, 3911, 3945,   63,   42],
       device='cuda:0')


Training:  96%|█████████▌| 955/1000 [04:43<00:12,  3.55it/s, loss=5.05]

tensor([ 28,  28,  28, 198, 549,  28, 555,  28,  28, 198], device='cuda:0')
tensor([ 9154,  5569,    28, 13804,   441,   253, 22415,   555,    43,   198],
       device='cuda:0')


Training:  96%|█████████▌| 956/1000 [04:43<00:12,  3.52it/s, loss=4.91]

tensor([ 28, 198,  42, 198,  28, 260, 339, 282, 198, 260], device='cuda:0')
tensor([   30, 14755,    28,  1690,   429,   338, 10172,   198,  6228,  2112],
       device='cuda:0')


Training:  96%|█████████▌| 957/1000 [04:44<00:12,  3.57it/s, loss=4.99]

tensor([ 28, 549,  28, 260,  28,  28,  28, 198, 198,  28], device='cuda:0')
tensor([7200,  874,  429,  451, 1796, 6724,   28,  198, 6882,  281],
       device='cuda:0')


Training:  96%|█████████▌| 958/1000 [04:44<00:11,  3.57it/s, loss=4.99]

tensor([  28,   28,   28,  198,  198, 3060, 6737,  260,   28,  198],
       device='cuda:0')
tensor([  315,   851,    28,   198,    57, 14941,   411, 10419,   623,  3497],
       device='cuda:0')


Training:  96%|█████████▌| 959/1000 [04:44<00:11,  3.54it/s, loss=4.47]

tensor([ 346,  198,  198, 3060,  253, 4197,   28,  260,  506, 4197],
       device='cuda:0')
tensor([   42,   198,    57,   744,   260,  4132,   282,  6661,   260, 25729],
       device='cuda:0')


Training:  96%|█████████▌| 960/1000 [04:44<00:11,  3.49it/s, loss=4.47]

tensor([  0,  28,  28,  28, 198, 198, 339, 253,  28, 339], device='cuda:0')
tensor([ 254,  349,  269,   30,  198, 2596,  325,  357,  347,  357],
       device='cuda:0')


Training:  96%|█████████▌| 961/1000 [04:45<00:11,  3.47it/s, loss=4.74]

tensor([6574,   30,  198,  198,  198,   42,   42,  198,  198,  260],
       device='cuda:0')
tensor([ 1448,    30,   198,   198,    73, 25089,    42,   198,  9629,   732],
       device='cuda:0')


Training:  96%|█████████▌| 962/1000 [04:45<00:10,  3.51it/s, loss=5.1] 

tensor([   28,   260,  6737,    28,  4197,    28,   339, 21723,    28,   325],
       device='cuda:0')
tensor([ 2853,    96,  6737,   260,  7429,   837, 13987,  1924,   868,  4137],
       device='cuda:0')


Training:  96%|█████████▋| 963/1000 [04:45<00:10,  3.53it/s, loss=5.04]

tensor([  28,  198,  198,  198, 2285,   54,   42,   42,  198,  198],
       device='cuda:0')
tensor([   30,   198,   198,  7430,  8107,    54, 10030,    42,   198, 45496],
       device='cuda:0')


Training:  96%|█████████▋| 964/1000 [04:46<00:10,  3.60it/s, loss=4.98]

tensor([   28,    28,   198,   339,   198,   302,  3060,    28,  6737, 21723],
       device='cuda:0')
tensor([  588,   198,  5195, 12765,   416,   339,  1089,  3806,   957,  3497],
       device='cuda:0')


Training:  96%|█████████▋| 965/1000 [04:46<00:09,  3.64it/s, loss=5.14]

tensor([6737,  282,  314,  457,   28,  198,  198, 4197, 4197,   28],
       device='cuda:0')
tensor([ 4196,   384,   392,   437,    28,   198,   504, 42502, 15731,  4502],
       device='cuda:0')


Training:  97%|█████████▋| 966/1000 [04:46<00:09,  3.71it/s, loss=5.04]

tensor([6737,  339,   28, 4083,  282,  260,   42,  198,  198,  198],
       device='cuda:0')
tensor([10937,  1209,   650, 26467,   282, 12669,    30,   198,   198, 29719],
       device='cuda:0')


Training:  97%|█████████▋| 967/1000 [04:47<00:08,  3.72it/s, loss=4.63]

tensor([  28,  260, 1038,  282,  198,  198,  351,  260, 1038,   28],
       device='cuda:0')
tensor([15032,   480, 10172,    28,   198, 16654,   259,   480,  1038,  2397],
       device='cuda:0')


Training:  97%|█████████▋| 968/1000 [04:47<00:08,  3.69it/s, loss=4.94]

tensor([  339,   549,    28,    30,   198,   198,   198,  2680, 24625,    55],
       device='cuda:0')
tensor([ 4875,   549,  3287,    30,   198,   198, 41074,  2680, 35135,    55],
       device='cuda:0')


Training:  97%|█████████▋| 969/1000 [04:47<00:08,  3.70it/s, loss=5.13]

tensor([ 28, 260, 381,  28, 260,  42,   0,  28,  28, 198], device='cuda:0')
tensor([  281,   544,   381,   411,  4360,  4565,  1802, 11064,    47,   198],
       device='cuda:0')


Training:  97%|█████████▋| 970/1000 [04:47<00:08,  3.75it/s, loss=4.89]

tensor([ 28,  28, 198, 198, 339,  28, 198,  28, 198, 260], device='cuda:0')
tensor([ 4463,    28,   198,  2427,  1176,    28, 37494,    28,   284, 33974],
       device='cuda:0')


Training:  97%|█████████▋| 971/1000 [04:48<00:07,  3.74it/s, loss=5.05]

tensor([  260,   260, 21723,    28,   198,   198, 21723,    28,   260,    28],
       device='cuda:0')
tensor([  351,   957,  8664,    17,   198,  5965,  2606,   284,  3696, 36644],
       device='cuda:0')


Training:  97%|█████████▋| 972/1000 [04:48<00:07,  3.71it/s, loss=4.81]

tensor([   28,    28,    28,   198,   198,   339,   198,   260,    28, 21723],
       device='cuda:0')
tensor([ 2874,  1592,    43,   198,  3528,    28,  1953,  4649, 13987,  3289],
       device='cuda:0')


Training:  97%|█████████▋| 973/1000 [04:48<00:07,  3.64it/s, loss=4.81]

tensor([ 28, 325, 253,  42,  28, 198, 325, 288,  28, 260], device='cuda:0')
tensor([1251,  325, 2627,  277,  198, 2068,  685, 1683,  351,  468],
       device='cuda:0')


Training:  97%|█████████▋| 974/1000 [04:48<00:07,  3.51it/s, loss=4.07]

tensor([  198,   381,    73,    42,   198,    42,   198,   198, 21723,    28],
       device='cuda:0')
tensor([   60,  3907,    73, 23675,    73,    42,   198,  5965,  2606, 10937],
       device='cuda:0')


Training:  98%|█████████▊| 975/1000 [04:49<00:07,  3.50it/s, loss=5.09]

tensor([  28,  253,   42,   28,   28,  260, 4197, 4314,   28,  198],
       device='cuda:0')
tensor([16568,  8767,   301,  2177,   351,   634,   278,  6842,    28,   198],
       device='cuda:0')


Training:  98%|█████████▊| 976/1000 [04:49<00:06,  3.56it/s, loss=5.01]

tensor([   42,   198,   198,   198,    71, 20055,    42,   198,   198,   260],
       device='cuda:0')
tensor([   30,   198,   198, 29719,    71, 27163,    42,   198,  5212, 14942],
       device='cuda:0')


Training:  98%|█████████▊| 977/1000 [04:49<00:06,  3.59it/s, loss=4.84]

tensor([  549,   198,   198,   198,  4728,   428,    71, 20055,    42,    42],
       device='cuda:0')
tensor([   30,   198,   198,    59,  4728, 19436,    71,  2721, 38744,    42],
       device='cuda:0')


Training:  98%|█████████▊| 978/1000 [04:50<00:06,  3.62it/s, loss=4.73]

tensor([ 28, 198, 260, 549,  28, 506,  28, 198, 198,  28], device='cuda:0')
tensor([   28,   284, 18089,  7065,  9446,  5182,    43,   198,  3528,    28],
       device='cuda:0')


Training:  98%|█████████▊| 979/1000 [04:50<00:05,  3.67it/s, loss=4.65]

tensor([4197,  282,    0,   28,  260, 4083,   29,   28,  100,   28],
       device='cuda:0')
tensor([ 5151, 11948,  5559,   335,   469,   725,  6221,   506,  4132,    28],
       device='cuda:0')


Training:  98%|█████████▊| 980/1000 [04:50<00:05,  3.70it/s, loss=4.83]

tensor([  42,  253, 3060,   28,  260,   28,  198,  198,   28,   28],
       device='cuda:0')
tensor([ 744,  339, 2090,  282, 9970,   30,  198, 2696, 3472, 1303],
       device='cuda:0')


Training:  98%|█████████▊| 981/1000 [04:50<00:05,  3.64it/s, loss=4.83]

tensor([   28,   198,   198,   198,  4728,   428,    71, 20055,    42,    42],
       device='cuda:0')
tensor([   30,   198,   198,    59,  4728, 15494,    71, 20055,  9536,    42],
       device='cuda:0')


Training:  98%|█████████▊| 982/1000 [04:51<00:04,  3.61it/s, loss=4.78]

tensor([  28,  282,  260,  282,   42,  198,  359,   28,   28, 4083],
       device='cuda:0')
tensor([4658,  282, 4929, 9686,  198, 4948, 3100, 6616,  480, 2184],
       device='cuda:0')


Training:  98%|█████████▊| 983/1000 [04:51<00:04,  3.65it/s, loss=4.67]

tensor([  28, 4197,   28,  198,  198,   28,   28,   30,  506,    0],
       device='cuda:0')
tensor([  260,  1532,    28,   198, 16075,  3310,  1869, 18598,   283,   494],
       device='cuda:0')


Training:  98%|█████████▊| 984/1000 [04:51<00:04,  3.63it/s, loss=4.67]

tensor([  198,   198,   198,    71, 20055,    42,   198,   198,    28,    28],
       device='cuda:0')
tensor([  198,   198, 29719,    71, 27163,    42,   198,  1780,  8296,    28],
       device='cuda:0')


Training:  98%|█████████▊| 985/1000 [04:51<00:04,  3.57it/s, loss=4.58]

tensor([17072,    28,    23,    28,  4197,    28,   282,   198,   198,   494],
       device='cuda:0')
tensor([17072,   263,   432,   260,  2240, 17816,    28,   198, 15024,   494],
       device='cuda:0')


Training:  99%|█████████▊| 986/1000 [04:52<00:03,  3.62it/s, loss=4.94]

tensor([  28,   28, 4197,  282,  260,   28,  198,  198,   28,  325],
       device='cuda:0')
tensor([3934,  260, 3102,  284, 3568,   47,  198, 6882, 3786,  392],
       device='cuda:0')


Training:  99%|█████████▊| 987/1000 [04:52<00:03,  3.63it/s, loss=4.87]

tensor([  28, 6189,  260,  253, 4197,   28,   28,  198,  260,  260],
       device='cuda:0')
tensor([   93,   284,   702,   253, 27508, 18030,   198,  3825, 12885,   739],
       device='cuda:0')


Training:  99%|█████████▉| 988/1000 [04:52<00:03,  3.62it/s, loss=4.87]

tensor([ 28, 549,  30,  30, 198, 198,  28, 198, 302,  28], device='cuda:0')
tensor([4875, 1272, 3287,   30,  198, 1780,   17,  416,  588, 1805],
       device='cuda:0')


Training:  99%|█████████▉| 989/1000 [04:53<00:03,  3.55it/s, loss=4.81]

tensor([   28,   198,   198,    28,    28,   789, 17072,    28,   198,    28],
       device='cuda:0')
tensor([  28,  198, 3528, 1062, 3005,  474, 2442,   43, 3472,   28],
       device='cuda:0')


Training:  99%|█████████▉| 990/1000 [04:53<00:02,  3.60it/s, loss=5.32]

tensor([  28,  100, 4197,   28,  198,  198,    0,   93, 6737,  284],
       device='cuda:0')
tensor([  506,  9202, 15308,    28,   198,  3733,    29,    96,  2467,   821],
       device='cuda:0')


Training:  99%|█████████▉| 991/1000 [04:53<00:02,  3.67it/s, loss=5.02]

tensor([  282,    30,   198,   339,   253,   260,   260,   198, 21723,    28],
       device='cuda:0')
tensor([ 1163,   198, 26001,   325,  1042,   614,    28,   957,  5717,    28],
       device='cuda:0')


Training:  99%|█████████▉| 992/1000 [04:53<00:02,  3.64it/s, loss=5.13]

tensor([6737,  198,  198,  198,   42, 2097,   42,  198,  198,   28],
       device='cuda:0')
tensor([   47,   198,   198,  4796, 32598,  2097,    42,   198, 19525,    28],
       device='cuda:0')


Training:  99%|█████████▉| 993/1000 [04:54<00:01,  3.62it/s, loss=5.21]

tensor([4197,   28,  198, 4197,  282,  198,  346, 4083,   28,   28],
       device='cuda:0')
tensor([  599,    28,  3449,  2843,   198, 20314,   650,    99,   549,   288],
       device='cuda:0')


Training:  99%|█████████▉| 994/1000 [04:54<00:01,  3.62it/s, loss=5.22]

tensor([ 198,   28,   28,  260,  260,   28,  198,  198,  198, 4192],
       device='cuda:0')
tensor([ 1643,   982,   284, 16011,   982,    30,   198,   198, 39776,  4192],
       device='cuda:0')


Training: 100%|█████████▉| 995/1000 [04:54<00:01,  3.64it/s, loss=5.04]

tensor([   28,   198,   198,   198,  4192,  6426, 15604,   198,   198,    28],
       device='cuda:0')
tensor([   47,   198,   198, 39776,  4192,  6426,    42,   198,  3482,  3861],
       device='cuda:0')


Training: 100%|█████████▉| 996/1000 [04:54<00:01,  3.51it/s, loss=5.04]

tensor([  198,  4197,    28,   253, 21723,    28,   198,   260,   260,   198],
       device='cuda:0')
tensor([14229, 12557,   325,   957,  3289,    28,   284,  4160,   198,   504],
       device='cuda:0')


Training: 100%|█████████▉| 997/1000 [04:55<00:00,  3.54it/s, loss=5.04]

tensor([549,  28, 325, 253, 198, 198, 314, 100,   0,  42], device='cuda:0')
tensor([ 357,  868,  325,   28,  198, 8113,  506,  354, 2490,   85],
       device='cuda:0')


Training: 100%|█████████▉| 998/1000 [04:55<00:00,  3.58it/s, loss=5.03]

tensor([  346,    28,   198,   339, 11199,    28,    28,   198,   198,    28],
       device='cuda:0')
tensor([  346,    28,   355,   328,   101,   432, 31886,   198,  6882, 43624],
       device='cuda:0')


Training: 100%|█████████▉| 999/1000 [04:55<00:00,  3.59it/s, loss=4.8] 

tensor([ 28, 198,  42,  30, 260,  93,   0,  28, 198,  28], device='cuda:0')
tensor([  198,    54,  4985,   284,   430, 33659,  1718,    28,   441,   253],
       device='cuda:0')


Training: 100%|██████████| 1000/1000 [04:56<00:00,  3.59it/s, loss=4.82]

tensor([ 28, 198, 198, 198,  42,  42,  42,  42, 198, 198], device='cuda:0')
tensor([   17,   198,   198, 17321,  5819,  2154,  4501,    42,   198,    57],
       device='cuda:0')


Training: 100%|██████████| 1000/1000 [04:56<00:00,  3.38it/s, loss=4.82]
