In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
# plt.switch_backend('agg')
import matplotlib.ticker as ticker

import os
import urllib
import re
import random
import json
from typing import List, Dict, Optional, Any, Tuple
import glob

import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
import pickle
from torch.nn import functional as F

from collections import OrderedDict, Counter

In [2]:
import tokenizers
#from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.pre_tokenizers import Punctuation

from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, Strip, Replace, Sequence
from tokenizers.trainers import UnigramTrainer

In [3]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(1)

In [4]:
TILDE_DATA = './data/tilde'
!mkdir -p $TILDE_DATA


In [5]:
#!pip install sacrebleu
!pip show sacrebleu

Name: sacrebleu
Version: 1.5.1
Summary: Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores
Home-page: https://github.com/mjpost/sacrebleu
Author: Matt Post
Author-email: post@cs.jhu.edu
License: Apache License 2.0
Location: /home/gstrazds/anaconda3/envs/tw131/lib/python3.8/site-packages
Requires: portalocker
Required-by: 


In [6]:
# from google.colab import drive
# drive.mount('/content/drive')

#Text preprocessing

In [7]:
num_bpe_merges = 10000
vocab_size = 5500
joint_vocab_size = 2*vocab_size

!echo BPE_ops=$num_bpe_merges vocab_size=$vocab_size joint_vocab_size=$joint_vocab_size

BPE_ops=10000 vocab_size=5500 joint_vocab_size=11000


In [8]:
#!pip install subword-nmt
!pip show subword-nmt

Name: subword-nmt
Version: 0.3.7
Summary: Unsupervised Word Segmentation for Neural Machine Translation and Text Generation
Home-page: https://github.com/rsennrich/subword-nmt
Author: Rico Sennrich
Author-email: None
License: MIT
Location: /home/gstrazds/anaconda3/envs/tw131/lib/python3.8/site-packages
Requires: 
Required-by: 


In [9]:
# # Read Hemingway texts from URL. There are Hemingway's "A Farewell to arms"
# text_en = urllib.request.urlopen('http://www.ltn.lv/~guntis/translation_dataset/dataset_en_small.txt').read().decode("utf-8", "ignore")
# text_lv = urllib.request.urlopen('http://www.ltn.lv/~guntis/translation_dataset/dataset_lv_small.txt').read().decode("utf-8-sig", "ignore")

# HEMINGWAY_SRC_EN = f'{HEMINGWAY_DATA}/hemingway.en.txt'
# HEMINGWAY_SRC_LV = f'{HEMINGWAY_DATA}/hemingway.lv.txt'

# with open(HEMINGWAY_SRC_EN, 'w') as f:
#     f.write(text_en)

# with open(HEMINGWAY_SRC_LV, 'w') as f:
#     f.write(text_lv)

In [10]:
TILDE_ALL_EN = f'{TILDE_DATA}/all.norm2.en'
TILDE_ALL_LV = f'{TILDE_DATA}/all.norm2.lv'

TILDE_TOK_EN = f'{TILDE_DATA}/combined.en.tok.txt'
TILDE_TOK_LV = f'{TILDE_DATA}/combined.lv.tok.txt'

!echo $TILDE_DATA/combined.lv.tok.txt $TILDE_TOK_EN

./data/tilde/combined.lv.tok.txt ./data/tilde/combined.en.tok.txt


In [11]:
# !git clone https://github.com/moses-smt/mosesdecoder.git

In [12]:
# Normalize and tokenize texts
cmd1 = f"cat {TILDE_ALL_EN} | mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l en | mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l en > {TILDE_TOK_EN}"
#! $cmd1

cmd2 = f"cat {TILDE_ALL_LV} | mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l lv | mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l lv > {TILDE_TOK_LV}"
#! $cmd2

In [13]:
# # Normalize and tokenize texts

# #!cat hemingway.en.txt | mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l en \
# !cat hemingway.en.txt \
#   | mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l en > hemingway.en.tok.txt

# # !cat hemingway.lv.txt | mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l lv \
# !cat hemingway.lv.txt \
#   | mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l lv > hemingway.lv.tok.txt

In [14]:
#!mosesdecoder/scripts/recaser/train-truecaser.perl -corpus $TILDE_DATA/combined.en.tok.txt -model $TILDE_DATA/tc_model.en
#!mosesdecoder/scripts/recaser/train-truecaser.perl -corpus $TILDE_DATA/combined.lv.tok.txt -model $TILDE_DATA/tc_model.lv

In [15]:
cmd1 = f"mosesdecoder/scripts/recaser/truecase.perl -model {TILDE_DATA}/tc_model.en < {TILDE_DATA}/combined.en.tok.txt > {TILDE_DATA}/combined.en.tc.txt"
#! $cmd1

cmd2 = f"mosesdecoder/scripts/recaser/truecase.perl -model {TILDE_DATA}/tc_model.lv < {TILDE_DATA}/combined.lv.tok.txt > {TILDE_DATA}/combined.lv.tc.txt"
#! $cmd2


In [16]:
# !subword-nmt learn-joint-bpe-and-vocab --input en.tc.txt lv.tc.txt -s 10000 -o tokens.txt --write-vocabulary token_freq.en.txt token_freq.lv.txt
!mkdir -p $TILDE_DATA/bpe 

!echo subword-nmt learn-joint-bpe-and-vocab --input $TILDE_DATA/combined.lv.tc.txt $TILDE_DATA/combined.en.tc.txt -s $num_bpe_merges -o $TILDE_DATA/bpe/tokens.lven --write-vocabulary $TILDE_DATA/bpe/token_freq.en $TILDE_DATA/bpe/token_freq.lv

## !subword-nmt learn-joint-bpe-and-vocab --input $HEMINGWAY_DATA/hemingway.en.tc.txt -s $num_bpe_merges -o $HEMINGWAY_DATA/bpe/tokens.en --write-vocabulary $HEMINGWAY_DATA/bpe/token_freq.en
## !subword-nmt learn-joint-bpe-and-vocab --input $HEMINGWAY_DATA/hemingway.lv.tc.txt -s $num_bpe_merges -o $HEMINGWAY_DATA/bpe/tokens.lv --write-vocabulary $HEMINGWAY_DATA/bpe/token_freq.lv

subword-nmt learn-joint-bpe-and-vocab --input ./data/tilde/combined.lv.tc.txt ./data/tilde/combined.en.tc.txt -s 10000 -o ./data/tilde/bpe/tokens.lven --write-vocabulary ./data/tilde/bpe/token_freq.en ./data/tilde/bpe/token_freq.lv


In [17]:
def build_vocab(freq_file, vocab_size):
    vocab = Counter(['<unk>', '<pad>', '<eos>'])
    with open(freq_file, 'r') as f:
        for line in f.readlines():
            token, num_occurs = line.split()
            # vocab.append(token)
            vocab[token] += int(num_occurs)
    return vocab #[:vocab_size]
#     return vocab[:vocab_size]

en_vocab = build_vocab(f'{TILDE_DATA}/bpe/token_freq.en', vocab_size)
lv_vocab = build_vocab(f'{TILDE_DATA}/bpe/token_freq.lv', vocab_size)

joint_vocab = Counter(en_vocab)
joint_vocab.update(lv_vocab)

if False:
    with open(f'{TILDE_DATA}/bpe/vocab.en', 'w') as f:
        for i, token in enumerate(en_vocab):
            # f.write(f"{token} {i + 1} \n")
            f.write(f"{token} {en_vocab[token]} \n")

    with open(f'{TILDE_DATA}/bpe/vocab.lv', 'w') as f:
        for i, token in enumerate(lv_vocab):
            # f.write(f"{token} {i + 1} \n")
            f.write(f"{token} {lv_vocab[token]} \n")

    with open(f'{TILDE_DATA}/bpe/vocab.lven', 'w') as f:
        for i, token in enumerate(joint_vocab):
            # f.write(f"{token} {i + 1} \n")
            f.write(f"{token} {joint_vocab[token]} \n")



In [18]:
print("en_vocab:", len(en_vocab), "lv_vocab:", len(lv_vocab), "joint_vocab", len(joint_vocab))

en_vocab: 10099 lv_vocab: 6477 joint_vocab 10519


In [19]:
#!subword-nmt apply-bpe -c $HEMINGWAY_DATA/bpe/tokens.en --vocabulary $HEMINGWAY_DATA/bpe/vocab.en --vocabulary-threshold 1 < $HEMINGWAY_DATA/hemingway.en.tc.txt > $HEMINGWAY_DATA/hemingway.en.BPE.txt
#!subword-nmt apply-bpe -c $HEMINGWAY_DATA/bpe/tokens.lv --vocabulary $HEMINGWAY_DATA/bpe/vocab.lv --vocabulary-threshold 1 < $HEMINGWAY_DATA/hemingway.lv.tc.txt > $HEMINGWAY_DATA/hemingway.lv.BPE.txt

# !subword-nmt apply-bpe -c $HEMINGWAY_DATA/bpe/tokens.lven --vocabulary $HEMINGWAY_DATA/bpe/token_freq.en --vocabulary-threshold 1 < $HEMINGWAY_DATA/hemingway.en.tc.txt > $HEMINGWAY_DATA/hemingway.en.BPE.txt
# !subword-nmt apply-bpe -c $HEMINGWAY_DATA/bpe/tokens.lven --vocabulary $HEMINGWAY_DATA/bpe/token_freq.lv --vocabulary-threshold 1 < $HEMINGWAY_DATA/hemingway.lv.tc.txt > $HEMINGWAY_DATA/hemingway.lv.BPE.txt


cmd1 = f"subword-nmt apply-bpe -c {TILDE_DATA}/bpe/tokens.lven --vocabulary {TILDE_DATA}/bpe/vocab.lven --vocabulary-threshold 1 < {TILDE_DATA}/combined.en.tc.txt > {TILDE_DATA}/combined.en.BPE.txt"
#! $cmd1

cmd2 = f"subword-nmt apply-bpe -c {TILDE_DATA}/bpe/tokens.lven --vocabulary {TILDE_DATA}/bpe/vocab.lven --vocabulary-threshold 1 < {TILDE_DATA}/combined.lv.tc.txt > {TILDE_DATA}/combined.lv.BPE.txt"
#! $cmd2

In [20]:
if False:
    special_tokens = ['<unk>', '<pad>', '<eos>', '<sep>'] #, '<S>', '</S>', '<bos>', '<eos>', '<sep>', '<NONE>', '<|>']
                  
    normalizer = normalizers.Sequence([Strip(), Lowercase()])
    pre_tokenizer = Whitespace()

    model = tokenizers.models.WordLevel(unk_token='<unk>')
    # model = tokenizers.models.WordPiece()
    tokenizer = tokenizers.Tokenizer(model=model)


    tokenizer.add_special_tokens(special_tokens)
    tokenizer.normalizer = normalizer
    tokenizer.pre_tokenizer = pre_tokenizer

    # filelist = glob.glob(PTHRU_DIR+"valid/*.pthru")
    # filelist.extend( glob.glob(PTHRU_DIR+"test/*.pthru"))
    # filelist.extend( glob.glob(PTHRU_DIR+"train/*.pthru"))


    # token_strs = [tok for (tok, span) in pre_tokenizer.pre_tokenize_str(str1)]
    # print(token_strs)

    # filelist = glob.glob(PTHRU_DIR+"valid/*.pthru")

    filelist = glob.glob(f"{TILDE_DATA}/combined.*.BPE.txt")

    filelist = sorted(filelist)
    print(len(filelist), filelist[:10])


    # unigram_trainer = tokenizers.trainers.UnigramTrainer()
    # trainer = tokenizers.trainers.WordPieceTrainer(vocab_size=vocab_size)
    trainer = tokenizers.trainers.WordLevelTrainer(vocab_size=joint_vocab_size, special_tokens=special_tokens)

    tokenizer.train(files=filelist, trainer=trainer)

    vocab_dict = tokenizer.get_vocab(with_added_tokens=False)
    print("ACTUAL VOCAB SIZE =", len(vocab_dict))
    print(vocab_dict)


In [21]:
# !! ACTUAL VOCAB SIZE = 900 (first try when joint_vocab but separate --vocabulary token_freq.lang)
# ACTUAL VOCAB SIZE = 8637
# ACTUAL VOCAB SIZE = 9048 (CUDA out of memory)

In [22]:
with open(f'{TILDE_DATA}/combined.lv.BPE.txt', 'r') as f:
    text_input = f.read()

with open(f'{TILDE_DATA}/combined.en.BPE.txt', 'r') as f:
    text_output = f.read()

#MinGPT

In [23]:
import random
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def top_k_logits(logits, k):
    v, ix = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out

def calculate_attention_token(attention, top_k, model):
    logits = model.head(attention)
    logits = logits[:, -1, :]
    logits = top_k_logits(logits, top_k)

    probs = F.softmax(logits)

    _, ix = torch.topk(probs, k=1, dim=-1)
    ix = torch.multinomial(probs, num_samples=top_k)

    return ix[0]


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=False, top_k=None, output_attention=False):
    """
    take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in
    the sequence, feeding the predictions back into the model each time. Clearly the sampling
    has quadratic complexity unlike an RNN that is only linear, and has a finite context window
    of block_size, unlike an RNN that has an infinite context window.
    """
    block_size = model.get_block_size()
    model.eval()
    attention_state = [[] for _ in model.blocks]

    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed
        logits, _ = model(x_cond)
        # pluck the logits at the final step and scale by temperature
        logits = logits[:, -1, :] / temperature
        # optionally crop probabilities to only the top k options
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        # apply softmax to convert to probabilities
        probs = F.softmax(logits, dim=-1)
        # sample from the distribution or take the most likely
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)

        if output_attention:
            b, t = x.size()

            for block_id in range(len(model.blocks)):
                att = model.blocks[block_id].attn.att
                attention_state[block_id].append(att)

        # append to the sequence and continue
        x = torch.cat((x, ix), dim=1)

    if output_attention:
        return x, attention_state

    return x


In [24]:
"""
GPT model:
- the initial stem consists of a combination of token encoding and a positional encoding
- the meat of it is a uniform sequence of Transformer blocks
    - each Transformer is a sequential combination of a 1-hidden-layer MLP block and a self-attention block
    - all blocks feed into a central residual pathway similar to resnets
- the final decoder is a linear projection into a vanilla Softmax classifier
"""

import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

logger = logging.getLogger(__name__)

class GPTConfig:
    """ base GPT config, params common to all GPT versions """
    embd_pdrop = 0.1
    resid_pdrop = 0.1
    attn_pdrop = 0.1

    def __init__(self, vocab_size, block_size, **kwargs):
        self.vocab_size = vocab_size
        self.block_size = block_size
        for k,v in kwargs.items():
            setattr(self, k, v)

class GPT1Config(GPTConfig):
    """ GPT-1 like network roughly 125M params """
    n_layer = 12
    n_head = 12
    n_embd = 768

class CausalSelfAttention(nn.Module):
    """
    A vanilla multi-head masked self-attention layer with a projection at the end.
    It is possible to use torch.nn.MultiheadAttention here but I am including an
    explicit implementation here to show that there is nothing too scary here.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads
        self.key = nn.Linear(config.n_embd, config.n_embd)
        self.query = nn.Linear(config.n_embd, config.n_embd)
        self.value = nn.Linear(config.n_embd, config.n_embd)
        # regularization
        self.attn_drop = nn.Dropout(config.attn_pdrop)
        self.resid_drop = nn.Dropout(config.resid_pdrop)
        # output projection
        self.proj = nn.Linear(config.n_embd, config.n_embd)
        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer("mask", torch.tril(torch.ones(config.block_size, config.block_size))
                                     .view(1, 1, config.block_size, config.block_size))
        self.n_head = config.n_head
        self.att = None

    def forward(self, x, layer_past=None):
        B, T, C = x.size()

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_drop(self.proj(y))

        self.att = att

        return y

class Block(nn.Module):
    """ an unassuming Transformer block """

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.resid_pdrop),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class GPT(nn.Module):
    """  the full GPT language model, with a context size of block_size """

    def __init__(self, config):
        super().__init__()

        # input embedding stem
        self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
        self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
        self.drop = nn.Dropout(config.embd_pdrop)
        # transformer
        self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)])
        # decoder head
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.block_size = config.block_size
        self.apply(self._init_weights)

        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))

    def get_block_size(self):
        return self.block_size

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def configure_optimizers(self, train_config):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name

                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # special case the position embedding parameter in the root GPT module as not decayed
        no_decay.add('pos_emb')

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, idx, targets=None):
        b, t = idx.size()
        assert t <= self.block_size, "Cannot forward, model block size is exhausted."

        # forward the GPT model
        token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector
        position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector
        x = self.drop(token_embeddings + position_embeddings)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)

        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss


In [25]:
"""
Simple training loop; Boilerplate that could apply to any arbitrary neural network,
so nothing in this file really has anything to do with GPT specifically.
"""

import sacrebleu
import math
import logging
from random import choice

from tqdm import tqdm
import numpy as np

import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data.dataloader import DataLoader

logger = logging.getLogger(__name__)

def clean_tokens(sentence):
    return sentence.replace('@@ ', '').replace(' @', '').replace('@ ', '')

class TrainerConfig:
    # optimization parameters
    max_epochs = 10
    batch_size = 64
    learning_rate = 3e-4
    betas = (0.9, 0.95)
    grad_norm_clip = 1.0
    weight_decay = 0.1 # only applied on matmul weights
    # learning rate decay params: linear warmup followed by cosine decay to 10% of original
    lr_decay = False
    warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper, but may not be good defaults elsewhere
    final_tokens = 260e9 # (at what point we reach 10% of original LR)
    # checkpoint settings
    ckpt_path = None
    num_workers = 0 # for DataLoader

    def __init__(self, **kwargs):
        for k,v in kwargs.items():
            setattr(self, k, v)

class Trainer:

    def __init__(self, model, train_dataset, test_dataset, valid_dataset, config):
        self.model = model
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.valid_dataset = valid_dataset
        self.config = config

        # take over whatever gpus are on the system
        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = torch.cuda.current_device()
            self.model = torch.nn.DataParallel(self.model).to(self.device)

    def save_checkpoint(self, postfix=''):
        # DataParallel wrappers keep raw model object in .module attribute
        raw_model = self.model.module if hasattr(self.model, "module") else self.model
        checkpoint_path = self.config.ckpt_path + postfix + '.pt'
        logger.info("saving %s", checkpoint_path)
        torch.save(raw_model.state_dict(), checkpoint_path)

    def train(self):
        model, config = self.model, self.config
        raw_model = model.module if hasattr(self.model, "module") else model
        optimizer = raw_model.configure_optimizers(config)

        def run_epoch(split):
            is_train = split == 'train'
            model.train(is_train)
            data = self.train_dataset
            if split == 'test':
                data = self.test_dataset
            elif split == 'valid':
                data = self.valid_dataset
                model.eval()
            loader = DataLoader(data, shuffle=True, pin_memory=True,
                                batch_size=config.batch_size, # if is_train else 8,
                                num_workers=config.num_workers)

            losses = []
            pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)
#             predicted_tokids = None
            context_list = []
            translation_results = []
            eval_results = []
            x_total = None
            y_total = None
            for it, (x, y) in pbar:

                # place data on the correct device
                x = x.to(self.device)
                y = y.to(self.device)

                # forward the model
                with torch.set_grad_enabled(is_train):
                    logits, loss = model(x, y)
                    loss = loss.mean() # collapse all losses if they are scattered on multiple gpus
                    losses.append(loss.item())
                    if split == 'valid':
                        intent = (x == valid_dataset.tokenizer_input.encode(['<eos>'])[0]).nonzero(as_tuple=True) #[0]
                        #print(valid_dataset.tokenizer_input.encode(['<eos>']))
                        #print(intent)
                        #print(x.shape, y.shape, logits.shape)
                        #for i in range(len(intent[0])):
                        #    print(x[i][intent[1][i]], end=", ")
                        #print()

                        probs = F.softmax(logits, dim=-1)
                        #print(probs.shape)
                        for i in range(len(probs)):
                            # sample from the distribution or take the most likely
                            _, predicted = torch.topk(probs[i], k=1, dim=-1)
                            if len(predicted.shape) > 1:
                                # print("PREDICTED:", predicted.shape, predicted)
                                predicted = predicted.squeeze()
                                if len(predicted.shape) > 1:
                                    print("AFTER predicted.squeeze(1):", predicted.shape)
                            sep = intent[1][i]
                            # print("sep=", sep)
                            #print("***CONTEXT")
                            context = clean_tokens(data.tokenizer_input.decode(x[i][:sep - 1], True))
                            #print(context)
                            #print("***COMPLETION")
                            completion = clean_tokens(data.tokenizer_output.decode(predicted[sep:], True))
                            #print(completion)
                            #print("***REAL")
                            real = clean_tokens(data.tokenizer_output.decode(y[i][sep:], True))
                            #print(real)
                            context_list.append(context)
                            translation_results.append(completion)
                            eval_results.append(real)

#                         probs = F.softmax(logits, dim=-1)
#                         # sample from the distribution or take the most likely
#                         _, predicted = torch.topk(probs, k=1, dim=-1)

#                         if predicted_tokids is None:
#                             predicted_tokids = [predicted]
#                             x_total = x
#                             y_total = y
#                         else:
#                             predicted_tokids.append(predicted)
#                             x_total = torch.cat((x_total, x), dim=0)
#                             y_total = torch.cat((y_total, y), dim=0)
                        

                if is_train:
                    # backprop and update the parameters
                    model.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
                    optimizer.step()

                    # decay the learning rate based on our progress
                    if config.lr_decay:
                        self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. label is not -100)
                        if self.tokens < config.warmup_tokens:
                            # linear warmup
                            lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens))
                        else:
                            # cosine learning rate decay
                            progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))
                            lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
                        lr = config.learning_rate * lr_mult
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr
                    else:
                        lr = config.learning_rate

                    # report progress
                    pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. mean loss: {float(np.mean(losses)):.5f}. lr {lr:e}")

            if split == 'train':
                train_loss = float(np.mean(losses))
                print(f"train loss: {train_loss}")
                return train_loss

            if split == 'test':
                test_loss = float(np.mean(losses))
                print(f"test loss: {test_loss}")
                return test_loss

            if split == 'valid':
                test_loss = float(np.mean(losses))
                print(f"valid loss: {test_loss}")

#                 eval_results = []
#                 translation_results = []
#                 context_list = []

#                 for idx in range(len(logits_total)):
#                     intent = (x_total[idx] == valid_dataset.tokenizer_input.encode(['<eos>'])[0]).nonzero(as_tuple=True)[0][0]
#                     probs = F.softmax(logits_total[idx], dim=-1)
#                     # sample from the distribution or take the most likely
#                     _, predicted = torch.topk(probs, k=1, dim=-1)
#                 for idx in range(len(predicted_tokids)):
#                     intent = (x_total[idx] == valid_dataset.tokenizer_input.encode(['<eos>'])[0]).nonzero(as_tuple=True)[0][0]
#                     predicted = predicted_tokids[idx]
#                     print("***CONTEXT")
#                     context = clean_tokens(data.tokenizer_input.decode(x_total[idx][:intent - 1], True))
#                     print("***COMPLETION")
#                     completion = clean_tokens(data.tokenizer_output.decode(predicted[intent:], True))
#                     print("***REAL")
#                     real = clean_tokens(data.tokenizer_output.decode(y_total[idx][intent:], True))

#                     context_list.append(context)
#                     translation_results.append(completion)
#                     eval_results.append(real)
                
                with open('valid.txt', 'w') as f:
                    f.write("\n".join(translation_results))

                with open('eval.txt', 'w') as f:
                    f.write("\n".join(eval_results))

                with open('context.txt', 'w') as f:
                    f.write("\n".join(context_list))


                !cat valid.txt | mosesdecoder/scripts/tokenizer/detokenizer.perl -l lv > valid.detok.txt
                !cat eval.txt | mosesdecoder/scripts/tokenizer/detokenizer.perl -l lv > eval.detok.txt
                !cat context.txt | mosesdecoder/scripts/tokenizer/detokenizer.perl -l lv > context.detok.txt

                with open('eval.detok.txt', 'r') as f:
                    eval_results = [l.strip() for l in f.readlines()]
                with open('valid.detok.txt', 'r') as f:
                    translation_results = [l.strip() for l in f.readlines()]
                with open('context.detok.txt', 'r') as f:
                    context_list = [l.strip() for l in f.readlines()]

#                 idx = choice(range(len(context_list)))
                valid_sentences = ['the driver wore a cap and his face was thin and very tanned.',
                                   'outside it was getting dark.',
                                   'the two girls were asleep.',
                                   'I would like to have had the uniform off although I did not care much about the outward forms.',
                                   'I watched the flashes on San Gabriele.',
                                   'I asked.',
                                   '"no.']

                idx_list = [i for i, sentence in enumerate(eval_results) if sentence in valid_sentences]
                
                for idx in idx_list:
                    print(f'Input:            {context_list[idx]}')
                    print(f'Predicted output: {translation_results[idx]}')
                    print(f'Real output:      {eval_results[idx]}')
                    print('--------------------------------------------------')

                refs = [eval_results]
                sys = translation_results
                bleu = sacrebleu.corpus_bleu(sys, refs)
                print(f'BLEU: {bleu.score}')
                print('##############################################################')

                return test_loss, bleu.score

        train_loss_list = []
        test_loss_list = []
        valid_loss_list = []
        valid_bleu_list = []
        best_loss = float('inf')
        best_bleu = 0.0
        bleu_score = -1.0
        self.tokens = 0 # counter used for learning rate decay
        for epoch in range(config.max_epochs):

            train_loss = run_epoch('train')
            train_loss_list.append(train_loss)
            if self.test_dataset is not None:
                test_loss = run_epoch('test')
                test_loss_list.append(test_loss)

            if self.valid_dataset is not None:
                valid_loss, bleu_score = run_epoch('valid')
                valid_loss_list.append(valid_loss)
                valid_bleu_list.append(bleu_score)

            # supports early stopping based on the test loss, or just save always if no test set is provided
            # good_model = self.test_dataset is None or test_loss < best_loss
            good_model = self.valid_dataset is None or bleu_score > best_bleu
            if self.config.ckpt_path is not None and good_model:
                best_loss = test_loss
                best_bleu = bleu_score
                self.save_checkpoint("_best")

            if epoch % 10 == 0:
                self.save_checkpoint(f"_{epoch}")

            self.save_checkpoint("_last")

        return train_loss_list, test_loss_list, valid_loss_list, valid_bleu_list


#Training

In [26]:

class Tokenizer:
    def __init__(self, data, vocab_size, vocab):
        self.vocab_size = vocab_size
        self.vocab = set(vocab)
        self.vocab_size = len(vocab)
        if self.vocab_size != vocab_size:
            logger.warn(f"Tokenizer len(vocab) != vocab_size: {len(self.vocab)} {vocab_size}")
        print(f"Tokenizer vocab_size={vocab_size} len(vocab)={len(self.vocab)}")
        self.stoi = {ch: i for i, ch in enumerate(vocab)}
        self.itos = {i: ch for i, ch in enumerate(vocab)}
    
    def tokenize(self, data, block_size):
        tokenized_text = data.split()
        # Filter empty strings
        tokenized_text = [x for x in tokenized_text if x]
        result = []
        for tokenized in tokenized_text:
            # In case other single # found, replace them with <unk> special token, marking the element as unknown
            if tokenized in self.vocab:
                result.append(tokenized)
            else:
                logger.warn(f"Tokenizer UNKNOWN TOKEN: |{tokenized}|")
                result.append('<unk>')

        # in case the sentence is longer, than block_size, we trim the sentence
        return result[:block_size]
    
    def encode(self, data):
        return [self.stoi[s] for s in data]
    
    def decode(self, data, clean_paddings=False):
        if hasattr(data, "shape") and len(data.shape) > 1:
            print("WARNING, unexpected data.shape:", data.shape)
            print(data)
        text = ' '.join([self.itos[int(i)] for i in data if int(i) >= 0])

        if not clean_paddings:
            return text
        return text.replace('<pad>', '').replace('  ', '')

In [27]:
# vocab_size = 10000

# vocab_input = None
# if os.path.exists('vocab_input.pkl'):
#     with open('vocab_input.pkl', 'rb') as f:
#         vocab_input = pickle.load(f)
        
# vocab_output = None
# if os.path.exists('vocab_output.pkl'):
#     with open('vocab_output.pkl', 'rb') as f:
#         vocab_output = pickle.load(f)

# building vocabluary can take some time. ~5 minutes for 10_000 tokens for each tokenizer. 
tokenizer_input = Tokenizer(text_input, vocab_size, list(joint_vocab))
tokenizer_output = Tokenizer(text_output, vocab_size, list(joint_vocab))

  logger.warn(f"Tokenizer len(vocab) != vocab_size: {len(self.vocab)} {vocab_size}")
Tokenizer len(vocab) != vocab_size: 10519 5500
Tokenizer len(vocab) != vocab_size: 10519 5500


Tokenizer vocab_size=5500 len(vocab)=10519
Tokenizer vocab_size=5500 len(vocab)=10519


In [28]:
# with open('vocab_input.pkl', 'wb') as f:
#     pickle.dump(tokenizer_input.vocab, f)

# with open('vocab_output.pkl', 'wb') as f:
#     pickle.dump(tokenizer_output.vocab, f)

In [29]:
assert len(text_input.splitlines()) == len(text_output.splitlines()), \
   f"{len(text_input.splitlines())} {len(text_output.splitlines())}"
# assert len(text_lv.splitlines()) == len(text_en.splitlines())
# assert len(text_lv.splitlines()) == len(text_input.splitlines())
line_idxs = list(range(len(text_input.splitlines())))
random.shuffle(line_idxs)
print(len(line_idxs), len(text_input.splitlines()))
# print(line_idxs[:10], line_idxs[-10:])

train_dataset_size = round(0.75 * len(line_idxs))
test_dataset_size = round(0.15 * len(line_idxs))
valid_dataset_size = round(0.1 * len(line_idxs))

train_idxs = line_idxs[:train_dataset_size]
test_idxs = line_idxs[train_dataset_size:train_dataset_size + test_dataset_size]
valid_idxs = line_idxs[-valid_dataset_size:]

assert len(train_idxs) + len(valid_idxs) + len(test_idxs) == len(line_idxs)
assert set(line_idxs) == set(train_idxs) | set(valid_idxs) | set(test_idxs)

1613611 1613611


In [30]:
print(text_input[:200])
print(f"{len(text_input.splitlines())}")

28 gadus vec@@ s pa@@ vār@@ s atra@@ sts mir@@ is S@@ an@@ f@@ ran@@ c@@ isko liel@@ veik@@ al@@ ā
28 gadus vec@@ s pa@@ vār@@ s , kurš nesen pār@@ cē@@ l@@ ies uz S@@ an@@ f@@ ran@@ c@@ isko , š@@ on
1613611


In [31]:
# Shuffle texts by lines
# texts = list(zip(text_output.splitlines(), text_input.splitlines()))
# random.shuffle(texts)
# output_texts, input_texts = zip(*texts)

In [32]:
# Split texts into train, test and validation datasets
# train_dataset_size = round(0.75 * len(output_texts))
# test_dataset_size = round(0.15 * len(output_texts))
# valid_dataset_size = round(0.1 * len(output_texts))

# train_input = input_texts[:train_dataset_size]
# test_input = input_texts[train_dataset_size:train_dataset_size + test_dataset_size]
# valid_input = input_texts[-valid_dataset_size:]

# train_output = output_texts[:train_dataset_size]
# test_output = output_texts[train_dataset_size:train_dataset_size + test_dataset_size]
# valid_output = output_texts[-valid_dataset_size:]

def separate_lines(text, train_idxs, valid_idxs, test_idxs):
    text_lines = text.splitlines()
    train_lines = [text_lines[idx] for idx in train_idxs]
    valid_lines = [text_lines[idx] for idx in valid_idxs]
    test_lines = [text_lines[idx] for idx in test_idxs]
    return train_lines, valid_lines, test_lines

train_input, valid_input, test_input = separate_lines(text_input, train_idxs, valid_idxs, test_idxs)

train_output, valid_output, test_output = separate_lines(text_output, train_idxs, valid_idxs, test_idxs)

print(len(train_input), len(valid_input), len(test_input))
assert len(train_input) == len(train_output)
assert len(valid_input) == len(valid_output)
assert len(test_input) == len(test_output)


In [None]:

#print(train_idxs[:20])
#print(valid_idxs[:20])
#print(test_idxs[:20])

with open("test_set.idxs", "w") as f:
    for idx in test_idxs:
        f.write(f"{idx}\n")
with open("valid_set.idxs", "w") as f:
    for idx in valid_idxs:
        f.write(f"{idx}\n")
with open("train_set.idxs", "w") as f:
    for idx in train_idxs:
        f.write(f"{idx}\n")

In [62]:
# Sanity checks
train_set = set(train_idxs)
valid_set = set(valid_idxs)
test_set = set(test_idxs)

print(len(train_idxs), len(valid_idxs), len(test_idxs))
print(len(train_set), len(valid_set), len(test_set))

assert len(train_set) ==  len(train_idxs)
assert len(test_set) ==  len(test_idxs)
assert len(valid_set) ==  len(valid_idxs)

assert len( train_set & valid_set ) == 0
assert len( train_set & test_set ) == 0
assert len( test_set & valid_set ) == 0

print("CHECK train_idxs")
input_lines = text_input.splitlines()
for i in range(len(train_idxs)):
    assert train_input[i] == input_lines[train_idxs[i]], f"[{i}]:\n\t{train_input[i]}\n\t{input_lines[train_idxs[i]]}"
print(i)
print()
print("CHECK valid_idxs")
for i in range(len(valid_idxs)):
    assert valid_input[i] == input_lines[valid_idxs[i]], f"[{i}]:\n\t{valid_input[i]}\n\t{input_lines[valid_idxs[i]]}"
print(i)
print()
print("CHECK test_idxs")
for i in range(len(test_idxs)):
    assert test_input[i] == input_lines[test_idxs[i]], f"[{i}]:\n\t{test_input[i]}\n\t{input_lines[test_idxs[i]]}"
print(i)


1210208 161361 242042
1210208 161361 242042
CHECK train_idxs
1210207

CHECK valid_idxs
161360

CHECK test_idxs
242041


In [33]:

with open('data/tilde/train2.lv', 'w') as f:
    f.write("\n".join(train_input))

with open('data/tilde/test2.lv', 'w') as f:
    f.write("\n".join(test_input))

with open('data/tilde/valid2.lv', 'w') as f:
    f.write("\n".join(valid_input))


with open('data/tilde/train2.en', 'w') as f:
    f.write("\n".join(train_output))

with open('data/tilde/test2.en', 'w') as f:
    f.write("\n".join(test_output))

with open('data/tilde/valid2.en', 'w') as f:
    f.write("\n".join(valid_output))


In [79]:

# with open('data/tilde/train2.lv', 'w') as f:
#     f.write("\n".join(train_input))

# with open('data/tilde/train2.en', 'w') as f:
#     f.write("\n".join(train_output))


assert _eval_limit == 10000

with open('data/tilde/test2_10000.lv', 'w') as f:
    f.write("\n".join(test_input[:_eval_limit]))

with open('data/tilde/valid2_10000.lv', 'w') as f:
    f.write("\n".join(valid_input[:_eval_limit]))


with open('data/tilde/test2_10000.en', 'w') as f:
    f.write("\n".join(test_output[:_eval_limit]))

with open('data/tilde/valid2_10000.en', 'w') as f:
    f.write("\n".join(valid_output[:_eval_limit]))


In [34]:
from torch.utils.data import Dataset

class WordDataset(Dataset):

    def __init__(self, output_text, input_text, tokenizer_output, tokenizer_input, block_size):
        self.tokenizer_output = tokenizer_output
        self.tokenizer_input = tokenizer_input

        self.block_size = block_size * 2 + 1
        self.output_text = [tokenizer_output.tokenize(t, block_size) for t in output_text]
        self.input_text = [tokenizer_input.tokenize(t, block_size) for t in input_text]

    def __len__(self):
        return len(self.output_text)

    def __getitem__(self, idx):
        """
        The idea is to get the input sentence
        and translate it to output sentence (sentences could be on any language).

        In the init method we already split a sentence into tokens and filled with spaces,
        to have an equal sentence size. In this method we just encode the tokens to
        ids (a list of numbers), and we're trying to map ids sequences
        """

        tokenized_input_text = self.tokenizer_input.encode(self.input_text[idx])
        tokenized_output_text = self.tokenizer_output.encode(self.output_text[idx])

        dix = tokenized_input_text + self.tokenizer_output.encode(['<eos>']) + tokenized_output_text
        if len(dix) < self.block_size:
            dix += self.tokenizer_output.encode(['<pad>']) * (self.block_size - len(dix))

        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        y[:len(tokenized_input_text) - 1] = -100

        return x, y

In [35]:
block_size = 100  # the estimate how long lines the text could be (token count)

import datetime
start_time = datetime.datetime.now()
print(f"================ encode Datasets - Start time: {start_time}")

# for faster debuging of Out of Memory during validation
_train_limit = len(train_output)   # 10000  # len(train_output)
_eval_limit = 10000   # -1    # 5000

train_dataset = WordDataset(train_output[:_train_limit], train_input[:_train_limit],
                            tokenizer_output, tokenizer_input, block_size)

if _eval_limit > 0:
    test_dataset = WordDataset(test_output[:_eval_limit], test_input[:_eval_limit],
                               tokenizer_output, tokenizer_input, block_size)
    valid_dataset = WordDataset(valid_output[:_eval_limit], valid_input[:_eval_limit],
                                tokenizer_output, tokenizer_input, block_size)
else:
    test_dataset = WordDataset(test_output, test_input,
                               tokenizer_output, tokenizer_input, block_size)
    valid_dataset = WordDataset(valid_output, valid_input,
                                tokenizer_output, tokenizer_input, block_size)

finish_time = datetime.datetime.now()
print(f"================ encode Datasets - Finished : {finish_time} -- elapsed: {finish_time-start_time}")




In [36]:
# NOTE: fixed, no longer shows UNKNOWN TOKEN

# joint_vocab -s 10000
# UNKNOWN TOKEN

# |;@@| (2040)  # I &@@ apos@@ ;@@ m
# |q@@| (148)
# |R| (40)
# |v| (409)
len(train_dataset)

1210208

In [37]:
number_of_heads = 8
number_of_layers = 6

# from mingpt.model import GPT, GPTConfig
embd_pdrop = 0.1
resid_pdrop = 0.1
attn_pdrop = 0.1

max_vocab = max(tokenizer_input.vocab_size, tokenizer_output.vocab_size)
mconf = GPTConfig(max_vocab, train_dataset.block_size,
                  n_layer=number_of_layers, n_head=number_of_heads, n_embd=512,
                  embd_pdrop=embd_pdrop, resid_pdrop=resid_pdrop, attn_pdrop=attn_pdrop)

model = GPT(mconf)

In [38]:
# from mingpt.trainer import Trainer, TrainerConfig

tokens_per_epoch = len(train_dataset) * block_size
train_epochs = 100
_batch_size = 128

# initialize a trainer instance and kick off training
tconf = TrainerConfig(max_epochs=train_epochs, 
                      batch_size=_batch_size, learning_rate=3e-4,
                      lr_decay=True, warmup_tokens=tokens_per_epoch, final_tokens=train_epochs*tokens_per_epoch,
                      ckpt_path='minGPT-Tilde-LV-EN-translator_model',
                      num_workers=1, weight_decay=0.0001, betas=(0.9, 0.98))
trainer = Trainer(model, train_dataset, test_dataset, valid_dataset, tconf)

In [39]:
param_count = sum([param.nelement() for param in model.parameters()])

print(f'Parameters count: {param_count}')

Parameters count: 29789696


In [40]:
# Parameters count: 28628480

In [41]:
train_loss_list, test_loss_list, valid_loss_list, valid_bleu_list = trainer.train()

epoch 1 iter 9454: train loss 0.37617. mean loss: 0.69961. lr 2.999637e-04: 100%|██████████| 9455/9455 [40:53<00:00,  3.85it/s]  

train loss: 0.6996148201601017





test loss: 0.36083900023110305
valid loss: 0.3672173690946796
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 29.023342985987902
##############################################################


epoch 2 iter 9454: train loss 0.28366. mean loss: 0.34594. lr 2.995702e-04: 100%|██████████| 9455/9455 [40:05<00:00,  3.93it/s]  

train loss: 0.34593802052537703





test loss: 0.28575764124906516
valid loss: 0.28697750821143764
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 36.45319734511089
##############################################################


epoch 3 iter 9454: train loss 0.27194. mean loss: 0.29497. lr 2.987451e-04: 100%|██████████| 9455/9455 [40:15<00:00,  3.91it/s]  

train loss: 0.2949696494876109





test loss: 0.26050413767748243
valid loss: 0.2612633097775375
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 39.23990755373973
##############################################################


epoch 4 iter 9454: train loss 0.29803. mean loss: 0.27224. lr 2.974907e-04: 100%|██████████| 9455/9455 [40:16<00:00,  3.91it/s]  

train loss: 0.2722373852633346





test loss: 0.2475800348233573
valid loss: 0.2479765484981899
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 40.65377349379643
##############################################################


epoch 5 iter 9454: train loss 0.24641. mean loss: 0.25836. lr 2.958108e-04: 100%|██████████| 9455/9455 [40:24<00:00,  3.90it/s]  

train loss: 0.25836024345196723





test loss: 0.23765818001348762
valid loss: 0.2390122851238975
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 41.7472891931476
##############################################################


epoch 6 iter 9454: train loss 0.24769. mean loss: 0.24861. lr 2.937100e-04: 100%|██████████| 9455/9455 [40:22<00:00,  3.90it/s]  

train loss: 0.24861045683867586





test loss: 0.232530456177796
valid loss: 0.23358417311801186
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 42.45037871859485
##############################################################


epoch 7 iter 9454: train loss 0.27818. mean loss: 0.24121. lr 2.911946e-04: 100%|██████████| 9455/9455 [40:34<00:00,  3.88it/s]  

train loss: 0.24120904027564635





test loss: 0.22838797769214533
valid loss: 0.22867248009277297
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 43.07260916009157
##############################################################


epoch 8 iter 9454: train loss 0.26262. mean loss: 0.23528. lr 2.882717e-04: 100%|██████████| 9455/9455 [40:14<00:00,  3.92it/s]  

train loss: 0.23527597282564494





test loss: 0.22521848580505274
valid loss: 0.22536798613735393
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 43.65638135527434
##############################################################


epoch 9 iter 9454: train loss 0.24683. mean loss: 0.23033. lr 2.849498e-04: 100%|██████████| 9455/9455 [40:49<00:00,  3.86it/s]  

train loss: 0.2303313877013926





test loss: 0.22130452568017983
valid loss: 0.22205081242549268
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 44.12289428224524
##############################################################


epoch 10 iter 9454: train loss 0.22590. mean loss: 0.22611. lr 2.812385e-04: 100%|██████████| 9455/9455 [40:27<00:00,  3.90it/s]  

train loss: 0.2261057166685067





test loss: 0.21927649612668193
valid loss: 0.21927647613271883
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 44.41774529250004
##############################################################


epoch 11 iter 9454: train loss 0.21838. mean loss: 0.22241. lr 2.771485e-04: 100%|██████████| 9455/9455 [40:14<00:00,  3.92it/s]  

train loss: 0.2224081448686621





test loss: 0.2157665193080902
valid loss: 0.2173823997189727
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 44.74593714633582
##############################################################


epoch 12 iter 9454: train loss 0.20836. mean loss: 0.21917. lr 2.726915e-04: 100%|██████████| 9455/9455 [40:41<00:00,  3.87it/s]  

train loss: 0.2191733108601956





test loss: 0.21537618395648425
valid loss: 0.2158568824016595
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 45.00463234087414
##############################################################


epoch 13 iter 9454: train loss 0.20313. mean loss: 0.21626. lr 2.678805e-04: 100%|██████████| 9455/9455 [40:31<00:00,  3.89it/s]  

train loss: 0.21625527119365462





test loss: 0.21373417615136014
valid loss: 0.21526709632782998
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 45.269307256196036
##############################################################


epoch 14 iter 9454: train loss 0.19538. mean loss: 0.21363. lr 2.627293e-04: 100%|██████████| 9455/9455 [40:26<00:00,  3.90it/s]  

train loss: 0.21363016688306768





test loss: 0.21166858850400658
valid loss: 0.21385281617882884
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 45.41680600923591
##############################################################


epoch 15 iter 9454: train loss 0.22387. mean loss: 0.21117. lr 2.572529e-04: 100%|██████████| 9455/9455 [40:28<00:00,  3.89it/s]  

train loss: 0.21116990949948714





test loss: 0.21177052640462224
valid loss: 0.21240954553779168
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 45.59820519734781
##############################################################


epoch 16 iter 9454: train loss 0.21955. mean loss: 0.20891. lr 2.514669e-04: 100%|██████████| 9455/9455 [40:26<00:00,  3.90it/s]  

train loss: 0.20890747796663336





test loss: 0.20916303897960276
valid loss: 0.21087813735762728
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 45.74848845976228
##############################################################


epoch 17 iter 9454: train loss 0.22276. mean loss: 0.20676. lr 2.453881e-04: 100%|██████████| 9455/9455 [40:24<00:00,  3.90it/s]  

train loss: 0.20675669054326906





test loss: 0.20803766782525218
valid loss: 0.21013710638390312
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 45.943301461503374
##############################################################


epoch 18 iter 9454: train loss 0.18144. mean loss: 0.20483. lr 2.390341e-04: 100%|██████████| 9455/9455 [40:29<00:00,  3.89it/s]  

train loss: 0.2048345522424142





test loss: 0.2080847983118854
valid loss: 0.20898202495484414
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 46.09281382821358
##############################################################


epoch 19 iter 9454: train loss 0.20288. mean loss: 0.20297. lr 2.324231e-04: 100%|██████████| 9455/9455 [40:29<00:00,  3.89it/s]  

train loss: 0.20296576087087756





test loss: 0.207252168202702
valid loss: 0.20917992859701567
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 46.153103416634124
##############################################################


epoch 20 iter 9454: train loss 0.21421. mean loss: 0.20121. lr 2.255743e-04: 100%|██████████| 9455/9455 [40:30<00:00,  3.89it/s]  

train loss: 0.2012118115619274





test loss: 0.20678809643546237
valid loss: 0.20792522818981846
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 46.406856579350915
##############################################################


epoch 21 iter 9454: train loss 0.16354. mean loss: 0.19950. lr 2.185074e-04: 100%|██████████| 9455/9455 [40:34<00:00,  3.88it/s]  

train loss: 0.19949543326226318





test loss: 0.20507498575916774
valid loss: 0.2068335598782648
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 46.628576311842565
##############################################################


epoch 22 iter 9454: train loss 0.20187. mean loss: 0.19793. lr 2.112428e-04: 100%|██████████| 9455/9455 [40:35<00:00,  3.88it/s]  

train loss: 0.19792634532887868





test loss: 0.20490934690342674
valid loss: 0.2065088828153248
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 46.66234109392401
##############################################################


epoch 23 iter 9454: train loss 0.18201. mean loss: 0.19637. lr 2.038015e-04: 100%|██████████| 9455/9455 [40:33<00:00,  3.89it/s]  

train loss: 0.19637382556243527





test loss: 0.2043918065632446
valid loss: 0.2047792848529695
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 46.7183603453369
##############################################################


epoch 24 iter 9454: train loss 0.19402. mean loss: 0.19490. lr 1.962049e-04: 100%|██████████| 9455/9455 [40:45<00:00,  3.87it/s]  

train loss: 0.19490345557614774





test loss: 0.20384961781622488
valid loss: 0.20556440704231022
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 46.8894360556575
##############################################################


epoch 25 iter 9454: train loss 0.17490. mean loss: 0.19347. lr 1.884750e-04: 100%|██████████| 9455/9455 [40:28<00:00,  3.89it/s]  

train loss: 0.19346737188835234





test loss: 0.2050862210460856
valid loss: 0.20475829854796204
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 46.88064001940165
##############################################################


epoch 26 iter 9454: train loss 0.20170. mean loss: 0.19209. lr 1.806340e-04: 100%|██████████| 9455/9455 [40:36<00:00,  3.88it/s]  

train loss: 0.19208716121790334





test loss: 0.2030955924640728
valid loss: 0.20354613605179364
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.109954288906174
##############################################################


epoch 27 iter 9454: train loss 0.15688. mean loss: 0.19080. lr 1.727047e-04: 100%|██████████| 9455/9455 [40:26<00:00,  3.90it/s]  

train loss: 0.19080226524427163





test loss: 0.20222395696217502
valid loss: 0.2042009289128871
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.27382218675211
##############################################################


epoch 28 iter 9454: train loss 0.19092. mean loss: 0.18950. lr 1.647098e-04: 100%|██████████| 9455/9455 [40:33<00:00,  3.89it/s]  

train loss: 0.18949553765581123





test loss: 0.20100688934326172
valid loss: 0.2018579433235941
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.23330071376662
##############################################################


epoch 29 iter 9454: train loss 0.20935. mean loss: 0.18825. lr 1.566725e-04: 100%|██████████| 9455/9455 [40:31<00:00,  3.89it/s]  

train loss: 0.18825023457494633





test loss: 0.2010484296309797
valid loss: 0.20340422550334206
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.409370359589175
##############################################################


epoch 30 iter 9454: train loss 0.18692. mean loss: 0.18699. lr 1.486160e-04: 100%|██████████| 9455/9455 [40:39<00:00,  3.88it/s]  

train loss: 0.1869891067997354





test loss: 0.2011563600618628
valid loss: 0.20210107538519026
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.560529580980116
##############################################################


epoch 31 iter 9454: train loss 0.18474. mean loss: 0.18582. lr 1.405634e-04: 100%|██████████| 9455/9455 [40:35<00:00,  3.88it/s]  

train loss: 0.18581868690009598





test loss: 0.2004196458979498
valid loss: 0.2022080445968652
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.588476973404354
##############################################################


epoch 32 iter 9454: train loss 0.18383. mean loss: 0.18469. lr 1.325381e-04: 100%|██████████| 9455/9455 [40:44<00:00,  3.87it/s]  

train loss: 0.1846891991002165





test loss: 0.1996266534434089
valid loss: 0.20358680679073818
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.661625183435305
##############################################################


epoch 33 iter 9454: train loss 0.19077. mean loss: 0.18353. lr 1.245631e-04: 100%|██████████| 9455/9455 [40:39<00:00,  3.88it/s]  

train loss: 0.183534984508877





test loss: 0.19887856818452665
valid loss: 0.20092384743539593
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.73602168591616
##############################################################


epoch 34 iter 9454: train loss 0.17051. mean loss: 0.18246. lr 1.166616e-04: 100%|██████████| 9455/9455 [40:33<00:00,  3.89it/s]  

train loss: 0.1824570844313381





test loss: 0.1995796736659883
valid loss: 0.20057060956200468
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.86665113728899
##############################################################


epoch 35 iter 9454: train loss 0.20175. mean loss: 0.18137. lr 1.088562e-04: 100%|██████████| 9455/9455 [40:38<00:00,  3.88it/s]  

train loss: 0.18136992486679385





test loss: 0.19864992457854597
valid loss: 0.20065974283821975
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.88581977204235
##############################################################


epoch 36 iter 9454: train loss 0.19712. mean loss: 0.18034. lr 1.011696e-04: 100%|██████████| 9455/9455 [40:36<00:00,  3.88it/s]  

train loss: 0.18034425895091782





test loss: 0.19923306586621684
valid loss: 0.1994970382391652
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.02032018617099
##############################################################


epoch 37 iter 9454: train loss 0.16863. mean loss: 0.17932. lr 9.362393e-05: 100%|██████████| 9455/9455 [40:42<00:00,  3.87it/s]  

train loss: 0.17932144434032207





test loss: 0.19972689393200452
valid loss: 0.20203917501847954
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.123408130312036
##############################################################


epoch 38 iter 9454: train loss 0.18200. mean loss: 0.17840. lr 8.624092e-05: 100%|██████████| 9455/9455 [40:31<00:00,  3.89it/s]  

train loss: 0.1783985232452688





test loss: 0.19790536768828768
valid loss: 0.2002376178397408
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.13085537766696
##############################################################


epoch 39 iter 9454: train loss 0.17471. mean loss: 0.17744. lr 7.904189e-05: 100%|██████████| 9455/9455 [40:34<00:00,  3.88it/s]  

train loss: 0.17744279825800882





test loss: 0.19784491424319112
valid loss: 0.2005579250900051
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.277171792616514
##############################################################


epoch 40 iter 9454: train loss 0.16865. mean loss: 0.17654. lr 7.204763e-05: 100%|██████████| 9455/9455 [42:55<00:00,  3.67it/s]  

train loss: 0.17654110790559818





test loss: 0.1980528703218774
valid loss: 0.1986544368010533
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.31917366907318
##############################################################


epoch 41 iter 9454: train loss 0.18227. mean loss: 0.17573. lr 6.527832e-05: 100%|██████████| 9455/9455 [48:10<00:00,  3.27it/s]  

train loss: 0.17573457686816618





test loss: 0.19793730427192735
valid loss: 0.19966118282909634
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.395523990694976
##############################################################


epoch 42 iter 9454: train loss 0.17776. mean loss: 0.17487. lr 5.875349e-05: 100%|██████████| 9455/9455 [41:09<00:00,  3.83it/s]  

train loss: 0.17486677899964584





test loss: 0.19823104633560665
valid loss: 0.19842650524423092
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.45918003592048
##############################################################


epoch 43 iter 9454: train loss 0.15216. mean loss: 0.17407. lr 5.249197e-05: 100%|██████████| 9455/9455 [49:23<00:00,  3.19it/s]  

train loss: 0.1740701851581279





test loss: 0.19738954437684408
valid loss: 0.19837068266506436
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.463979358418534
##############################################################


epoch 44 iter 9454: train loss 0.16014. mean loss: 0.17329. lr 4.651183e-05: 100%|██████████| 9455/9455 [44:45<00:00,  3.52it/s]  

train loss: 0.17329278724777328





test loss: 0.19883684493318388
valid loss: 0.1986763286816923
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.49418865821421
##############################################################


epoch 45 iter 9454: train loss 0.18177. mean loss: 0.17261. lr 4.083033e-05: 100%|██████████| 9455/9455 [45:44<00:00,  3.45it/s]  

train loss: 0.1726077829235953





test loss: 0.1967835024565081
valid loss: 0.1982685452020621
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.53441302345551
##############################################################


epoch 46 iter 9454: train loss 0.17749. mean loss: 0.17192. lr 3.546385e-05: 100%|██████████| 9455/9455 [49:01<00:00,  3.21it/s]  

train loss: 0.17192277030173211





test loss: 0.19700185803672934
valid loss: 0.19870617042613936
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.63323423462656
##############################################################


epoch 47 iter 9454: train loss 0.14104. mean loss: 0.17129. lr 3.042790e-05: 100%|██████████| 9455/9455 [48:59<00:00,  3.22it/s]  

train loss: 0.1712914384810903





test loss: 0.19920070186445984
valid loss: 0.19794500553155248
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.602968233400496
##############################################################


epoch 48 iter 9454: train loss 0.20204. mean loss: 0.17076. lr 3.000000e-05: 100%|██████████| 9455/9455 [49:05<00:00,  3.21it/s]  

train loss: 0.17075890384058925





test loss: 0.1962979328406008
valid loss: 0.19930218726019316
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.66759613322437
##############################################################


epoch 49 iter 9454: train loss 0.18914. mean loss: 0.17063. lr 3.000000e-05: 100%|██████████| 9455/9455 [49:06<00:00,  3.21it/s]  

train loss: 0.1706344845905183





test loss: 0.19695725414571882
valid loss: 0.19833394526680814
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.66008366042197
##############################################################


epoch 50 iter 9454: train loss 0.19319. mean loss: 0.17053. lr 3.000000e-05: 100%|██████████| 9455/9455 [48:58<00:00,  3.22it/s]  

train loss: 0.17052941837051064





test loss: 0.19821986424017557
valid loss: 0.1983115865459925
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.67672420109884
##############################################################


epoch 51 iter 9454: train loss 0.16709. mean loss: 0.17043. lr 3.000000e-05: 100%|██████████| 9455/9455 [49:03<00:00,  3.21it/s]  

train loss: 0.17042742783291387





test loss: 0.19660844493515883
valid loss: 0.1980167914040481
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.67497569816444
##############################################################


epoch 52 iter 9454: train loss 0.14669. mean loss: 0.17029. lr 3.000000e-05: 100%|██████████| 9455/9455 [48:59<00:00,  3.22it/s]  

train loss: 0.17029285709625322





test loss: 0.19613911400112924
valid loss: 0.19867383434048183
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.669754173433546
##############################################################


epoch 53 iter 9454: train loss 0.18860. mean loss: 0.17019. lr 3.000000e-05: 100%|██████████| 9455/9455 [49:01<00:00,  3.21it/s]  

train loss: 0.17018792826321819





test loss: 0.1963504245386848
valid loss: 0.19885536179512361
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.633725855044034
##############################################################


epoch 54 iter 9454: train loss 0.19103. mean loss: 0.17009. lr 3.000000e-05: 100%|██████████| 9455/9455 [48:54<00:00,  3.22it/s]  

train loss: 0.1700896316777771





test loss: 0.19719673655455625
valid loss: 0.19910381168504304
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.678039226953736
##############################################################


epoch 55 iter 9454: train loss 0.16854. mean loss: 0.16998. lr 3.000000e-05: 100%|██████████| 9455/9455 [48:51<00:00,  3.23it/s]  

train loss: 0.16997740764283176





test loss: 0.19753534261938893
valid loss: 0.1987545712839199
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.671821181483054
##############################################################


epoch 56 iter 9454: train loss 0.15969. mean loss: 0.16990. lr 3.000000e-05: 100%|██████████| 9455/9455 [49:09<00:00,  3.21it/s]  

train loss: 0.16990491046217118





test loss: 0.19853379002100305
valid loss: 0.19891305418708657
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.68985469988521
##############################################################


epoch 57 iter 9454: train loss 0.19053. mean loss: 0.16982. lr 3.000000e-05: 100%|██████████| 9455/9455 [48:59<00:00,  3.22it/s]  

train loss: 0.16981655132953355





test loss: 0.1975858030439932
valid loss: 0.19993347684039345
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.6112342089157
##############################################################


epoch 58 iter 9454: train loss 0.15873. mean loss: 0.16972. lr 3.000000e-05: 100%|██████████| 9455/9455 [49:07<00:00,  3.21it/s]  

train loss: 0.1697181096000056





test loss: 0.19676471586468852
valid loss: 0.19872604235063626
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.68102753441791
##############################################################


epoch 59 iter 9454: train loss 0.15122. mean loss: 0.16964. lr 3.000000e-05: 100%|██████████| 9455/9455 [48:59<00:00,  3.22it/s]  

train loss: 0.16963610662032158





test loss: 0.19659348963936674
valid loss: 0.19825328452677665
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.69478228381596
##############################################################


epoch 60 iter 9454: train loss 0.17749. mean loss: 0.16955. lr 3.000000e-05: 100%|██████████| 9455/9455 [49:07<00:00,  3.21it/s]  

train loss: 0.1695509258352342





test loss: 0.1966380807040613
valid loss: 0.19968913797336288
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.80495715291374
##############################################################


epoch 61 iter 9454: train loss 0.17364. mean loss: 0.16949. lr 3.000000e-05: 100%|██████████| 9455/9455 [41:09<00:00,  3.83it/s]  

train loss: 0.16949200037923962





test loss: 0.197903176463103
valid loss: 0.19911990418464323
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.77611517190088
##############################################################


epoch 62 iter 9454: train loss 0.18548. mean loss: 0.16941. lr 3.000000e-05: 100%|██████████| 9455/9455 [40:27<00:00,  3.89it/s]  

train loss: 0.1694119602709709





test loss: 0.19700280850446678
valid loss: 0.19770609974106657
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.73129038408612
##############################################################


epoch 63 iter 9454: train loss 0.19856. mean loss: 0.16932. lr 3.000000e-05: 100%|██████████| 9455/9455 [40:22<00:00,  3.90it/s]  

train loss: 0.1693200916122785





test loss: 0.19739639212059068
valid loss: 0.19915987060794346
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.69322224466296
##############################################################


epoch 64 iter 9454: train loss 0.16781. mean loss: 0.16928. lr 3.000000e-05: 100%|██████████| 9455/9455 [40:24<00:00,  3.90it/s]  

train loss: 0.16927531727450287





test loss: 0.19742316384858724
valid loss: 0.1974438459058351
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.694955531176085
##############################################################


epoch 65 iter 9454: train loss 0.17940. mean loss: 0.16921. lr 3.000000e-05: 100%|██████████| 9455/9455 [40:41<00:00,  3.87it/s]  

train loss: 0.16920688348447502





test loss: 0.1977805968704103
valid loss: 0.19841452565374254
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.66952644559219
##############################################################


epoch 66 iter 9454: train loss 0.16454. mean loss: 0.16913. lr 3.000000e-05: 100%|██████████| 9455/9455 [40:24<00:00,  3.90it/s]  

train loss: 0.1691279811432978





test loss: 0.19820142565648766
valid loss: 0.19872611289537406
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.75508320564905
##############################################################


epoch 67 iter 9454: train loss 0.15356. mean loss: 0.16907. lr 3.000000e-05: 100%|██████████| 9455/9455 [40:27<00:00,  3.90it/s]  

train loss: 0.16907428263877197





test loss: 0.19878603838667086
valid loss: 0.19846681477148323
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.69902080717255
##############################################################


epoch 68 iter 9454: train loss 0.17852. mean loss: 0.16900. lr 3.000000e-05: 100%|██████████| 9455/9455 [40:30<00:00,  3.89it/s]  

train loss: 0.16899603326001664





test loss: 0.19722337586970268
valid loss: 0.19716574911829793
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.72282505274609
##############################################################


epoch 69 iter 9454: train loss 0.17666. mean loss: 0.16895. lr 3.000000e-05: 100%|██████████| 9455/9455 [40:42<00:00,  3.87it/s]  

train loss: 0.16895140435867748





test loss: 0.19724931596200676
valid loss: 0.19788130438780482
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.694214787543046
##############################################################


epoch 70 iter 9454: train loss 0.16366. mean loss: 0.16886. lr 3.000000e-05: 100%|██████████| 9455/9455 [40:37<00:00,  3.88it/s]  

train loss: 0.16885569068346623





test loss: 0.19807158353962476
valid loss: 0.19866539915151235
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.67482783960515
##############################################################


epoch 71 iter 9454: train loss 0.14708. mean loss: 0.16880. lr 3.000000e-05: 100%|██████████| 9455/9455 [40:29<00:00,  3.89it/s]  

train loss: 0.1687989980904341





test loss: 0.19863804288302797
valid loss: 0.19871157634107373
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.75267128174804
##############################################################


epoch 72 iter 9454: train loss 0.18545. mean loss: 0.16888. lr 3.478137e-05: 100%|██████████| 9455/9455 [40:48<00:00,  3.86it/s]  

train loss: 0.16888382901973917





test loss: 0.1970400936618636
valid loss: 0.1990582248832606
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.77091486473016
##############################################################


epoch 73 iter 9454: train loss 0.16607. mean loss: 0.16933. lr 4.010518e-05: 100%|██████████| 9455/9455 [40:40<00:00,  3.87it/s]  

train loss: 0.16932679753827393





test loss: 0.19701051315929316
valid loss: 0.19844734027415892
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.71952830943483
##############################################################


epoch 74 iter 9454: train loss 0.17592. mean loss: 0.16978. lr 4.574611e-05: 100%|██████████| 9455/9455 [40:52<00:00,  3.86it/s]  

train loss: 0.16978250025662714





test loss: 0.19702990202209616
valid loss: 0.1986257650806934
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.67154966305596
##############################################################


epoch 75 iter 9454: train loss 0.15324. mean loss: 0.17027. lr 5.168789e-05: 100%|██████████| 9455/9455 [40:32<00:00,  3.89it/s]  

train loss: 0.17026908665911222





test loss: 0.1970403081254114
valid loss: 0.19885391657111012
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.65970051180914
##############################################################


epoch 76 iter 9454: train loss 0.17562. mean loss: 0.17086. lr 5.791337e-05: 100%|██████████| 9455/9455 [40:48<00:00,  3.86it/s]  

train loss: 0.1708561425444501





test loss: 0.19795357992377463
valid loss: 0.1994199643406687
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.50685082254446
##############################################################


epoch 77 iter 9454: train loss 0.16062. mean loss: 0.17144. lr 6.440458e-05: 100%|██████████| 9455/9455 [40:39<00:00,  3.88it/s]  

train loss: 0.17143972114650488





test loss: 0.1991728174535534
valid loss: 0.20112929785553413
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.576434063098034
##############################################################


epoch 78 iter 9454: train loss 0.16938. mean loss: 0.17204. lr 7.114279e-05: 100%|██████████| 9455/9455 [40:38<00:00,  3.88it/s]  

train loss: 0.17204295803099823





test loss: 0.198777608101881
valid loss: 0.20105428126039385
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.388325429701375
##############################################################


epoch 79 iter 9454: train loss 0.19931. mean loss: 0.17270. lr 7.810857e-05: 100%|██████████| 9455/9455 [40:42<00:00,  3.87it/s]  

train loss: 0.17269757152832985





test loss: 0.19935764485522162
valid loss: 0.2001549912781655
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.46197795570353
##############################################################


epoch 80 iter 9454: train loss 0.16519. mean loss: 0.17332. lr 8.528180e-05: 100%|██████████| 9455/9455 [40:36<00:00,  3.88it/s]  

train loss: 0.17332037286916654





test loss: 0.19935164025312738
valid loss: 0.20039620731450333
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.394469102833625
##############################################################


epoch 81 iter 9454: train loss 0.16398. mean loss: 0.17404. lr 9.264179e-05: 100%|██████████| 9455/9455 [40:32<00:00,  3.89it/s]  

train loss: 0.17403672320511152





test loss: 0.19996607020685944
valid loss: 0.1999004967982256
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.34497538940401
##############################################################


epoch 82 iter 9454: train loss 0.16953. mean loss: 0.17468. lr 1.001673e-04: 100%|██████████| 9455/9455 [40:36<00:00,  3.88it/s]  

train loss: 0.17467848808440123





test loss: 0.19998389256151416
valid loss: 0.2004095364975024
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.328365736030136
##############################################################


epoch 83 iter 9454: train loss 0.17688. mean loss: 0.17543. lr 1.078366e-04: 100%|██████████| 9455/9455 [40:49<00:00,  3.86it/s]  

train loss: 0.17542714966566458





test loss: 0.19928333823439442
valid loss: 0.20070798827123038
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.25512303314671
##############################################################


epoch 84 iter 9454: train loss 0.14551. mean loss: 0.17606. lr 1.156276e-04: 100%|██████████| 9455/9455 [40:35<00:00,  3.88it/s]  

train loss: 0.176057551307126





test loss: 0.19937179341346403
valid loss: 0.2012943697126606
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.267756170799515
##############################################################


epoch 85 iter 9454: train loss 0.14598. mean loss: 0.17677. lr 1.235178e-04: 100%|██████████| 9455/9455 [40:36<00:00,  3.88it/s]  

train loss: 0.17677004064030902





test loss: 0.19948889616923995
valid loss: 0.20097028794167918
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.1103120350795
##############################################################


epoch 86 iter 9454: train loss 0.17430. mean loss: 0.17749. lr 1.314843e-04: 100%|██████████| 9455/9455 [40:46<00:00,  3.86it/s]  

train loss: 0.17749227563235445





test loss: 0.19963548172123824
valid loss: 0.2004886892022966
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.06296938168104
##############################################################


epoch 87 iter 9454: train loss 0.17235. mean loss: 0.17818. lr 1.395044e-04: 100%|██████████| 9455/9455 [40:31<00:00,  3.89it/s]  

train loss: 0.17818400097202208





test loss: 0.20057790528369857
valid loss: 0.20243212976787664
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.05716546219544
##############################################################


epoch 88 iter 9454: train loss 0.18126. mean loss: 0.17888. lr 1.475547e-04: 100%|██████████| 9455/9455 [40:33<00:00,  3.88it/s]  

train loss: 0.17887666114797546





test loss: 0.20142068968543522
valid loss: 0.2021064054739626
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.98130348808379
##############################################################


epoch 89 iter 9454: train loss 0.15231. mean loss: 0.17952. lr 1.556120e-04: 100%|██████████| 9455/9455 [40:29<00:00,  3.89it/s]  

train loss: 0.17952325096021943





test loss: 0.2009637961282006
valid loss: 0.20178766020491154
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 48.02709045658239
##############################################################


epoch 90 iter 9454: train loss 0.19307. mean loss: 0.18014. lr 1.636532e-04: 100%|██████████| 9455/9455 [40:41<00:00,  3.87it/s]  

train loss: 0.1801365806683228





test loss: 0.20121427691435512
valid loss: 0.2021710883967484
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.9492516730295
##############################################################


epoch 91 iter 9454: train loss 0.16281. mean loss: 0.18075. lr 1.716550e-04: 100%|██████████| 9455/9455 [40:50<00:00,  3.86it/s]  

train loss: 0.1807451027322862





test loss: 0.20152146831343445
valid loss: 0.20421527024311356
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.79275772477171
##############################################################


epoch 92 iter 9454: train loss 0.16926. mean loss: 0.18136. lr 1.795943e-04: 100%|██████████| 9455/9455 [40:40<00:00,  3.87it/s]  

train loss: 0.18135790154298484





test loss: 0.2012810803289655
valid loss: 0.20241985102243062
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.87051795395237
##############################################################


epoch 93 iter 9454: train loss 0.19372. mean loss: 0.18189. lr 1.874481e-04: 100%|██████████| 9455/9455 [40:33<00:00,  3.88it/s]  

train loss: 0.18188677887116583





test loss: 0.20202962111068679
valid loss: 0.20348373508151574
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.79933448355385
##############################################################


epoch 94 iter 9454: train loss 0.18075. mean loss: 0.18242. lr 1.951940e-04: 100%|██████████| 9455/9455 [40:42<00:00,  3.87it/s]  

train loss: 0.18242199795230996





test loss: 0.20235751663582235
valid loss: 0.20378367527376248
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.729281126161595
##############################################################


epoch 95 iter 9454: train loss 0.20112. mean loss: 0.18285. lr 2.028094e-04: 100%|██████████| 9455/9455 [48:19<00:00,  3.26it/s]  

train loss: 0.18285113244949353





test loss: 0.20127036186713207
valid loss: 0.20184963073911547
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.58523074831814
##############################################################


epoch 96 iter 9454: train loss 0.19049. mean loss: 0.18330. lr 2.102724e-04: 100%|██████████| 9455/9455 [49:13<00:00,  3.20it/s]  

train loss: 0.18330140532418449





test loss: 0.20173621347433404
valid loss: 0.20424867619442033
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.61227113404924
##############################################################


epoch 97 iter 9454: train loss 0.17060. mean loss: 0.18368. lr 2.175614e-04: 100%|██████████| 9455/9455 [49:10<00:00,  3.20it/s]  

train loss: 0.1836807532451696





test loss: 0.20253652296488797
valid loss: 0.20350500846965403
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.73258574655407
##############################################################


epoch 98 iter 9454: train loss 0.20826. mean loss: 0.18401. lr 2.246556e-04: 100%|██████████| 9455/9455 [49:23<00:00,  3.19it/s]  

train loss: 0.18401224735330363





test loss: 0.20398859604250028
valid loss: 0.2038825695650487
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.51812430259157
##############################################################


epoch 99 iter 9454: train loss 0.17610. mean loss: 0.18428. lr 2.315342e-04: 100%|██████████| 9455/9455 [49:34<00:00,  3.18it/s]  

train loss: 0.18428413553851927





test loss: 0.20255496154857588
valid loss: 0.2034222972166689
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.53716286367059
##############################################################


epoch 100 iter 9454: train loss 0.16800. mean loss: 0.18453. lr 2.381776e-04: 100%|██████████| 9455/9455 [49:15<00:00,  3.20it/s]  

train loss: 0.18452731620408064





test loss: 0.20424895290332504
valid loss: 0.20371557772159576
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
Detokenizer Version $Revision: 4134 $
Language: lv
BLEU: 47.603698119136624
##############################################################


In [73]:
epochs = range(len(test_loss_list))
# plt.subplots(nrows=number_of_layers, ncols=number_of_heads, figsize=(30, 20))
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 10))
axs[0].plot(epochs, train_loss_list)
axs[0].plot(epochs, test_loss_list)
axs[0].plot(epochs, valid_loss_list)
axs[0].set_title('Train vs Valid/Test loss')
# axs[0].set_xlabel('Epochs')
axs[0].set_ylabel('Loss')

axs[1].plot(epochs, test_loss_list)
axs[1].set_title('Validation & Test losses')
# axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('Loss')

axs[1].plot(epochs, valid_loss_list)
#axs[1].set_title('Validation loss')
#axs[1].set_xlabel('Epochs')
#axs[1].set_ylabel('Loss')

axs[2].plot(epochs, valid_bleu_list)
axs[2].set_title('Validation BLEU')
axs[2].set_xlabel('Epochs')
axs[2].set_ylabel('BLEU')

plt.show()

  plt.show()


In [74]:
plt.savefig("tilde_losses.png")


#Evaluate

In [76]:
print(train_loss_list)
print()
max_bleu = max(valid_bleu_list)
best_bleu_epoch = valid_bleu_list.index(max_bleu)
print(f"Max BLEU: [{best_bleu_epoch}] {max_bleu}")

[0.6996148201601017, 0.34593802052537703, 0.2949696494876109, 0.2722373852633346, 0.25836024345196723, 0.24861045683867586, 0.24120904027564635, 0.23527597282564494, 0.2303313877013926, 0.2261057166685067, 0.2224081448686621, 0.2191733108601956, 0.21625527119365462, 0.21363016688306768, 0.21116990949948714, 0.20890747796663336, 0.20675669054326906, 0.2048345522424142, 0.20296576087087756, 0.2012118115619274, 0.19949543326226318, 0.19792634532887868, 0.19637382556243527, 0.19490345557614774, 0.19346737188835234, 0.19208716121790334, 0.19080226524427163, 0.18949553765581123, 0.18825023457494633, 0.1869891067997354, 0.18581868690009598, 0.1846891991002165, 0.183534984508877, 0.1824570844313381, 0.18136992486679385, 0.18034425895091782, 0.17932144434032207, 0.1783985232452688, 0.17744279825800882, 0.17654110790559818, 0.17573457686816618, 0.17486677899964584, 0.1740701851581279, 0.17329278724777328, 0.1726077829235953, 0.17192277030173211, 0.1712914384810903, 0.17075890384058925, 0.1706344

In [45]:
checkpoint = torch.load('minGPT-Tilde-LV-EN-translator_model_best.pt')
model.load_state_dict(checkpoint)

<All keys matched successfully>

In [46]:
with open('tilde_train_loss.txt', 'w') as f:
    f.write('\n'.join([str(s) for s in train_loss_list]))


In [47]:
with open('tilde_test_loss.txt', 'w') as f:
    f.write('\n'.join([str(s) for s in test_loss_list]))

with open('tilde_valid_loss.txt', 'w') as f:
    f.write('\n'.join([str(s) for s in valid_loss_list]))

with open('tilde_valid_bleu.txt', 'w') as f:
    f.write('\n'.join([str(s) for s in valid_bleu_list]))

In [48]:
from random import choice

for _ in range(5):
    idx = choice(range(len(valid_output)))

    context = valid_input[idx]
    encoded_input = tokenizer_input.encode(tokenizer_input.tokenize(context, block_size))
    x = torch.tensor(encoded_input, dtype=torch.long)[None,...].to(trainer.device)
    y = sample(model, x, block_size, temperature=1.0, sample=False, top_k=10)[0]

    intent = len(encoded_input) + 1

    predicted = y[intent:]
    completion = tokenizer_output.decode(predicted, True)
    print(f'Input:            {context}')
    print(f'Predicted output: {completion}')
    print(f'Real output:      {valid_output[idx]}')
    print('--------------------------------------------------')

Input:            šim ziņo@@ jumam var sek@@ ot tiesību akta priekšlikums .
Predicted output: this report may be followed by a legislative proposal . 
Real output:      this report may be accompanied by a legislative proposal .
--------------------------------------------------
Input:            viņi pa@@ met savu valsti , jo tajā nav kar@@ jer@@ as iespēju , piemērotu al@@ gu vai līdzekļu pēt@@ niecībai , pētniecības centr@@ u dur@@ vis ir slēg@@ tas , jo tiem trūkst gan līdzekļu , gan organizācijas , lai uzņem@@ tu jaunas grupas un attīst@@ ītu jaunas ide@@ jas .
Predicted output: they leave their country because they do not have the care@@ er option , apply the sal@@ ary or the funds for research , the do@@ or of research centres is closed because they lack both resources and organisations to take up new groups and develop new ideas .
Real output:      they leave because there are no care@@ er pro@@ spects , suitable sal@@ aries or funds for research , and the do@@ ors to research c

In [49]:
idx = choice(range(len(valid_output)))

context = valid_input[idx]
encoded_input = tokenizer_input.encode(tokenizer_input.tokenize(context, block_size))
x = torch.tensor(encoded_input, dtype=torch.long)[None,...].to(trainer.device)
y, attention_state = sample(model, x, block_size, temperature=1.0, sample=False, top_k=10, output_attention=True)

intent = len(encoded_input) + 1

predicted = y[0][intent:]
completion = tokenizer_output.decode(predicted,)
print(f'Input:            {context}')
print(f'Predicted output: {completion}')
print(f'Real output:      {valid_output[idx]}')
print('--------------------------------------------------')


Input:            III pielikuma apakš@@ vir@@ s@@ rakst@@ a &quot; Ne@@ tie@@ šie pasākumi &quot; 6. punkts
Predicted output: Annex III , sub@@ t@@ itle &apos; In@@ direct actions &apos; , paragraph 6 <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Real output:      Annex III , sub@@ t@@ itle &quot; In@@ direct A@@ ctions , &quot; paragraph 6
--------------------------------------------------


In [50]:
fig, plots = plt.subplots(nrows=number_of_layers, ncols=number_of_heads, figsize=(30, 20))

axis_text = tokenizer_input.decode(encoded_input, True).split()

axis_text.append('<eos>')

axis_text += tokenizer_input.decode(predicted, True).split()

limit = len(axis_text)
for bi in range(number_of_layers):
    for hi in range(number_of_heads):
        attetion_plot = torch.zeros(limit, limit)
        for di in range(limit):
            attetion_plot[:di, :di] = attention_state[bi][di][0,hi,:di,:di].data

        ax = plots[bi][hi]
        ax.matshow(attetion_plot.numpy(), cmap='bone')

        # Set up axes
        ax.set_xticklabels([''] + axis_text, rotation=90)
        ax.set_yticklabels([''] + axis_text)

        # Show label at every tick
        ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
        ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

        # Set up a title
        ax.set_title(f'Block {bi + 1} Head {hi + 1}', size=25, pad=30)
        
plt.show()

AttributeError: 'list' object has no attribute 'shape'

In [None]:
# In case the previous cell is not plotting anything, uncomment the code below and execute. After that, the plotting should be fine.
# %matplotlib inline
# import numpy as np
# x = np.linspace(0, 10, 100)

# fig = plt.figure()
# plt.plot(x, np.sin(x), '-')
# plt.plot(x, np.cos(x), '--');

#Calculate BLEU

In [None]:
def clean_tokens(sentence):
    return sentence.replace('@@ ', '').replace(' @', '').replace('@ ', '')

In [None]:

if False:
    translation_results = []
    eval_text = []
    bleu_results = []
    for idx, context in enumerate(valid_input):
        encoded_input = tokenizer_input.encode(tokenizer_input.tokenize(context, block_size))
        x = torch.tensor(encoded_input, dtype=torch.long)[None,...].to(trainer.device)
        y = sample(model, x, block_size, temperature=1.0, sample=False, top_k=10)[0]

        intent = len(encoded_input) + 1
        predicted = y[intent:]
        completion = clean_tokens(tokenizer_output.decode(predicted, True))
        translation_results.append(completion)

        eval = clean_tokens(valid_output[idx])
        eval_text.append(eval)
        # bleu = sentence_bleu([eval], completion, smoothing_function=smooth)
        # bleu_results.append(bleu)

    with open('tilde_valid.out', 'w') as f:
        f.write("\n".join(translation_results))

    with open('tilde_valid.ref', 'w') as f:
        f.write("\n".join(eval_text))


In [None]:
# joint_vocab -s 10000
# UNKNOWN TOKEN

# |v|

In [None]:
#!perl mosesdecoder/scripts/generic/multi-bleu.perl tilde_valid.ref < tilde_valid.out

In [None]:
# BLEU = 7.92, 38.4/12.4/4.2/2.0 (BP=1.000, ratio=1.021, hyp_len=9711, ref_len=9509)

# joint_vocab -s 10,000
# BLEU = 8.61, 44.4/15.1/5.5/2.8 (BP=0.852, ratio=0.862, hyp_len=8198, ref_len=9509)

# full joint_vocab
# BLEU = 9.18, 41.7/14.1/5.4/2.8 (BP=0.948, ratio=0.950, hyp_len=9030, ref_len=9509)

# model_best.pt
# BLEU = 13.47, 48.0/19.6/9.4/5.5 (BP=0.908, ratio=0.912, hyp_len=8670, ref_len=9509)

In [None]:
#!cat tilde_valid.out | mosesdecoder/scripts/tokenizer/detokenizer.perl -l lv > tilde_valid.detok.out
#!cat tilde_valid.ref | mosesdecoder/scripts/tokenizer/detokenizer.perl -l lv > tilde_valid.detok.ref

In [None]:
#!pip install sacrebleu
#!pip show sacrebleu

In [None]:
#import sacrebleu

#with open('tilde_valid.detok.ref', 'r') as f:
#    eval_ref = [l.strip() for l in f.readlines()]
#with open('tilde_valid.detok.out', 'r') as f:
#    translation_results = [l.strip() for l in f.readlines()]

#refs = [eval_ref]
#sys = translation_results
#bleu = sacrebleu.corpus_bleu(sys, refs)
#print(bleu.score)

In [None]:
# 7.918993465381516
# joint_vocab -s 10000  8.534786641173136

# full joint_vocab 9.174070997058795

# model_best.pt 
13.481896471451254

#Interactive translator

In [None]:
context = input("Enter your English text to translate: ")

# Predict Latvian output
encoded_input = tokenizer_input.encode(tokenizer_input.tokenize(context, block_size))
x = torch.tensor(encoded_input, dtype=torch.long)[None,...].to(trainer.device)
y, attention_state = sample(model, x, block_size, temperature=1.0, sample=False, top_k=10, output_attention=True)

intent = len(encoded_input) + 1

predicted = y[0][intent:]
completion = tokenizer_output.decode(predicted, True)
print(f'Input:            {context}')
print(f'Predicted output: {completion}')


# Plot attention
fig, plots = plt.subplots(nrows=number_of_layers, ncols=number_of_heads, figsize=(30, 20))

axis_text = tokenizer_input.decode(encoded_input, True).split()

axis_text.append('<eos>')

axis_text += tokenizer_input.decode(predicted, True).split()

limit = len(axis_text)
for bi in range(number_of_layers):
    for hi in range(number_of_heads):
        attetion_plot = torch.zeros(limit, limit)
        for di in range(limit):
            attetion_plot[:di, :di] = attention_state[bi][di][0,hi,:di,:di].data

        ax = plots[bi][hi]
        ax.matshow(attetion_plot.numpy(), cmap='bone')

        # Set up axes
        ax.set_xticklabels([''] + axis_text, rotation=90)
        ax.set_yticklabels([''] + axis_text)

        # Show label at every tick
        ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
        ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

        # Set up a title
        ax.set_title(f'Block {bi + 1} Head {hi + 1}', size=25, pad=30)
        
plt.show()