# imports and downloads


In [1]:
!pip install transformers datasets numpy wandb sentencepiece tqdm pandas sacrebleu git+https://github.com/google-research/bleurt.git

Collecting git+https://github.com/google-research/bleurt.git
  Cloning https://github.com/google-research/bleurt.git to /tmp/pip-req-build-m7tcurkb
  Running command git clone -q https://github.com/google-research/bleurt.git /tmp/pip-req-build-m7tcurkb


In [2]:
import torch
import torch.nn.functional as F
import transformers
import numpy as np
from datasets import load_dataset
from tqdm.auto import tqdm
from datasets.arrow_dataset import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import wandb
from datasets import load_metric
from datasets import Dataset
import pandas as pd

## model and tokenizer download


In [3]:
model_name = "google/mt5-small"

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
from transformers import AutoTokenizer, T5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

model = T5ForConditionalGeneration.from_pretrained(model_name)

model = model.to(device)

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


## datasets

In [5]:
# forward translation
en_hu_dataset = load_dataset("tatoeba", lang1="en", lang2="hu") 
de_hu_dataset = load_dataset("tatoeba", lang1="de", lang2="hu")
de_es_dataset = load_dataset("tatoeba", lang1="de", lang2="es")
es_ru_dataset = load_dataset("tatoeba", lang1="es", lang2="ru")

Using custom data configuration en-hu-lang1=en,lang2=hu
Reusing dataset tatoeba (/root/.cache/huggingface/datasets/tatoeba/en-hu-lang1=en,lang2=hu/0.0.0/b3ea9c6bb2af47699c5fc0a155643f5a0da287c7095ea14824ee0a8afd74daf6)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration de-hu-lang1=de,lang2=hu
Reusing dataset tatoeba (/root/.cache/huggingface/datasets/tatoeba/de-hu-lang1=de,lang2=hu/0.0.0/b3ea9c6bb2af47699c5fc0a155643f5a0da287c7095ea14824ee0a8afd74daf6)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration de-es-lang1=de,lang2=es
Reusing dataset tatoeba (/root/.cache/huggingface/datasets/tatoeba/de-es-lang1=de,lang2=es/0.0.0/b3ea9c6bb2af47699c5fc0a155643f5a0da287c7095ea14824ee0a8afd74daf6)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration es-ru-lang1=es,lang2=ru
Reusing dataset tatoeba (/root/.cache/huggingface/datasets/tatoeba/es-ru-lang1=es,lang2=ru/0.0.0/b3ea9c6bb2af47699c5fc0a155643f5a0da287c7095ea14824ee0a8afd74daf6)


  0%|          | 0/1 [00:00<?, ?it/s]

# Preprocessing


## setting up datasets

at the moment the datasets are dataset dicts holding just the dataset train which then holds "id" and translation" where translation is a list of Dicts holding a source language and target language. We only need the translations as the id's are parallel nor do we need them as translation holds our technical input ids and decoder ids in the source and target languages

In [6]:
en_hu_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 110177
    })
})

here i try to make a dataset of just the source and target languages, because again thats all i need. plan to do so using dataframe and while doing so begin to split datasets into train and eval sets, as it is easier to split them here as arrays rather than later as datasets 

In [7]:
# train dataframes
en_hu_pd = pd.DataFrame(data=en_hu_dataset['train']['translation'])
de_hu_pd = pd.DataFrame(data=de_hu_dataset['train']['translation'])
de_es_pd = pd.DataFrame(data=de_es_dataset['train']['translation'])
es_ru_pd = pd.DataFrame(data=es_ru_dataset['train']['translation'])

In [8]:
en_hu = Dataset.from_pandas(en_hu_pd)
print(en_hu[0])
en_hu

{'en': "Let's try something.", 'hu': 'Próbáljunk ki valamit.'}


Dataset({
    features: ['en', 'hu'],
    num_rows: 110177
})

In [9]:
# now that we see using pandas works repeat the process for the other three datasets
de_hu = Dataset.from_pandas(de_hu_pd)
de_es = Dataset.from_pandas(de_es_pd)
es_ru = Dataset.from_pandas(es_ru_pd)

Now i have to make backward datasets and i will try doing so with the features param of from pandas

In [10]:
# temp_arr = list(en_hu_pd.columns)
# temp_arr = temp_arr[-1:]+temp_arr[:-1]
hu_en = Dataset.from_pandas(en_hu_pd[reversed(list(en_hu_pd.columns))])
print(hu_en[0])
hu_en

{'hu': 'Próbáljunk ki valamit.', 'en': "Let's try something."}


Dataset({
    features: ['hu', 'en'],
    num_rows: 110177
})

In [11]:
# now that that works time to copy it for the rest
hu_de = Dataset.from_pandas(de_hu_pd[reversed(list(de_hu_pd.columns))])
es_de = Dataset.from_pandas(de_es_pd[reversed(list(de_es_pd.columns))])
ru_es = Dataset.from_pandas(es_ru_pd[reversed(list(es_ru_pd.columns))])


so now that we have all our datasets set up now its time to define our tokens and start formatting out data

In [12]:
lang_tokens = {'en': '<en>', 'hu': '<hu>', 'ru': '<ru>', 'de': '<de>', 'es': '<es>'}

lang_tok_map = {"additional_special_tokens" : list(lang_tokens.values())}
tokenizer.add_special_tokens(lang_tok_map)
model.resize_token_embeddings(len(tokenizer))

Embedding(250105, 512)

In [13]:
unprocessed_datasets = [en_hu, hu_de, de_es, es_ru, hu_en, de_hu, es_de, ru_es]

In [14]:
split_datasets = []
for dataset in unprocessed_datasets:
  split_datasets.append(dataset.train_test_split(test_size=0.3))

In [15]:
for dataset in split_datasets:
  print(dataset)

DatasetDict({
    train: Dataset({
        features: ['en', 'hu'],
        num_rows: 77123
    })
    test: Dataset({
        features: ['en', 'hu'],
        num_rows: 33054
    })
})
DatasetDict({
    train: Dataset({
        features: ['hu', 'de'],
        num_rows: 48592
    })
    test: Dataset({
        features: ['hu', 'de'],
        num_rows: 20826
    })
})
DatasetDict({
    train: Dataset({
        features: ['de', 'es'],
        num_rows: 59903
    })
    test: Dataset({
        features: ['de', 'es'],
        num_rows: 25673
    })
})
DatasetDict({
    train: Dataset({
        features: ['es', 'ru'],
        num_rows: 68186
    })
    test: Dataset({
        features: ['es', 'ru'],
        num_rows: 29223
    })
})
DatasetDict({
    train: Dataset({
        features: ['hu', 'en'],
        num_rows: 77123
    })
    test: Dataset({
        features: ['hu', 'en'],
        num_rows: 33054
    })
})
DatasetDict({
    train: Dataset({
        features: ['de', 'hu'],
        num_r

## functions


In [16]:
def add_prefix(example,seq_len = 128):
  source, target = example.keys()
  targ_tkn = lang_tokens[target]
  example[source] = targ_tkn + ' ' + example[source]
  return example

In [17]:
def preprocess_function(example, seq_len=128, Lang_token_map = lang_tokens):
  source, target = list(example.keys())
  task_prefix = Lang_token_map[target]
  encoding = tokenizer(
    task_prefix + example[source],
    padding="max_length",
    max_length=seq_len,
    truncation=True,
    return_tensors="pt",
  )
  input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

  target_encoding = tokenizer(example[target], padding="max_length", max_length=seq_len, truncation=True)
  labels = target_encoding.input_ids

  out_cols = ['input_ids', 'attn_mask', 'labels']
  out_items = [input_ids, attention_mask, labels]

  return dict(zip(out_cols, out_items))

## param cell


In [18]:
#############Model Params################
num_epochs = 1
lr = 2e-5
batch_size = 8
weight_decay = 0.01
# max_steps = 10
eval_every = 10
# warmup_steps = 5000
#########################################

## actual preprocessing


In [20]:
# lets now format all our data
encoded_datasets = []
for dataset in split_datasets:
  old_col_names = list(dataset['train'].column_names)
  encoded_dataset = dataset.map(preprocess_function, remove_columns = old_col_names, num_proc = 4)
  encoded_datasets.append(encoded_dataset)

      

#0:   0%|          | 0/19281 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/19281 [00:00<?, ?ex/s]

#3:   0%|          | 0/19280 [00:00<?, ?ex/s]

#2:   0%|          | 0/19281 [00:00<?, ?ex/s]

      

#0:   0%|          | 0/8264 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/8264 [00:00<?, ?ex/s]

#3:   0%|          | 0/8263 [00:00<?, ?ex/s]

#2:   0%|          | 0/8263 [00:00<?, ?ex/s]

     

#0:   0%|          | 0/12148 [00:00<?, ?ex/s]

   

#1:   0%|          | 0/12148 [00:00<?, ?ex/s]

#3:   0%|          | 0/12148 [00:00<?, ?ex/s]

#2:   0%|          | 0/12148 [00:00<?, ?ex/s]

      

#0:   0%|          | 0/5207 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/5207 [00:00<?, ?ex/s]

#2:   0%|          | 0/5206 [00:00<?, ?ex/s]

#3:   0%|          | 0/5206 [00:00<?, ?ex/s]

     

#0:   0%|          | 0/14976 [00:00<?, ?ex/s]

   

#1:   0%|          | 0/14976 [00:00<?, ?ex/s]

#2:   0%|          | 0/14976 [00:00<?, ?ex/s]

#3:   0%|          | 0/14975 [00:00<?, ?ex/s]

      

#0:   0%|          | 0/6419 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/6418 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/6418 [00:00<?, ?ex/s]

#3:   0%|          | 0/6418 [00:00<?, ?ex/s]

     

#0:   0%|          | 0/17047 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/17047 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/17046 [00:00<?, ?ex/s]

#3:   0%|          | 0/17046 [00:00<?, ?ex/s]

     

#0:   0%|          | 0/7306 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/7306 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/7306 [00:00<?, ?ex/s]

#3:   0%|          | 0/7305 [00:00<?, ?ex/s]

     

#0:   0%|          | 0/19281 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/19281 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/19281 [00:00<?, ?ex/s]

#3:   0%|          | 0/19280 [00:00<?, ?ex/s]

      

#0:   0%|          | 0/8264 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/8264 [00:00<?, ?ex/s]

#2:   0%|          | 0/8263 [00:00<?, ?ex/s]

#3:   0%|          | 0/8263 [00:00<?, ?ex/s]

      

#0:   0%|          | 0/12148 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/12148 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/12148 [00:00<?, ?ex/s]

#3:   0%|          | 0/12148 [00:00<?, ?ex/s]

      

#0:   0%|          | 0/5207 [00:00<?, ?ex/s]

#1:   0%|          | 0/5207 [00:00<?, ?ex/s]

  

#2:   0%|          | 0/5206 [00:00<?, ?ex/s]

#3:   0%|          | 0/5206 [00:00<?, ?ex/s]

      

#0:   0%|          | 0/14976 [00:00<?, ?ex/s]

#1:   0%|          | 0/14976 [00:00<?, ?ex/s]

  

#2:   0%|          | 0/14976 [00:00<?, ?ex/s]

#3:   0%|          | 0/14975 [00:00<?, ?ex/s]

     

#0:   0%|          | 0/6419 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/6418 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/6418 [00:00<?, ?ex/s]

#2:   0%|          | 0/6418 [00:00<?, ?ex/s]

     

#0:   0%|          | 0/17047 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/17047 [00:00<?, ?ex/s]

  

#2:   0%|          | 0/17046 [00:00<?, ?ex/s]

#3:   0%|          | 0/17046 [00:00<?, ?ex/s]

     

#0:   0%|          | 0/7306 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/7306 [00:00<?, ?ex/s]

#2:   0%|          | 0/7306 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/7305 [00:00<?, ?ex/s]

In [23]:
encoded_datasets[0]['train']

Dataset({
    features: ['input_ids', 'attn_mask', 'labels'],
    num_rows: 77123
})

In [24]:
from datasets import concatenate_datasets
train_splits = []
test_splits = []
for dataset in encoded_datasets:
  train_splits.append(dataset['train'])
  test_splits.append(dataset['test'])

train_dataset = concatenate_datasets(train_splits)
eval_dataset = concatenate_datasets(test_splits)

In [25]:
from transformers.data.data_collator import DataCollatorWithPadding

collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors = "pt")
train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, collate_fn=collator, batch_size=batch_size)
valid_dataloader = torch.utils.data.DataLoader(eval_dataset, shuffle=True, collate_fn=collator, batch_size=batch_size)

# metrics


In [21]:
from datasets import load_metric
bleu = load_metric("sacrebleu")
bleurt = load_metric("bleurt")

Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: datasets.load_metric('bleurt', 'bleurt-large-512').


INFO:tensorflow:Reading checkpoint /root/.cache/huggingface/metrics/bleurt/default/downloads/extracted/887f2dc36c17f53c287f696681b8f7c947278407c1cf9f226662e16c8c0dc417/bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


# Old Model Training

In [None]:
run = wandb.init(project=f"mt5-small")

optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=weight_decay,
    )

# Task 1.11: Training loop
# YOUR CODE STARTS HERE
for _ in tqdm(range(num_epochs), desc="Epochs"):
    for example in train_dataloader:

        # 1 move everything to the device
        input_ids, att_mask, labels = example["input_ids"].to(device), example["attention_mask"].to(device), example["labels"].to(device)
        # 2 calculate propabilities
        out = model(input_ids=torch.squeeze(input_ids,1), attention_mask=att_mask, labels=torch.squeeze(labels,1))
        # 3. Calculate loss using F.cross_entropy
        loss = out.loss
        # 4 use odamW's zero_grad
        optimizer.zero_grad()
        # 5 backpropagate the loss
        loss.backward()
        # 6 update params using step?
        optimizer.step()

    for batch in tqdm(valid_dataloader, desc="Evaluating"):
        with torch.no_grad():
            # 1 move everything to the device
            input_ids_v, token_ids_v, att_mask_v, labels_v = batch["input_ids"].to(device), batch["token_type_ids"].to(device), batch["attention_mask"].to(device), batch["labels"].to(device)
            # 2 calculate propabilities
            valid_probs = model(input_ids=torch.squeeze(input_ids_v,1), attention_mask=att_mask, labels=torch.squeeze(labels_v,1))
            # 3. Calculate loss using F.cross_entropy
               
    bleu.add_batch(predictions=torch.argmax(valid_probs, dim=-1), references=labels_v)
    bleu_value = bleu.compute()
    wandb.log({'bleu': bleu_value})
    bleurt.add_batch(predictions=torch.argmax(valid_probs, dim=-1), references=labels_v)
    bleurt_value = bleurt.compute()
    wandb.log({'bleu': bleurt_value})

    torch.save(model, "model.pt")
# YOUR CODE ENDS HERE
run.finish()  # stop wandb run

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

# New Modelling
