Instale as bibliotecas necessárias:

pip install transformers
pip install torch
pip install sacrebleu
pip install datasets

Baixe o conjunto de dados MSMARCO "tiny" e prepare os dados de treinamento e validação:

In [1]:
import csv
import random

input_file = 'msmarco_triples.train.tiny.tsv'
train_file = 'train_data.csv'
val_file = 'val_data.csv'

train_percentage = 0.8

with open(input_file, 'r', encoding='utf-8') as tsv_file, open(train_file, 'w', encoding='utf-8', newline='') as train, open(val_file, 'w', encoding='utf-8', newline='') as val:
    reader = csv.reader(tsv_file, delimiter='\t')
    train_writer = csv.writer(train, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    val_writer = csv.writer(val, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    
    for row in reader:
        if random.random() < train_percentage:
            train_writer.writerow([row[1], row[0]])
        else:
            val_writer.writerow([row[1], row[0]])

Treine o modelo T5-base:

In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config, Trainer, TrainingArguments
from datasets import load_dataset

model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
config = T5Config.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name, config=config)

train_dataset = load_dataset('csv', data_files=train_file, delimiter=',', split='train', column_names=['passage', 'query'])
val_dataset = load_dataset('csv', data_files=val_file, delimiter=',', split='train', column_names=['passage', 'query'])

def tokenize_function(examples):
    inputs = tokenizer(examples['passage'], padding='max_length', truncation=True, return_tensors='pt')
    targets = tokenizer(examples['query'], padding='max_length', truncation=True, return_tensors='pt')
    inputs['labels'] = targets['input_ids']
    return inputs

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)


training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=1,
    evaluation_strategy='steps',
    eval_steps=500,
    metric_for_best_model='bleu',
    greater_is_better=True,
    #_n_gpu=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading and preparing dataset csv/default to C:/Users/crist/.cache/huggingface/datasets/csv/default-3c870cb971bacff2/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to C:/Users/crist/.cache/huggingface/datasets/csv/default-3c870cb971bacff2/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Downloading and preparing dataset csv/default to C:/Users/crist/.cache/huggingface/datasets/csv/default-43e4c424f786dbcc/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to C:/Users/crist/.cache/huggingface/datasets/csv/default-43e4c424f786dbcc/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Map:   0%|          | 0/8733 [00:00<?, ? examples/s]

Map:   0%|          | 0/2267 [00:00<?, ? examples/s]

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: passage, query. If passage, query are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8733
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 1638
  Number of trainable parameters = 222903552
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcristianoborgescardoso[0m ([33mic_unicamp[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/1638 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 6.00 GiB total capacity; 5.21 GiB already allocated; 0 bytes free; 5.33 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

Avalie o modelo usando sacreBLEU:

In [None]:
import sacrebleu
from transformers import pipeline

translator = pipeline('translation', model=model, tokenizer=tokenizer)

def evaluate_bleu(predictions, references):
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    return bleu.score

predictions = []
references = []

for i, row in enumerate(val_dataset):
    input_text = tokenizer.decode(row['input_ids'], skip_special_tokens=True)
    target_text = tokenizer.decode(row['labels'], skip_special_tokens=True)
    
    translation = translator(input_text, max_length=32)[0]['translation_text']
    predictions.append(translation)
    references.append(target_text)

bleu_score = evaluate_bleu(predictions, references)
print(f'sacreBLEU score: {bleu_score}')