In [1]:
!pip3 install transformers sacrebleu torch --user

Collecting sacrebleu
  Using cached sacrebleu-2.3.1-py3-none-any.whl (118 kB)
Collecting lxml
  Using cached lxml-4.9.2-cp310-cp310-win_amd64.whl (3.8 MB)
Installing collected packages: lxml, sacrebleu
Successfully installed lxml-4.9.2 sacrebleu-2.3.1


In [2]:
import pandas as pd

df = pd.read_csv("msmarco_triples.train.tiny.tsv", sep="\t", names=["query", "passage", "label"])

Instancie o modelo T5-base da biblioteca Transformers:

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Prepare os dados de treinamento e validação:

In [4]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_data = []
val_data = []

for i, row in train_df.iterrows():
    input_str = f"document expansion: {row['passage']} </s>"
    target_str = f"{row['query']} </s>"
    train_data.append((input_str, target_str))

for i, row in val_df.iterrows():
    input_str = f"document expansion: {row['passage']} </s>"
    target_str = f"{row['query']} </s>"
    val_data.append((input_str, target_str))

Defina a função de treinamento do modelo:

In [11]:
import torch

def train_model(train_data, val_data, tokenizer, model, batch_size=8, num_epochs=5, lr=1e-4, log_steps=100, eval_steps=500):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=False)
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for i, batch in enumerate(train_dataloader):
            input_strs = [data[0] for data in batch]
            target_strs = [data[1] for data in batch]
            input_ids = tokenizer(input_strs, padding=True, truncation=True, return_tensors='pt').input_ids.to(device)
            target_ids = tokenizer(target_strs, padding=True, truncation=True, return_tensors='pt').input_ids.to(device)
            labels = target_ids[:, 1:].clone().detach().to(device)
            labels[labels == tokenizer.pad_token_id] = -100
            outputs = model(input_ids=input_ids.to(device), decoder_input_ids=target_ids[:, :-1].to(device), labels=labels)
            loss = outputs.loss
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if (i+1) % log_steps == 0:
                print(f"Epoch {epoch+1}/{num_epochs}, Batch {i+1}/{len(train_dataloader)}, Train Loss: {train_loss/log_steps:.4f}")
                train_loss = 0
            if (i+1) % eval_steps == 0:
                val_loss = evaluate_model(val_dataloader, tokenizer, model)
                print(f"Epoch {epoch+1}/{num_epochs}, Batch {i+1}/{len(train_dataloader)}, Val Loss: {val_loss:.4f}")
                model.train()

def evaluate_model(dataloader, tokenizer, model):
    model.eval()
    loss = 0
    for i, batch in enumerate(dataloader):
        input_strs = [data[0] for data in batch]
        target_strs = [data[1] for data in batch]
        input_ids = tokenizer(input_strs, padding=True, truncation=True, return_tensors='pt').input_ids.to(device)
        target_ids = tokenizer(target_strs, padding=True, truncation=True, return_tensors='pt').input_ids.to(device)
        labels = target_ids[:, 1:].clone().detach().to(device)
        labels[labels == tokenizer.pad_token_id] = -100
        with torch.no_grad():
            outputs = model(input_ids=input_ids.to(device), decoder_input_ids=target_ids[:, :-1].to(device), labels=labels)
            loss += outputs.loss.item()
    return loss / len(dataloader)



Defina as configurações de treinamento e execute o treinamento:

In [14]:
import os 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 8
num_epochs = 5
lr = 1e-4
log_steps = 100
eval_steps = 500
model.to(device)
# define the path to the saved model
model_path = "./t5_fine_tunned.pt"
# create a new instance of the model
model_t5 = T5ForConditionalGeneration.from_pretrained('t5-base')



# Check if the model already exists before loading it
if os.path.exists(model_path):
    print(f"Loading existing model from {model_path}")
    # load the saved state dictionary
    model_t5.load_state_dict(torch.load(model_path))
else:
    train_model(train_data, val_data, tokenizer, model, batch_size=batch_size, num_epochs=num_epochs, lr=lr, log_steps=log_steps, eval_steps=eval_steps)
    # save the model state dictionary
    torch.save(model.state_dict(), model_path)
    print(f"No existing model found at {model_path}. Training new model...")



Epoch 1/5, Batch 100/1100, Train Loss: 4.9423
Epoch 1/5, Batch 200/1100, Train Loss: 4.2545
Epoch 1/5, Batch 300/1100, Train Loss: 4.0415
Epoch 1/5, Batch 400/1100, Train Loss: 3.9355
Epoch 1/5, Batch 500/1100, Train Loss: 3.8471
Epoch 1/5, Batch 500/1100, Val Loss: 3.5410
Epoch 1/5, Batch 600/1100, Train Loss: 3.7809
Epoch 1/5, Batch 700/1100, Train Loss: 3.6957
Epoch 1/5, Batch 800/1100, Train Loss: 3.6577
Epoch 1/5, Batch 900/1100, Train Loss: 3.7501
Epoch 1/5, Batch 1000/1100, Train Loss: 3.6742
Epoch 1/5, Batch 1000/1100, Val Loss: 3.4280
Epoch 1/5, Batch 1100/1100, Train Loss: 3.7168
Epoch 2/5, Batch 100/1100, Train Loss: 3.5575
Epoch 2/5, Batch 200/1100, Train Loss: 3.6047
Epoch 2/5, Batch 300/1100, Train Loss: 3.6808
Epoch 2/5, Batch 400/1100, Train Loss: 3.6334
Epoch 2/5, Batch 500/1100, Train Loss: 3.6371
Epoch 2/5, Batch 500/1100, Val Loss: 3.3848
Epoch 2/5, Batch 600/1100, Train Loss: 3.5662
Epoch 2/5, Batch 700/1100, Train Loss: 3.5535
Epoch 2/5, Batch 800/1100, Train Loss

In [21]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader
import torch
import pandas as pd
from sacrebleu import corpus_bleu
from transformers.optimization import AdamW
from transformers.optimization import get_linear_schedule_with_warmup




def evaluate(val_data, tokenizer, model, device):
    model.eval()
    references = []
    predictions = []
    with torch.no_grad():
        for batch in val_data:
            input_ids = batch['input_ids'].to(device)
            target_ids = batch['target_ids'].to(device)
            output_ids = model.generate(input_ids=input_ids, max_length=50, early_stopping=True)
            output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            target_texts = tokenizer.batch_decode(target_ids, skip_special_tokens=True)
            references.extend(target_texts)
            predictions.extend(output_texts)
    bleu = corpus_bleu(predictions, [references])
    return bleu.score

def train_model(train_data, val_data, tokenizer, model, batch_size, num_epochs, lr, log_steps, eval_steps):
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader)*num_epochs)
    
    model.train()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    step = 0
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}")
        for i, batch in enumerate(train_loader):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            target_ids = batch[2].to(device)
            decoder_attention_mask = batch[3].to(device)
            
            model.zero_grad()
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=target_ids[:,:-1], decoder_attention_mask=decoder_attention_mask[:,:-1], labels=target_ids[:,1:])
            
            loss = outputs.loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            scheduler.step()
            
            if step % log_steps == 0:
                print(f"Step [{step}/{len(train_loader)*num_epochs}] | Loss: {loss}")
                
            if step % eval_steps == 0 and step != 0:
                model.eval()
                with torch.no_grad():
                    val_loss, val_bleu = evaluate_model(val_loader, tokenizer, model)
                print(f"Validation Loss: {val_loss} | Validation BLEU: {val_bleu}")
                model.train()
            step += 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 8
num_epochs = 5
lr = 1e-4
log_steps = 100
eval_steps = 500

train_model(train_data, val_data, tokenizer, model, batch_size=batch_size, num_epochs=num_epochs, lr=lr, log_steps=log_steps, eval_steps=eval_steps)


Epoch 1


AttributeError: 'tuple' object has no attribute 'to'