In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, AdamW, AutoTokenizer
from datasets import load_dataset

In [2]:
# Load Multi30k dataset
# dataset = load_dataset("bentrevett/multi30k", split="train[:10000]")  # Load only a subset for demonstration
dataset = load_dataset("bentrevett/multi30k")  # Load only a subset for demonstration

Downloading readme:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.60M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/164k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/156k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [4]:
dataset["train"]

Dataset({
    features: ['en', 'de'],
    num_rows: 29000
})

In [5]:
# Initialize T5 tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [6]:
# def tokenize_data(example):
#     source_text = example["en"]
#     target_text = example["de"]
#     tokenized_inputs = tokenizer(
#         source_text,
#         padding="max_length",
#         truncation=True,
#         max_length=128,
#         return_tensors="pt"
#     )
#     tokenized_targets = tokenizer(
#         target_text,
#         padding="max_length",
#         truncation=True,
#         max_length=128,
#         return_tensors="pt"
#     )
#     return {
#         "input_ids": tokenized_inputs.input_ids.flatten(),
#         "attention_mask": tokenized_inputs.attention_mask.flatten(),
#         "labels": tokenized_targets.input_ids.flatten(),
#         "labels_attention_mask": tokenized_targets.attention_mask.flatten(),
#     }


# def tokenize_data(example):
#     source_text = example["en"]
#     target_text = example["de"]
#     tokenized_inputs = tokenizer(
#         source_text,
#         padding="max_length",
#         truncation=True,
#         max_length=128,
#         return_tensors="pt"
#     )
#     tokenized_targets = tokenizer(
#         target_text,
#         padding="max_length",
#         truncation=True,
#         max_length=128,
#         return_tensors="pt"
#     )
#     return {
#         "input_ids": tokenized_inputs.input_ids[0],
#         "attention_mask": tokenized_inputs.attention_mask[0],
#         "labels": tokenized_targets.input_ids[0],
#         "labels_attention_mask": tokenized_targets.attention_mask[0],
#     }


# Tokenize and preprocess data
def tokenize_data(batch):
    src_texts = batch["en"]
    tgt_texts = batch["de"]
    tokenized_batch = tokenizer.prepare_seq2seq_batch(src_texts, tgt_texts, truncation=True, padding="max_length", max_length=128, return_tensors="pt")
    return {
        "input_ids": tokenized_batch.input_ids,
        "attention_mask": tokenized_batch.attention_mask,
        "labels": tokenized_batch.labels,
    }

In [7]:
dataset = dataset.map(tokenize_data, batched=True)
train_dataloader = DataLoader(dataset['train'], batch_size=8, shuffle=True)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [9]:
optimizer = AdamW(model.parameters(), lr=1e-4)

# for epoch in range(3):  # Adjust number of epochs as needed
#     for batch in train_dataloader:
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["labels"].to(device)

#         optimizer.zero_grad()
#         outputs = model(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             labels=labels,
#         )
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()

#         print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

for epoch in range(1):  # Adjust number of epochs as needed
    for batch in train_dataloader:
        # Convert each sequence in the batch to tensors and move them to device
        input_ids = torch.tensor([item for sublist in batch["input_ids"] for item in sublist]).to(device)
        attention_mask = torch.tensor([item for sublist in batch["attention_mask"] for item in sublist]).to(device)
        labels = torch.tensor([item for sublist in batch["labels"] for item in sublist]).to(device)
        
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids.unsqueeze(0),  
            attention_mask=attention_mask.unsqueeze(0),  
            labels=labels.unsqueeze(0),  
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")



Epoch 1, Loss: 15.397992134094238
Epoch 1, Loss: 13.578856468200684
Epoch 1, Loss: 12.593639373779297
Epoch 1, Loss: 12.532017707824707
Epoch 1, Loss: 10.71521282196045
Epoch 1, Loss: 11.551669120788574
Epoch 1, Loss: 9.687947273254395
Epoch 1, Loss: 8.377568244934082
Epoch 1, Loss: 8.626347541809082
Epoch 1, Loss: 5.613637447357178
Epoch 1, Loss: 5.17933464050293
Epoch 1, Loss: 6.232644557952881
Epoch 1, Loss: 3.7043616771698
Epoch 1, Loss: 5.3638105392456055
Epoch 1, Loss: 3.612170696258545
Epoch 1, Loss: 3.1822938919067383
Epoch 1, Loss: 3.062915325164795
Epoch 1, Loss: 2.068197011947632
Epoch 1, Loss: 2.258237361907959
Epoch 1, Loss: 2.4222443103790283
Epoch 1, Loss: 2.2555246353149414
Epoch 1, Loss: 2.298738718032837
Epoch 1, Loss: 2.8426969051361084
Epoch 1, Loss: 2.138777732849121
Epoch 1, Loss: 2.4314329624176025
Epoch 1, Loss: 2.6504244804382324
Epoch 1, Loss: 2.6182641983032227
Epoch 1, Loss: 2.2058684825897217
Epoch 1, Loss: 2.019033432006836
Epoch 1, Loss: 2.049898386001587

Epoch 1, Loss: 1.130643606185913
Epoch 1, Loss: 1.091322898864746
Epoch 1, Loss: 1.3831450939178467
Epoch 1, Loss: 1.1916652917861938
Epoch 1, Loss: 1.2966530323028564
Epoch 1, Loss: 1.112549066543579
Epoch 1, Loss: 1.051073431968689
Epoch 1, Loss: 1.322925090789795
Epoch 1, Loss: 1.2747650146484375
Epoch 1, Loss: 1.0510892868041992
Epoch 1, Loss: 1.1529440879821777
Epoch 1, Loss: 1.256658911705017
Epoch 1, Loss: 1.4859212636947632
Epoch 1, Loss: 1.1455434560775757
Epoch 1, Loss: 1.141351342201233
Epoch 1, Loss: 1.034354329109192
Epoch 1, Loss: 1.2325009107589722
Epoch 1, Loss: 1.3514087200164795
Epoch 1, Loss: 1.028560757637024
Epoch 1, Loss: 1.5896151065826416
Epoch 1, Loss: 1.3591983318328857
Epoch 1, Loss: 0.9556340575218201
Epoch 1, Loss: 1.4168521165847778
Epoch 1, Loss: 0.9899098873138428
Epoch 1, Loss: 1.1894758939743042
Epoch 1, Loss: 1.0382803678512573
Epoch 1, Loss: 1.07610285282135
Epoch 1, Loss: 1.2287604808807373
Epoch 1, Loss: 1.3230339288711548
Epoch 1, Loss: 0.96029752

KeyboardInterrupt: 

In [None]:
# Save trained model
model.save_pretrained("model_weights.pth")