<a href="https://colab.research.google.com/github/dietmarja/LLM-Elements/blob/main/Fine_Tuning_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fine Tuning Including Validation

In [14]:
import torch
from torch import nn, optim
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [15]:
# Initialize the model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [16]:
# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))

In [17]:
# Prepare the data (example using a dummy dataset)
train_texts = ["Example sentence one.", "Example sentence two."]
validation_texts = ["Validation sentence one.", "Validation sentence two."]
train_inputs = tokenizer(train_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
validation_inputs = tokenizer(validation_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)


In [18]:
# Define the optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()

In [19]:
# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    for i in range(len(train_inputs["input_ids"])):
        input_ids = train_inputs["input_ids"][i].unsqueeze(0)
        attention_mask = train_inputs["attention_mask"][i].unsqueeze(0)

        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 10 == 0:  # Print loss every 10 batches
            print(f"Epoch: {epoch}, Batch: {i}, Loss: {loss.item()}")



Epoch: 0, Batch: 0, Loss: 7.26689338684082
Epoch: 1, Batch: 0, Loss: 3.3851006031036377
Epoch: 2, Batch: 0, Loss: 1.4126462936401367


In [20]:
# Validation loop
model.eval()
val_loss = 0
with torch.no_grad():
    for i in range(len(validation_inputs["input_ids"])):
        input_ids = validation_inputs["input_ids"][i].unsqueeze(0)
        attention_mask = validation_inputs["attention_mask"][i].unsqueeze(0)
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        val_loss += outputs.loss.item()

avg_val_loss = val_loss / len(validation_inputs["input_ids"])
print(f"Epoch: {epoch}, Validation Loss: {avg_val_loss}")

Epoch: 2, Validation Loss: 5.292510747909546


In [21]:
# Generate sample output
model.eval()
with torch.no_grad():
    sample_output = model.generate(tokenizer.encode("Test sentence", return_tensors="pt"), max_length=50)
    print(f"Sample output: {tokenizer.decode(sample_output[0], skip_special_tokens=True)}")

print("Training completed.")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample output: Test sentence one.

The first sentence is a bit of a stretch. The second sentence is a bit more complicated. The third sentence is a bit more complicated.

The third sentence is a bit more complicated.

The fourth sentence
Training completed.
