In [1]:
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset
import optuna
import random
import numpy as np

# For reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

print(f"Using device: {'cuda' if torch.cuda.is_available() else 'cpu'}")

Using device: cuda


In [2]:
# Load the English-Hungarian translation dataset from Hugging Face
dataset = load_dataset('opus_books', 'en-hu')
print(dataset)

README.md:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/23.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/137151 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 137151
    })
})


In [3]:
from datasets import Dataset

# Extract the translation list from your dataset
translation_list = dataset["train"]["translation"]
hf_dataset = Dataset.from_list(translation_list)


In [4]:
# Initialize model name and tokenizer
model_name = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocessing function to add the T5 task prefix and tokenize both inputs and targets
def preprocess_function(examples):
    inputs = ["translate English to Hungarian: " + ex for ex in examples["en"]]
    targets = [ex for ex in examples["hu"]]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length')

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the train and validation splits  
tokenized_datasets = hf_dataset.map(preprocess_function, batched=True)

# Set the dataset format to PyTorch tensors
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

print(tokenized_datasets)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/137151 [00:00<?, ? examples/s]

Dataset({
    features: ['en', 'hu', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 137151
})


In [5]:
import optuna
import torch
from transformers import TrainingArguments, Trainer, T5ForConditionalGeneration
from datasets import Dataset
from sklearn.model_selection import train_test_split

def objective(trial):
    # Suggest hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 5e-5, 1e-3, log=True)  # Slightly wider range
    warmup_steps = trial.suggest_int("warmup_steps", 500, 1500, step=500)  # Add warmup tuning
    gradient_accumulation_steps = trial.suggest_int("gradient_accumulation_steps", 2, 8)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    num_train_epochs = 1  # Use 1 epoch for faster tuning

    training_args = TrainingArguments(
        output_dir='./t5-small-translation-optuna',
        evaluation_strategy='steps',
        eval_steps=500,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_train_batch_size,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        report_to="tensorboard",  # 🔥 Logs to TensorBoard
        logging_dir="./t5-small-translation-tensorboard",  # 🔥 TensorBoard log dir
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=warmup_steps,  # Add tuned warmup
        lr_scheduler_type="cosine_with_restarts",  # Use cosine scheduler
        logging_steps=100,
        save_steps=500,
        save_total_limit=2,
        fp16=torch.cuda.is_available(),  # Enable mixed precision training if GPU available
        load_best_model_at_end=True,
        metric_for_best_model='loss',
        dataloader_num_workers=4
    )
    
    # Reinitialize model for each trial
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    
    # Split using Hugging Face's built-in method
    train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
    train_data = train_test_split["train"]
    eval_data = train_test_split["test"]

    
    train_dataset = train_data.remove_columns(["en", "hu"])
    eval_dataset = eval_data.remove_columns(["en", "hu"])

    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )
    
    trainer.train()
    eval_result = trainer.evaluate()
    
    # Return the evaluation loss as the objective metric
    return eval_result["eval_loss"]


In [6]:
# Run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-03-14 11:54:05,772] A new study created in memory with name: no-name-09bae40b-4fb6-46a6-ac93-a97b41a8a821


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss


[I 2025-03-14 12:13:10,088] Trial 0 finished with value: 1.2588436603546143 and parameters: {'learning_rate': 0.0005708948855396893, 'warmup_steps': 500, 'gradient_accumulation_steps': 7, 'weight_decay': 0.10451249398574737, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 1.2588436603546143.
  trainer = Trainer(


Step,Training Loss,Validation Loss
500,1.6698,1.527621
1000,1.4042,1.31556
1500,1.3355,1.222139
2000,1.29,1.162906
2500,1.2387,1.124129
3000,1.2137,1.098963
3500,1.1857,1.084875
4000,1.1923,1.07793
4500,1.171,1.076301


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


[I 2025-03-14 12:54:11,507] Trial 1 finished with value: 1.0763005018234253 and parameters: {'learning_rate': 0.0004378721591873564, 'warmup_steps': 500, 'gradient_accumulation_steps': 3, 'weight_decay': 0.021056370062719286, 'per_device_train_batch_size': 8}. Best is trial 1 with value: 1.0763005018234253.
  trainer = Trainer(


Step,Training Loss,Validation Loss
500,1.5334,1.364499
1000,1.2899,1.18236


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


[I 2025-03-14 13:16:27,203] Trial 2 finished with value: 1.1823604106903076 and parameters: {'learning_rate': 0.000496857824541632, 'warmup_steps': 500, 'gradient_accumulation_steps': 3, 'weight_decay': 0.22338282439705454, 'per_device_train_batch_size': 32}. Best is trial 1 with value: 1.0763005018234253.
  trainer = Trainer(


Step,Training Loss,Validation Loss
500,1.8657,1.702902
1000,1.6401,1.534465


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


[I 2025-03-14 13:38:42,638] Trial 3 finished with value: 1.5344651937484741 and parameters: {'learning_rate': 8.171934845166888e-05, 'warmup_steps': 500, 'gradient_accumulation_steps': 3, 'weight_decay': 0.05917915123627517, 'per_device_train_batch_size': 32}. Best is trial 1 with value: 1.0763005018234253.
  trainer = Trainer(


Step,Training Loss,Validation Loss
500,1.9625,1.864804
1000,1.8093,1.664031
1500,1.6757,1.568769
2000,1.6528,1.511431
2500,1.5848,1.468552
3000,1.5753,1.437542
3500,1.5624,1.415097
4000,1.4869,1.397786
4500,1.5008,1.38523
5000,1.4996,1.37748


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


[I 2025-03-14 14:27:26,671] Trial 4 finished with value: 1.3683903217315674 and parameters: {'learning_rate': 7.528660668458725e-05, 'warmup_steps': 500, 'gradient_accumulation_steps': 2, 'weight_decay': 0.22688529693890694, 'per_device_train_batch_size': 8}. Best is trial 1 with value: 1.0763005018234253.
  trainer = Trainer(


Step,Training Loss,Validation Loss
500,1.9345,1.77682


[W 2025-03-14 14:43:57,621] Trial 5 failed with parameters: {'learning_rate': 0.00011022785282886311, 'warmup_steps': 1000, 'gradient_accumulation_steps': 3, 'weight_decay': 0.06172239734987018, 'per_device_train_batch_size': 32} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-5-77328bfd3527>", line 60, in objective
    trainer.train()
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2164, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2522, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3688, in training_step
    self.accelerator.backward(loss, **kwargs)
  File "/

KeyboardInterrupt: 

In [7]:
# Train the final model using the best hyperparameters found by Optuna
best_params = study.best_trial.params
best_params

{'learning_rate': 0.0004378721591873564,
 'warmup_steps': 500,
 'gradient_accumulation_steps': 3,
 'weight_decay': 0.021056370062719286,
 'per_device_train_batch_size': 8}

In [8]:


final_training_args = TrainingArguments(
    output_dir='./t5-small-translation-final',
    evaluation_strategy='steps',
    eval_steps=500,
    per_device_train_batch_size=best_params["per_device_train_batch_size"],
    per_device_eval_batch_size=best_params["per_device_train_batch_size"],
    learning_rate=best_params["learning_rate"],
    num_train_epochs=3,  # Train for more epochs on final run
    weight_decay=best_params["weight_decay"],
    gradient_accumulation_steps=best_params["gradient_accumulation_steps"],
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    dataloader_num_workers=4,
    report_to="tensorboard",  # 🔥 Logs to TensorBoard
    logging_dir="./t5-small-translation-tensorboard",  # 🔥 TensorBoard log dir
)

# Reinitialize the final model
final_model = T5ForConditionalGeneration.from_pretrained(model_name)

# Split the dataset again for final training
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_data = train_test_split["train"]
eval_data = train_test_split["test"]

# Remove unnecessary columns
train_dataset = train_data.remove_columns(["en", "hu"])
eval_dataset = eval_data.remove_columns(["en", "hu"])

final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

final_trainer.train()

  final_trainer = Trainer(


Step,Training Loss,Validation Loss
500,1.5509,1.42573
1000,1.365,1.280305
1500,1.3149,1.202585
2000,1.2736,1.148224
2500,1.2211,1.106241
3000,1.1893,1.075608
3500,1.1503,1.048828
4000,1.1398,1.028209
4500,1.0996,1.006354
5000,1.0374,0.990276


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=13713, training_loss=1.1034292878133067, metrics={'train_runtime': 7039.1568, 'train_samples_per_second': 46.761, 'train_steps_per_second': 1.948, 'total_flos': 1.1136194108719104e+16, 'train_loss': 1.1034292878133067, 'epoch': 2.999854174261757})

In [11]:
# Define a helper function to translate text using the final model
def translate_text(text):
    input_text = "translate English to Hungarian: " + text
    input_ids = tokenizer(input_text, return_tensors='pt', max_length=128, truncation=True).input_ids
    
    if torch.cuda.is_available():
        input_ids = input_ids.to('cuda')
        final_model.to('cuda')
    
    outputs = final_model.generate(
        input_ids, 
        max_length=128, 
        num_beams=4, 
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test translation with a sample sentence
sample_text = "I love books."
translated_text = translate_text(sample_text)

print(f"English: {sample_text}")
print(f"Hungarian: {translated_text}")

English: I love books.
Hungarian: Szeretem könyveket.


In [12]:
# Save the fine-tuned model and tokenizer to the specified directory
output_dir = "/kaggle/working/t5-small-hungarian-translator"

# Create directory if it doesn't exist
import os
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the model
final_model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer successfully saved to {output_dir}")

# If you want to download the files to your local machine
# You'll need to zip the directory first
!tar -czvf /kaggle/working/t5-small-hungarian-translator.tar.gz -C /kaggle/working t5-small-hungarian-translator

Model and tokenizer successfully saved to /kaggle/working/t5-small-hungarian-translator
t5-small-hungarian-translator/
t5-small-hungarian-translator/tokenizer_config.json
t5-small-hungarian-translator/special_tokens_map.json
t5-small-hungarian-translator/model.safetensors
t5-small-hungarian-translator/config.json
t5-small-hungarian-translator/spiece.model
t5-small-hungarian-translator/generation_config.json
t5-small-hungarian-translator/tokenizer.json


In [13]:
!zip -r tensorboard_logs.zip /kaggle/working/t5-small-translation-tensorboard


  adding: kaggle/working/t5-small-translation-tensorboard/ (stored 0%)
  adding: kaggle/working/t5-small-translation-tensorboard/events.out.tfevents.1741954390.aba8776d12aa.31.1 (deflated 25%)
  adding: kaggle/working/t5-small-translation-tensorboard/events.out.tfevents.1741958188.aba8776d12aa.31.6 (deflated 62%)
  adding: kaggle/working/t5-small-translation-tensorboard/events.out.tfevents.1741962446.aba8776d12aa.31.9 (deflated 25%)
  adding: kaggle/working/t5-small-translation-tensorboard/events.out.tfevents.1741958187.aba8776d12aa.31.5 (deflated 25%)
  adding: kaggle/working/t5-small-translation-tensorboard/events.out.tfevents.1741959522.aba8776d12aa.31.7 (deflated 25%)
  adding: kaggle/working/t5-small-translation-tensorboard/events.out.tfevents.1741953248.aba8776d12aa.31.0 (deflated 61%)
  adding: kaggle/working/t5-small-translation-tensorboard/events.out.tfevents.1741962447.aba8776d12aa.31.10 (deflated 62%)
  adding: kaggle/working/t5-small-translation-tensorboard/events.out.tfeve

In [14]:
from IPython.display import FileLink

FileLink("tensorboard_logs.zip")


In [15]:
!zip -r optuna_logs.zip /kaggle/working/t5-small-translation-optuna


  adding: kaggle/working/t5-small-translation-optuna/ (stored 0%)
  adding: kaggle/working/t5-small-translation-optuna/checkpoint-6857/ (stored 0%)
  adding: kaggle/working/t5-small-translation-optuna/checkpoint-6857/tokenizer_config.json (deflated 95%)
  adding: kaggle/working/t5-small-translation-optuna/checkpoint-6857/special_tokens_map.json (deflated 85%)
  adding: kaggle/working/t5-small-translation-optuna/checkpoint-6857/scheduler.pt (deflated 55%)
  adding: kaggle/working/t5-small-translation-optuna/checkpoint-6857/model.safetensors (deflated 8%)
  adding: kaggle/working/t5-small-translation-optuna/checkpoint-6857/rng_state.pth (deflated 25%)
  adding: kaggle/working/t5-small-translation-optuna/checkpoint-6857/config.json (deflated 62%)
  adding: kaggle/working/t5-small-translation-optuna/checkpoint-6857/training_args.bin (deflated 51%)
  adding: kaggle/working/t5-small-translation-optuna/checkpoint-6857/trainer_state.json (deflated 76%)
  adding: kaggle/working/t5-small-transla

In [16]:
from IPython.display import FileLink

FileLink("optuna_logs.zip")
