# 03a - Fine-tune T5 for Sequence Translation

In [1]:
import numpy as np
import pandas as pd

import torch
from datasets import load_dataset

from src import data, models, metrics

# where the input CSVs live and where fine-tuned checkpoints are written
DATA_DIR = 'data/'
OUTPUT_DIR = 'output/t5/'
MODEL_NAME = 't5_monitors_printers_3epoch'


# use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')

Device: cuda


## Create T5 Model with Tokenizer

In [2]:
# create model and load pre-trained checkpoint
# `models.T5` comes from the project's `src.models` module and is initialised from the
# `t5-small` checkpoint; the progress bars in this cell's output show the checkpoint
# files being downloaded on this run.
net = models.T5(pretrained_checkpoint='t5-small')

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [3]:
# report the size of the model that is about to be fine-tuned
trainable_param_count = net.num_trainable_params()
print(f'Number of trainable parameters: {trainable_param_count:,}')

Number of trainable parameters: 60,506,624


## Example of Translation

In [4]:
# sanity check before fine-tuning: feed one sentence with an English-to-German
# task prefix and display the string the model generates for it
net.predict_sample('translate English to German: The house is wonderful.')

'Das Haus ist wunderbar.'

In [5]:
# same sentence with an English-to-French task prefix, to show that the prefix
# alone selects the target language of the generated text
net.predict_sample('translate English to French: The house is wonderful.')

'La maison est merveilleuse.'

## Load the Data

In [6]:
def _load_product_splits(product, version='202107'):
    """Load the train/validation CSV splits of one product category.

    The files are expected at
    ``DATA_DIR + '<product>_translation_<version>_<train|val>.csv'``.
    The matching ``_test.csv`` files are deliberately not loaded in this notebook.

    Args:
        product: File-name prefix of the CSVs; also written into the added
            ``type`` column (e.g. ``'monitors'``).
        version: Date stamp embedded in the CSV file names.

    Returns:
        The result of ``data.add_column`` applied to the loaded splits, i.e. the
        ``train``/``validation`` splits with an extra constant ``type`` column.
    """
    splits = load_dataset('csv', data_files={
        split: f'{DATA_DIR}{product}_translation_{version}_{suffix}.csv'
        for split, suffix in (('train', 'train'), ('validation', 'val'))
    })
    return data.add_column(splits, name='type', value=product)


# load datasets (one helper call per product instead of two copy-pasted stanzas)
monitors = _load_product_splits('monitors')
printers = _load_product_splits('printers')
datasets = data.concat_datasets(monitors, printers)


# tokenize datasets
# each product gets its own task prefix, so tokenization happens per product
# and the tokenized splits are concatenated only afterwards
tokenized_monitors = net.tokenize_dataset(monitors, prefix='Clean Monitors:')
tokenized_printers = net.tokenize_dataset(printers, prefix='Clean Printers:')
tokenized_datasets = data.concat_datasets(tokenized_monitors, tokenized_printers)

# display the combined raw (untokenized) splits as the cell output
datasets

Using custom data configuration default-6727cc1724fb7c7f


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/ec2-user/.cache/huggingface/datasets/csv/default-6727cc1724fb7c7f/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23...


0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Using custom data configuration default-11305636ba3651e9


Dataset csv downloaded and prepared to /home/ec2-user/.cache/huggingface/datasets/csv/default-6727cc1724fb7c7f/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/ec2-user/.cache/huggingface/datasets/csv/default-11305636ba3651e9/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23...


0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/ec2-user/.cache/huggingface/datasets/csv/default-11305636ba3651e9/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23. Subsequent calls will reuse this data.


  0%|          | 0/80 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/80 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['inp', 'trg', 'metadata', 'type'],
        num_rows: 160000
    })
    validation: Dataset({
        features: ['inp', 'trg', 'metadata', 'type'],
        num_rows: 10000
    })
})

## Fine-tune the Model

In [7]:
# metrics callback that receives the model's tokenizer
metrics_callback = metrics.TranslationMetricsCallback(tokenizer=net.tokenizer)

# optimisation settings for this run, grouped so they can be read at a glance
# (no_epochs matches the '3epoch' suffix in MODEL_NAME)
training_hyperparams = dict(
    no_epochs=3,
    bs=32,
    gradient_accumulation_steps=2,
    lr=0.001,
    wd=0.01,
    lr_scheduler_type='linear',
    fp16=False,
)

# create trainer instance
trainer = net.get_trainer(
    output_dir=OUTPUT_DIR,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    **training_hyperparams,
    compute_metrics_cb=metrics_callback,
    log_level='error',
)

In [8]:
# train the network
# runs the trainer built in the previous cell; the table rendered below this cell
# reports, per epoch, training/validation loss together with text accuracy,
# Levenshtein score and Jaccard index
training_output = trainer.train()

Epoch,Training Loss,Validation Loss,Text Accuracy,Levenshtein Score,Jaccard Index
1,0.0442,0.037399,0.8986,0.97809,0.956603
2,0.0271,0.026672,0.933,0.985622,0.970698
3,0.0186,0.02305,0.9442,0.987529,0.97551


In [9]:
# write the fine-tuned checkpoint to OUTPUT_DIR, named after MODEL_NAME
checkpoint_path = f'{OUTPUT_DIR}{MODEL_NAME}'
net.save_pretrained(checkpoint_path)

T5(t5-small)