# Finetunning Translation on T5
## This notebook outlines the concepts behind the Finetuning of Translation task on SQUAD2 dataset

In [1]:
import pandas as pd
from datasets import load_dataset
from transformers import DefaultDataCollator
from transformers import T5ForConditionalGeneration, AdamW
from transformers import T5TokenizerFast as T5Tokenizer

In [2]:
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from termcolor import colored

In [3]:
MODEL_NAME = "mrm8488/t5-base-finetuned-quartz"

In [4]:
squad = load_dataset("squad")

Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [6]:
train_question = squad['train']['question']  # question
train_context = squad['train']['context']    # context

listing = []
listing2 = []
first_list = squad['train']['answers']  
for fl in first_list:
    listing.append(fl.get('text'))
    listing2.append(fl.get('answer_start')) 
    
train_answer = []                           # answers
train_answer_start = []                     # answer_start

for one_list in listing:
    for _ in one_list:
        train_answer.append(_)

for two_list in listing2:
    for _ in two_list:
        train_answer_start.append(_)

In [7]:
data = {'question':train_question, 'context':train_context, 'answer_text':train_answer, 'answer_start':train_answer_start}

In [8]:
len(train_question ), len(train_context), len(train_answer), len(train_answer_start)

(87599, 87599, 87599, 87599)

In [9]:
df = pd.DataFrame(data)

In [10]:
df

Unnamed: 0,question,context,answer_text,answer_start
0,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",Saint Bernadette Soubirous,515
1,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",a copper statue of Christ,188
2,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",the Main Building,279
3,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",a Marian place of prayer and reflection,381
4,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",a golden statue of the Virgin Mary,92
...,...,...,...,...
87594,In what US state did Kathmandu first establish...,"Kathmandu Metropolitan City (KMC), in order to...",Oregon,229
87595,What was Yangon previously known as?,"Kathmandu Metropolitan City (KMC), in order to...",Rangoon,414
87596,With what Belorussian city does Kathmandu have...,"Kathmandu Metropolitan City (KMC), in order to...",Minsk,476
87597,In what year did Kathmandu create its initial ...,"Kathmandu Metropolitan City (KMC), in order to...",1975,199


In [11]:
df = df.drop_duplicates(subset=["context"]).reset_index(drop=True)

In [12]:
len(df.question.unique())

18881

In [13]:
len(df.context.unique())

18891

In [14]:
english_sample = df["context"][0]
english_sample

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [15]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [16]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)
model.config

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

T5Config {
  "_name_or_path": "mrm8488/t5-base-finetuned-quartz",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English t

# Translation Done

In [17]:
english_sample

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [18]:
input_ids_translated = tokenizer(
     "translate English to French : Architecturally the school has a Catholic character Atop the Main Building gold dome is a golden statue of the Virgin Mary",
     return_tensors = 'pt'
).input_ids

generated_ids = model.generate(input_ids = input_ids_translated)
generated_ids

pred_translated = [
          tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
          for gen_id in generated_ids
]
"".join(pred_translated)

'Sur le dôme d’or du bâtiment principal se trouve une statue d’'