Installing the required libraries

In [None]:
pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

Loading datasets

In [1]:
from datasets import load_dataset

# CoLA configuration of the GLUE benchmark: sentences with a "label" column.
cola_dataset = load_dataset(path="glue", name="cola")
# English configuration of PAWS-X: (sentence1, sentence2) pairs with a "label" column.
paws_dataset = load_dataset(path="paws-x", name="en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

                                            question  answer  \
0    do iran and afghanistan speak the same language    True   
1  do good samaritan laws protect those who help ...    True   
2  is windows movie maker part of windows essentials    True   
3  is confectionary sugar the same as powdered sugar    True   
4         is elder scrolls online the same as skyrim   False   

                                             passage  
0  Persian (/ˈpɜːrʒən, -ʃən/), also known by its ...  
1  Good Samaritan laws offer legal protection to ...  
2  Windows Movie Maker (formerly known as Windows...  
3  Powdered sugar, also called confectioners' sug...  
4  As with other games in The Elder Scrolls serie...  


Converting both datasets to a text-to-text format, so that every example has a text input and a text target

In [None]:
from datasets import concatenate_datasets

def preprocess_grammar(examples):
    """Convert a batch of grammar-acceptability rows into text-to-text pairs.

    Args:
        examples: Batch mapping with a "sentence" list and a parallel "label" list.

    Returns:
        A dict with "input_text" (each sentence prefixed with "grammar: ") and
        "target_text" ("correct" when the label equals 1, otherwise "incorrect").
    """
    prompts = []
    for sentence in examples["sentence"]:
        prompts.append(f"grammar: {sentence}")

    verdicts = []
    for label in examples["label"]:
        if label == 1:
            verdicts.append("correct")
        else:
            # Any label other than 1 is treated as "incorrect".
            verdicts.append("incorrect")

    return {"input_text": prompts, "target_text": verdicts}

def preprocess_paraphrase(examples):
    """Convert a batch of sentence pairs into text-to-text pairs.

    Args:
        examples: Batch mapping with parallel "sentence1" and "sentence2" lists.

    Returns:
        A dict with "input_text" (each first sentence prefixed with
        "paraphrase: ") and "target_text" (the second sentences, passed
        through unchanged).
    """
    prompts = []
    for first_sentence in examples["sentence1"]:
        prompts.append(f"paraphrase: {first_sentence}")

    return {"input_text": prompts, "target_text": examples["sentence2"]}

# PAWS contains both paraphrase (label 1) and non-paraphrase (label 0) pairs.
# Only the positive pairs are valid targets for a "paraphrase:" generation task,
# so drop the rest before building the text-to-text examples.
paws_dataset = paws_dataset.filter(lambda example: example["label"] == 1)

# Replace every source column with the two text columns. The corpora have
# different raw schemas (sentence/idx vs. sentence1/sentence2/id), so leaving
# any of them behind would give the concatenated table mismatched columns.
cola_dataset = cola_dataset.map(
    preprocess_grammar,
    batched=True,
    remove_columns=cola_dataset["train"].column_names,
)
paws_dataset = paws_dataset.map(
    preprocess_paraphrase,
    batched=True,
    remove_columns=paws_dataset["train"].column_names,
)

# Both training splits now share exactly the columns input_text / target_text.
combined_dataset = concatenate_datasets([
    cola_dataset["train"],
    paws_dataset["train"],
])

Tokenizing the input and target texts into token IDs that the model can consume

In [None]:
from transformers import T5Tokenizer

# Hold out 10% of the combined data for evaluation. The seed is fixed so that
# the split (and therefore the reported metrics) is reproducible across runs.
train_test_split = combined_dataset.train_test_split(test_size=0.1, seed=42)

# Tokenizer matching the t5-small checkpoint.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def tokenize_function(examples):
    """Tokenize a batch of text-to-text examples for seq2seq training.

    Uses the module-level `tokenizer`.

    Args:
        examples: Batch mapping with "input_text" and "target_text" lists.

    Returns:
        The tokenizer output for the inputs (padded/truncated to 512 tokens)
        with an added "labels" entry: the target token ids (padded/truncated
        to 128 tokens) in which every padding position is replaced by -100.
    """
    model_inputs = tokenizer(
        examples["input_text"],
        padding="max_length",
        max_length=512,
        truncation=True,
    )
    labels = tokenizer(
        text_target=examples["target_text"],
        padding="max_length",
        max_length=128,
        truncation=True,
    )

    # -100 is the index the loss ignores. Without this replacement the model is
    # also trained to predict the padding that fills most of each 128-token
    # target, which swamps the loss from the few real target tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize both splits in batches; adds input_ids, attention_mask and labels.
tokenized_dataset = train_test_split.map(
    function=tokenize_function,
    batched=True,
)

Setting up a trainer object to train model on dataset

In [None]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# Start from the pretrained t5-small checkpoint (the same one the tokenizer
# was loaded from). Without this, `model` is undefined on a fresh kernel.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy`; switch if the installed version warns about or rejects
    # `evaluation_strategy`.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer.train()

# Write the final weights to the root of the output directory. Training alone
# only leaves intermediate checkpoint-* subfolders there, so a later
# from_pretrained("./results") would otherwise find no model to load.
trainer.save_model(training_args.output_dir)

Evaluating the model using the Trainer class

In [None]:
# Evaluate on the eval_dataset that was passed to the Trainer (the held-out
# "test" split of tokenized_dataset).
eval_results = trainer.evaluate()

# Show the returned metrics.
print(f"Evaluation results: {eval_results}")

Generating an answer to the question based on a given context

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Reload the fine-tuned weights from the training output directory; the
# tokenizer is the one belonging to the base t5-small checkpoint.
model = T5ForConditionalGeneration.from_pretrained("./results")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# NOTE(review): this prompt uses a "question: ... context: ..." layout, whereas
# the fine-tuning examples are prefixed with "grammar: " / "paraphrase: ".
# Confirm this is the intended demo prompt.
input_text = "question: Is the sky blue? context: The sky is blue on a clear day."

# Encode the prompt as a batch of one PyTorch tensor.
input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"]

# Generate with the default generation settings.
output_ids = model.generate(input_ids)

# Decode the first (only) sequence, dropping special tokens such as padding/EOS.
predicted_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(f"Predicted answer: {predicted_answer}")