In [1]:
 pip install transformers torch datasets evaluate transformers[torch] evaluate nltk rouge_score 

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu1

In [5]:
import os
import torch
from datasets import load_dataset, load_metric
import evaluate
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np

# Load the 'qanta' dataset in its 'mode=first,char_skip=25' configuration.
# NOTE: trust_remote_code=True is an explicit opt-in that allows `datasets` to execute
# the dataset's own loading script; it is not a safety measure, so only use it with
# dataset repositories you trust.
data = load_dataset('qanta', 'mode=first,char_skip=25', trust_remote_code=True)
# Tokenizer matching the 't5-base' checkpoint that is fine-tuned below
# (legacy=False selects the non-legacy tokenizer behaviour).
tokenizer = T5Tokenizer.from_pretrained('t5-base', legacy=False)

def preprocess_data(examples):
  """Turn a batch of raw examples into tokenized model inputs with labels.

  Each entry of ``examples['full_question']`` is prefixed with "question: " and
  encoded to exactly 512 token ids (truncated or padded to that length). Each
  entry of ``examples['answer']`` is encoded to exactly 128 token ids, which are
  stored under the ``'labels'`` key of the returned encoding.

  NOTE(review): padded label positions keep the tokenizer's pad id instead of
  being masked, so they presumably take part in the training loss. Confirm
  whether that is intended before changing it, because compute_metrics decodes
  these label ids directly.
  """
  prompts = []
  for question in examples['full_question']:
    prompts.append("question: " + question)
  answers = examples['answer']

  # Encode the prompts (model inputs).
  encoded_inputs = tokenizer(
    prompts,
    max_length=512,
    truncation=True,
    padding="max_length",
    return_tensors='pt',
  )
  # Encode the answers (targets).
  encoded_targets = tokenizer(
    answers,
    max_length=128,
    truncation=True,
    padding="max_length",
    return_tensors='pt',
  )

  encoded_inputs['labels'] = encoded_targets['input_ids']
  return encoded_inputs

import nltk
# f1_metric = evaluate.load("f1")
# Metric objects used by compute_metrics below.
exact_match_metric = evaluate.load("exact_match")
# Fetch NLTK's "punkt" resource without printing download progress.
nltk.download("punkt", quiet=True)
# Loading "rouge" fails with an ImportError unless the rouge_score and absl
# packages are installed (this is the error recorded in this cell's output).
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
  """Score generated answers against the reference answers.

  Args:
    eval_pred: ``(predictions, labels)`` pair handed over by the trainer. The
      trainer in this notebook is configured with ``predict_with_generate=True``,
      so ``predictions`` are expected to be generated token ids of shape
      (batch, seq). Raw logits of shape (batch, seq, vocab) are still accepted
      and reduced with an argmax over the last axis.

  Returns:
    dict: the ROUGE scores merged with the exact-match score.
  """
  predictions, labels = eval_pred
  # Defensive: if the model returned several outputs, the sequences come first.
  if isinstance(predictions, tuple):
    predictions = predictions[0]
  predictions = np.asarray(predictions)
  labels = np.asarray(labels)

  # Only logits need an argmax. Applying it to already-generated ids would
  # collapse every sequence to a single position index and decode to garbage.
  if predictions.ndim == 3:
    predictions = np.argmax(predictions, axis=-1)

  # -100 is the ignore/padding marker used when batches are gathered; it is not a
  # valid vocabulary id, so map it back to the pad token before decoding.
  pad_id = tokenizer.pad_token_id
  predictions = np.where(predictions != -100, predictions, pad_id)
  labels = np.where(labels != -100, labels, pad_id)

  decoded_preds = [
    text.strip() for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
  ]
  decoded_labels = [
    text.strip() for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
  ]

  rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
  exact_match_result = exact_match_metric.compute(predictions=decoded_preds, references=decoded_labels)

  # Single flat dict so the trainer logs every score.
  return {**rouge_result, **exact_match_result}
  


# Tokenize every split in batches. The listed metadata columns are dropped from the
# result, and load_from_cache_file=False forces the preprocessing to be recomputed
# instead of reusing a cached result.
tokenized_datasets = data.map(preprocess_data, batched=True, load_from_cache_file=False,
                              remove_columns=['id', 'qanta_id', 'proto_id', 'qdb_id', 'dataset', 'full_question',
                                              'first_sentence', 'char_idx', 'sentence_idx', 'tokenizations', 'page',
                                              'raw_answer', 'fold', 'gameplay', 'category', 'subcategory', 'tournament', 'year'])

# NOTE: despite the `small_` prefix no subset is selected here; these are the complete
# 'buzztrain' and 'buzzdev' splits, only shuffled with a fixed seed.
small_train_dataset = tokenized_datasets['buzztrain'].shuffle(seed=42)
small_eval_dataset = tokenized_datasets['buzzdev'].shuffle(seed=42)
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Training configuration: one epoch, evaluation at the end of each epoch, checkpoints
# written under ./models. Each optimizer step accumulates 3 batches of 3 examples per
# device. predict_with_generate=True makes evaluation use generation, so
# compute_metrics is given generated token ids rather than logits.
training_args = Seq2SeqTrainingArguments(
  output_dir="./models",
  evaluation_strategy="epoch",
  learning_rate=0.0003,
  gradient_accumulation_steps=3,
  per_device_train_batch_size=3,
  num_train_epochs=1,
  weight_decay=0.01,
  seed=42,
  predict_with_generate=True
)

# Wire the model, data, tokenizer and metric function together.
trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  train_dataset=small_train_dataset,
  eval_dataset=small_eval_dataset,
  tokenizer=tokenizer,
  compute_metrics=compute_metrics
)

# Fine-tune, then upload the model and tokenizer to the Hugging Face Hub repository
# "finetuned-t5-qanta". NOTE(review): pushing presumably requires prior Hub
# authentication, which is not shown in this notebook. Confirm.
trainer.train()
model.push_to_hub("finetuned-t5-qanta")
tokenizer.push_to_hub("finetuned-t5-qanta")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


ImportError: To be able to use evaluate-metric/rouge, you need to install the following dependencies['rouge_score', 'absl'] using 'pip install rouge_score # Here to have a nice missing dependency error message early on' for instance'

In [4]:
# Mount Google Drive at /content/drive so files can be written to it from this
# Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Recursively archive /content/models (checkpoints and saved files, as listed in the
# output below) into a zip file on the mounted Google Drive.
!zip -r /content/drive/MyDrive/t5model_token.zip /content/models

  adding: content/models/ (stored 0%)
  adding: content/models/checkpoint-500/ (stored 0%)
  adding: content/models/checkpoint-500/generation_config.json (deflated 27%)
  adding: content/models/checkpoint-500/added_tokens.json (deflated 83%)
  adding: content/models/checkpoint-500/model.safetensors (deflated 8%)
  adding: content/models/checkpoint-500/scheduler.pt (deflated 55%)
  adding: content/models/checkpoint-500/rng_state.pth (deflated 25%)
  adding: content/models/checkpoint-500/training_args.bin (deflated 51%)
  adding: content/models/checkpoint-500/config.json (deflated 47%)
  adding: content/models/checkpoint-500/spiece.model (deflated 48%)
  adding: content/models/checkpoint-500/trainer_state.json (deflated 66%)
  adding: content/models/checkpoint-500/optimizer.pt (deflated 15%)
  adding: content/models/checkpoint-500/special_tokens_map.json (deflated 85%)
  adding: content/models/checkpoint-500/tokenizer_config.json (deflated 94%)
  adding: content/models/generation_config.