# Fine tuning T5 on our dataset

### /!\ Run this script on Collab

In [None]:
pip install transformers[torch] tokenizers datasets evaluate rouge_score sentencepiece huggingface_hub --upgrade

Collecting transformers[torch]
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggi

In [None]:
import nltk
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset('dzunggg/legal-qa-v1')

# Split into training set and testing set
dataset = dataset["train"].train_test_split(test_size=0.2)
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/6.21M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 2993
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 749
    })
})

In [None]:
# # For testing purpose
# dataset = load_dataset('dzunggg/legal-qa-v1', split='train[:10]')
# dataset = dataset.train_test_split(test_size=0.1)

In [None]:
# Load the tokenizer, model, and data collator
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Prefix inputs with "answer the question"
prefix = "answer the question: "

# Add prefix to sentences, tokenize and set labels
def preprocess_function(data):
    # Tokenize questions
    inputs = [prefix + doc for doc in data["question"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    # Set labels
    labels = tokenizer(text_target=data["answer"], max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Map the function to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2993 [00:00<?, ? examples/s]

Map:   0%|          | 0/749 [00:00<?, ? examples/s]

In [None]:
# Use Rouge score for evaluation
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # Decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# # Use Bleu score for evaluation
# nltk.download("punkt", quiet=True)
# metric = evaluate.load("bleu")

# def compute_metrics(eval_preds):
#     preds, labels = eval_preds

#     # decode preds and labels
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
#     return result

In [None]:
# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./training_results",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    logging_dir="./training_logs",
    logging_steps = 1,
    # per_device_train_batch_size=8,
    # per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    push_to_hub=False
)

# Set up trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Save the model
trainer.save_model('./saved_model')

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,1.9124,2.463249,0.122093,0.029613,0.099144,0.112184
2,2.1396,2.416125,0.122183,0.028685,0.098382,0.113316
3,2.8095,2.409556,0.124392,0.029425,0.100126,0.113873
4,2.0296,2.418986,0.124021,0.029265,0.099518,0.113591
5,1.5238,2.433747,0.123911,0.029173,0.099301,0.113379




In [None]:
import shutil

# Compress output folders to zip files
shutil.make_archive('saved_model', 'zip', 'saved_model')
shutil.make_archive('training_logs', 'zip', 'training_logs')
shutil.make_archive('training_results', 'zip', 'training_results')

'/content/training_results.zip'

In [14]:
# Mount to drive
from google.colab import drive
drive.mount('/content/drive')

# Copy zip files to My Drive
destination_folder_path = '/content/drive/My Drive/'

shutil.copy('training_logs.zip', destination_folder_path)
shutil.copy('saved_model.zip', destination_folder_path)
shutil.copy('training_results.zip', destination_folder_path)

Mounted at /content/drive


'/content/drive/My Drive/saved_model.zip'

In [None]:
import logging
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

logging.basicConfig(filename='warnings.log', level=logging.WARNING)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('./saved_model')
model = AutoModelForSeq2SeqLM.from_pretrained('./saved_model')

# Example input
input_text = "What is a lawyer"

# Tokenize input
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Get output
output_ids = model.generate(input_ids, max_length=50, temperature=0.9, do_sample=True)

# Decode the output
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print result
print("Q:", input_text)
print("A:", output_text)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Input Text: What is a lawyer
Generated Output: A lawyer is a person who manages cases in the legal system. They represent clients in legal matters at state, federal, and local levels. Lawyers also represent clients who have criminal or civil cases under their control. The state bar of
