In [2]:
from datasets import load_dataset
import torch

# Load the BookSum dataset
dataset = load_dataset("kmfoda/booksum")

In [3]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_data(examples):
    inputs = examples['chapter']
    targets = examples['summary']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=150, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_datasets = dataset.map(preprocess_data, batched=True)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [1]:
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("fnando1995/t5-small-finetuned-booksum-pgc")

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [28]:
from tqdm import tqdm

def generate_summaries(dataset, model, tokenizer, device):
    model.to(device)
    inputs = tokenizer(dataset['chapter'], return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=1000)
    summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
    return summaries

device = "cuda" if torch.cuda.is_available() else "cpu"
generated_summaries = generate_summaries(tokenized_datasets['validation'][:3], model, tokenizer, device)

# Prepare references and predictions
references = tokenized_datasets['validation'][:3]['summary']
predictions = generated_summaries

In [29]:
references

['{"name": "Chapters 1-4", "url": "https://web.archive.org/web/20210421025427/https://www.gradesaver.com/bleak-house/study-guide/summary-chapters-1-4", "summary": "The scene opens in London on a foggy, smoggy day. The High Court of Chancery is in session, and it appears that the fog has settled thickest on this part of London. This is where the legal suit of Jarndyce and Jarndyce is being argued. A little mad old woman , and a man from Shropshire are in attendance. A \\"sallow prisoner\\" is brought forward. Mr. Tangle, a lawyer, speaks with the Lord High Chancellor, and the matter of the two young wards in Jarndyce is discussed. This matter will come up before the court tomorrow. The scene changes in Chapter 2 to Chesney Wold, a stately home in Lincolnshire. Here the weather is also bad, but it is constant rain rather than fog. The lady of the manor, Lady Dedlock, is bored to death. She had been in London, and has come back to the country seat before leaving for Paris in a few days. L

In [30]:
predictions

['"name": "Chapter XXXI", "url": "https://web.archive.org/web/202103030303030/https://www.shmoop.com/study-guides/literature/the-sister-of-the-seven-years-old/summary/chapter-xxii", "summary": "The Lord Chancellor is sitting in Lincoln\'s Inn Hall. Michaelmas is a snoring a snort, and he is a shivering in the streets. The narrator is a',
 '"name": "Chapter XXXI", "url": "https://web.archive.org/web/202103030303030/https://www.shmoop.com/study-guides/literature/david-copperfield/summary/chapter-xxii", "summary": "Ada and Peepy are preparing for a walk. They are preparing for a walk. She is astonished by the fact that she is not tired, and that she is not tired. She is a bit tired, and she is a bit tired. She is',
 '"name": "Chapter XXXI", "url": "https://web.archive.org/web/202103030303030/https://www.shmoop.com/study-guides/literature/david-copperfield/summary/chapter-xxii", "summary": "The night before, the sun sets and the sun sets. The sun sets and the sun sets. The sun sets and the

In [13]:
from datasets import load_metric

# Load ROUGE
rouge_metric = load_metric("rouge",trust_remote_code=True)

# Compute ROUGE scores
rouge_results = rouge_metric.compute(predictions=predictions, references=references, use_stemmer=True)
print("ROUGE Results:", rouge_results)


ROUGE Results: {'rouge1': AggregateScore(low=Score(precision=0.6451612903225806, recall=0.031197301854974702, fmeasure=0.0596774193548387), mid=Score(precision=0.7038654918359218, recall=0.03607218580319185, fmeasure=0.06856918057677296), high=Score(precision=0.78125, recall=0.039761431411530816, fmeasure=0.0749063670411985)), 'rouge2': AggregateScore(low=Score(precision=0.29508196721311475, recall=0.01350210970464135, fmeasure=0.02584814216478191), mid=Score(precision=0.3153917029574314, recall=0.01593940829273807, fmeasure=0.030319426815217296), high=Score(precision=0.34920634920634913, recall=0.01791044776119403, fmeasure=0.03377110694183865)), 'rougeL': AggregateScore(low=Score(precision=0.5967741935483871, recall=0.02782462057335582, fmeasure=0.05322580645161292), mid=Score(precision=0.6265867682198327, recall=0.03221522446402069, fmeasure=0.061226875184067914), high=Score(precision=0.671875, recall=0.036779324055666, fmeasure=0.0692883895131086)), 'rougeLsum': AggregateScore(low=

In [17]:
# Compute BLEU scores
from nltk.translate.bleu_score import corpus_bleu
bleu_metric = load_metric("bleu",trust_remote_code=True)

# Tokenize references and predictions for BLEU
tokenized_references = [[ref.split()] for ref in references]
tokenized_predictions = [[pred.split()] for pred in predictions]

# Compute BLEU score
bleu_results = bleu_metric.compute(predictions=tokenized_predictions, references=tokenized_references)
print("BLEU Results:", bleu_results)


BLEU Results: {'bleu': 0.0, 'precisions': [0.0, 0.0, 0.0, 0.0], 'brevity_penalty': 0.0, 'length_ratio': 0.0008782201405152225, 'translation_length': 3, 'reference_length': 3416}


In [15]:
# Compute METEOR score
meteor_metric = load_metric("meteor",trust_remote_code=True)
meteor_results = meteor_metric.compute(predictions=predictions, references=references)
print("METEOR Results:", meteor_results)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emmanuelmoran\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emmanuelmoran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\emmanuelmoran\AppData\Roaming\nltk_data...


METEOR Results: {'meteor': 0.023737822454457606}
