In [1]:
!nvidia-smi

Mon Apr 28 18:53:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.120                Driver Version: 550.120        CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |


|   0  NVIDIA H100 NVL                Off |   00000000:CA:00.0 Off |                    0 |
| N/A   32C    P0             63W /  400W |      14MiB /  95830MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
+-----------------------------------------------------------------------------------------+


In [2]:
# Install dependencies
!pip install "transformers[sentencepiece]" datasets rouge_score sacrebleu py7zr accelerate tqdm -q 

In [3]:
# import libraries
from transformers import pipeline, set_seed, AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
import torch
import nltk
nltk.download('punkt')


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Check if cuda is available or not

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


Load the tokenizer

In [19]:
model_id = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_id)

Load the model
This may take some time because the model size is more than 3Gb

In [20]:
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Load dataset

In [21]:
samsum_dataset = load_dataset("samsum")

In [23]:
# sample a single example
samsum_dataset['train'][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}

Some metadata

In [24]:

split_lengths = [len(samsum_dataset[a]) for a in samsum_dataset]
print(f"Dataset split lengths: {split_lengths}")

print(f"Feature names: {samsum_dataset['train'].column_names}")
print("\nDialogue example:")
print(samsum_dataset['train'][0]['dialogue'])

print("\nSummary example:")
print(samsum_dataset['train'][0]['summary'])
    

Dataset split lengths: [14732, 819, 818]
Feature names: ['id', 'dialogue', 'summary']

Dialogue example:
Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)

Summary example:
Amanda baked cookies and will bring Jerry some tomorrow.


### Process data for training

In [25]:
def convert_example_to_features(example):
    input_encoding = tokenizer(
        example['dialogue'],
        max_length=1024,
        truncation=True,
    )
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            example['summary'],
            max_length=128,
            truncation=True,
        )
    return {
        'input_ids': input_encoding['input_ids'],
        'attention_mask': input_encoding['attention_mask'],
        'labels': target_encoding['input_ids'],
    }

In [26]:
dataset_samsum_pt = samsum_dataset.map(convert_example_to_features, batched=True)

In [27]:
dataset_samsum_pt

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

In [28]:
dataset_samsum_pt['train']['input_ids'][1]

[18038,
 151,
 2632,
 127,
 119,
 6228,
 118,
 115,
 136,
 2974,
 152,
 10463,
 151,
 35884,
 130,
 329,
 107,
 18038,
 151,
 2587,
 314,
 1242,
 10463,
 151,
 1509,
 1]

In [29]:
dataset_samsum_pt['train']['attention_mask'][1]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [30]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [16]:
import evaluate
rouge = evaluate.load("rouge")
import numpy as np

In [17]:

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

### Training

In [18]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model_pegasus,
)

NameError: name 'model_pegasus' is not defined

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./pegasus-samsum",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    fp16=True,
    gradient_checkpointing=True,  # Add this to save memory
)

In [20]:
trainer = Trainer(
    model=model_pegasus,
    args=training_args,
    train_dataset=dataset_samsum_pt['train'],
    eval_dataset=dataset_samsum_pt['validation'],
    # tokenizer=tokenizer,
    processing_class= tokenizer,
    data_collator=seq2seq_data_collator,
    compute_metrics=compute_metrics,
)

In [21]:
#start training
trainer.train()

Epoch,Training Loss,Validation Loss




: 

In [21]:
def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer,
                                batch_size=16, device="cuda" if torch.cuda.is_available() else "cpu",
                                column_text="article",
                                column_summary="highlights"):
        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))

        for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total=len(article_batches)):

            inputs = tokenizer(article_batch, max_length=1024,  truncation=True,
                            padding="max_length", return_tensors="pt")

            summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                            attention_mask=inputs["attention_mask"].to(device),
                            length_penalty=0.8, num_beams=8, max_length=128)
            ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''

            # Finally, we decode the generated texts,
            # replace the  token, and add the decoded texts with the references to the metric.
            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                    clean_up_tokenization_spaces=True)
                for s in summaries]

            decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
            metric.add_batch(predictions=decoded_summaries, references=target_batch)

        #  Finally compute and return the ROUGE scores.
        score = metric.compute()
        return score

In [24]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_metric = load_metric('rouge')

NameError: name 'load_metric' is not defined