In [2]:
# Mount Google Drive at /content/drive; later cells read and write the cached
# datasets and the model output directory under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%%capture
!pip install transformers datasets evaluate rouge_score bitsandbytes accelerate

In [4]:
from transformers import AutoTokenizer

# Hub identifier of the base model; reused below when the model weights are loaded.
checkpoint = "google-t5/t5-small"
# Tokenizer matching the checkpoint; used for preprocessing, metric decoding and inference.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [5]:
import os
from datasets import load_dataset, load_from_disk

# Drive locations where the tokenized, length-filtered splits are cached so that
# later sessions can load them instead of re-running the preprocessing.
train_data_path = "/content/drive/MyDrive/datasets/cnn_dailymail_train_preprocessed_512_256"
val_data_path = "/content/drive/MyDrive/datasets/cnn_dailymail_val_preprocessed_512_256"

# Task prefix prepended to every article before tokenization.
prefix = "summarize: "

def preprocess_function(examples):
    """Tokenize a batch of articles together with their reference summaries.

    Args:
        examples: batch mapping with an "article" list and a "highlights" list.

    Returns:
        The tokenizer output for the prefixed articles, with the token ids of
        the highlights stored under "labels". No truncation is requested here.
    """
    prefixed_articles = list(map(lambda doc: prefix + doc, examples["article"]))
    encoded = tokenizer(prefixed_articles)
    target_encoding = tokenizer(text_target=examples["highlights"])
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Longest tokenized article / summary that is kept; these mirror the "_512_256"
# suffix of the cache directory names.
max_input_tokens = 512
max_label_tokens = 256


def _tokenize_and_filter(split):
    """Tokenize one raw split and drop every example that exceeds either length limit."""
    tokenized = split.map(preprocess_function, batched=True)
    # Both limits are checked in a single pass over the split.
    return tokenized.filter(
        lambda x: len(x["input_ids"]) <= max_input_tokens and len(x["labels"]) <= max_label_tokens,
        batched=False,
    )


# Check if preprocessed datasets exist; if yes, load them; if not, create them.
if os.path.exists(train_data_path) and os.path.exists(val_data_path):
    print("Loading preprocessed datasets from disk...")
    dataset_train = load_from_disk(train_data_path)
    dataset_val = load_from_disk(val_data_path)
else:
    print("Preprocessed datasets not found. Downloading and processing...")
    # NOTE(review): trust_remote_code=True lets the dataset's loading script from
    # the Hub execute on this machine — only keep it for sources you trust.
    dataset = load_dataset("ccdv/cnn_dailymail", '3.0.0', trust_remote_code=True)

    # Same tokenization and length filtering for both splits
    dataset_train = _tokenize_and_filter(dataset['train'])
    dataset_val = _tokenize_and_filter(dataset['validation'])

    # Save the preprocessed datasets to your Drive
    dataset_train.save_to_disk(train_data_path)
    dataset_val.save_to_disk(val_data_path)
    print("Preprocessing complete and datasets saved to disk.")

# Report how many examples survived the length filtering
print("Number of training samples:", len(dataset_train))
print("Number of validation samples:", len(dataset_val))

Loading preprocessed datasets from disk...
Number of training samples: 42220
Number of validation samples: 2394


In [6]:
# Number of examples per split: training first, then validation
for split in (dataset_train, dataset_val):
    print(len(split))

42220
2394


In [7]:
# Token count of the longest tokenized training article
max(map(len, dataset_train['input_ids']))

512

In [8]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to the trainer below to assemble batches from tokenized examples.
# NOTE(review): `checkpoint` is the model-name string, not a loaded model object,
# and the model itself is only created in a later cell. The `model` argument is
# presumably meant to receive the model instance — confirm against the
# DataCollatorForSeq2Seq documentation.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [9]:
import evaluate

# ROUGE metric object; compute_metrics calls rouge.compute on decoded text.
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [10]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation run.

    Args:
        eval_pred: pair of (predictions, labels) holding token ids. Predictions
            may arrive wrapped in a tuple, in which case the first element is used.

    Returns:
        dict with the scores returned by rouge.compute plus "gen_len" (mean
        number of non-padding tokens per prediction), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the generated ids; keep the ids only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and is not a valid token id, so it has to be
    # swapped for the pad token before decoding — in the predictions as well as
    # in the labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length statistics are taken after the -100 replacement so padding is not counted.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [11]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, BitsAndBytesConfig
import torch

# bitsandbytes quantization settings. Currently unused: the quantization_config
# argument of from_pretrained below is commented out, so the model is loaded
# without quantization.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # 8-bit quantization
    # 4-bit alternative, kept for reference:
    # bnb_4bit_quant_type='nf4',  # Normalized float 4
    # bnb_4bit_use_double_quant=True,  # Second quantization after the first
    # bnb_4bit_compute_dtype=torch.bfloat16  # Computation type
)

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint,
                                              # quantization_config=bnb_config
                                              )
# Default generation length lives on the generation config; setting it on
# model.config is the deprecated route for controlling generation.
model.generation_config.max_length = 256

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:
from peft import LoraConfig, get_peft_model, TaskType

# lora_config = LoraConfig(
#     r=64,
#     lora_alpha=1,
#     target_modules=["q", "v"],  # Targeting 'q' and 'v' modules
#     lora_dropout=0.05,
#     bias="none",
#     task_type=TaskType.SEQ_2_SEQ_LM  # For T5, use SEQ_2_SEQ_LM
# )

# model = get_peft_model(model, lora_config)

In [13]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: object exposing named_parameters(), yielding (name, parameter)
            pairs whose parameters provide numel() and requires_grad.

    Prints one line with the trainable count, the total count and the
    trainable percentage. A model without any parameters reports 0.0 percent
    instead of raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: a parameter-free model would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [14]:
# Report the parameter counts of the loaded model (no adapter is applied, the
# LoRA wrapping above is commented out).
print_trainable_parameters(model)

trainable params: 60506624 || all params: 60506624 || trainable%: 100.0


In [15]:
# Where checkpoints of this run are written
run_output_dir = "/content/drive/MyDrive/t5-small-finetuned-cnn_dailymail_optimized6"

# Settings for a single-epoch fine-tuning run. Each optimizer step accumulates
# 16 batches of 32 examples per device; evaluation, logging and checkpointing
# all happen every 10 steps.
training_args = Seq2SeqTrainingArguments(
    output_dir=run_output_dir,
    report_to="none",
    # optimisation
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=16,
    fp16=True,  # change to bf16=True for XPU
    # evaluation with generated summaries
    eval_strategy="steps",
    eval_steps=10,
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    generation_max_length=256,
    # logging and checkpoints
    logging_steps=10,
    save_steps=10,
    save_total_limit=3,
)

# Bundle model, data, collator and metric function into the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
)

In [16]:
# Baseline: evaluate on the validation split before any training step is taken.
trainer.evaluate()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


{'eval_loss': 2.072248935699463,
 'eval_model_preparation_time': 0.0031,
 'eval_rouge1': 0.3949,
 'eval_rouge2': 0.1874,
 'eval_rougeL': 0.2909,
 'eval_rougeLsum': 0.2908,
 'eval_gen_len': 56.6604,
 'eval_runtime': 121.2103,
 'eval_samples_per_second': 19.751,
 'eval_steps_per_second': 0.619}

In [17]:
# Run fine-tuning with the arguments configured above; evaluation metrics are
# reported at the configured step interval.
trainer.train()

Step,Training Loss,Validation Loss,Model Preparation Time,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
10,2.0548,1.929504,0.0031,0.4208,0.2076,0.3118,0.3118,59.9227
20,1.9226,1.831625,0.0031,0.4319,0.2162,0.3197,0.3196,62.3275
30,1.8709,1.768779,0.0031,0.4323,0.2166,0.3203,0.3203,63.231
40,1.8072,1.724023,0.0031,0.4308,0.2157,0.3199,0.3198,62.7974
50,1.769,1.694199,0.0031,0.4283,0.2138,0.3179,0.3178,62.7272
60,1.7513,1.676522,0.0031,0.4266,0.2132,0.3171,0.317,62.7164
70,1.7246,1.667229,0.0031,0.4264,0.2134,0.317,0.317,62.7794
80,1.7369,1.663638,0.0031,0.4271,0.2139,0.3178,0.3177,62.9499




TrainOutput(global_step=82, training_loss=1.8275028379952036, metrics={'train_runtime': 1756.6419, 'train_samples_per_second': 24.034, 'train_steps_per_second': 0.047, 'total_flos': 5627275257053184.0, 'train_loss': 1.8275028379952036, 'epoch': 0.9939393939393939})

In [18]:
# Write the fine-tuned weights and the tokenizer files into the same Drive folder
export_dir = "/content/drive/MyDrive/t5-small-finetuned-cnn_dailymail_optimized6"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('/content/drive/MyDrive/t5-small-finetuned-cnn_dailymail_optimized6/tokenizer_config.json',
 '/content/drive/MyDrive/t5-small-finetuned-cnn_dailymail_optimized6/special_tokens_map.json',
 '/content/drive/MyDrive/t5-small-finetuned-cnn_dailymail_optimized6/spiece.model',
 '/content/drive/MyDrive/t5-small-finetuned-cnn_dailymail_optimized6/added_tokens.json',
 '/content/drive/MyDrive/t5-small-finetuned-cnn_dailymail_optimized6/tokenizer.json')

In [19]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the fine-tuned model and tokenizer
# NOTE: this rebinds `model` and `tokenizer` to the copies reloaded from Drive.
model_path = "/content/drive/MyDrive/t5-small-finetuned-cnn_dailymail_optimized6"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Example input text (replace this with your own article).
# Index the row first so only this one example is read instead of the whole column.
article = dataset_val[0]["article"]

# Prepend the same prefix that was used when preprocessing the training data
input_text = prefix + article

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding="longest")

# Generate the summary. You can adjust parameters like max_length and num_beams as needed.
# Unpacking `inputs` forwards the attention mask together with the input ids.
summary_ids = model.generate(**inputs,
                             max_length=256,
                             num_beams=4,
                             early_stopping=True)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(input_text)
print("Summary:", summary)


summarize: (CNN)Singer-songwriter David Crosby hit a jogger with his car Sunday evening, a spokesman said. The accident happened in Santa Ynez, California, near where Crosby lives. Crosby was driving at approximately 50 mph when he struck the jogger, according to California Highway Patrol Spokesman Don Clotworthy. The posted speed limit was 55. The jogger suffered multiple fractures, and was airlifted to a hospital in Santa Barbara, Clotworthy said. His injuries are not believed to be life threatening. "Mr. Crosby was cooperative with authorities and he was not impaired or intoxicated in any way. Mr. Crosby did not see the jogger because of the sun," said Clotworthy. According to the spokesman, the jogger and Crosby were on the same side of the road. Pedestrians are supposed to be on the left side of the road walking toward traffic, Clotworthy said. Joggers are considered pedestrians. Crosby is known for weaving multilayered harmonies over sweet melodies. He belongs to the celebrated r