# **Text Summarization with T5**

In [1]:
!pip install transformers datasets sentencepiece accelerate peft -q

In [38]:
import torch
import numpy as np
import pandas as pd

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

## **Read Dataset**

In [4]:
# NOTE(review): load_dataset is already imported in the import cell above, so
# this repeated import is redundant.
from datasets import load_dataset

# Load the XSum dataset; the result is a DatasetDict with train / validation /
# test splits (see the repr printed by the next cell).
xsum = load_dataset("xsum")

In [5]:
# Display the DatasetDict: split names, column names and row counts.
xsum

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [6]:
# (rows, columns) for each split.
xsum.shape

{'train': (204045, 3), 'validation': (11332, 3), 'test': (11334, 3)}

In [7]:
# Look at the first training example (its 'document', 'summary' and 'id' fields).
xsum["train"][0]

 'summary': 'Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.',
 'id': '35232142'}

## **Fine-Tune the T5 Model**

In [8]:
# Base checkpoint to fine-tune; the same identifier is used here to load the
# tokenizer and in a later cell to load the model weights.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [9]:
# Task prompt prepended to every document before tokenization.
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of XSum examples for seq2seq training.

    Args:
        examples: A batch with a "document" list (source texts) and a
            "summary" list (target texts).

    Returns:
        The tokenizer output for the prefixed documents (truncated to
        1024 tokens) with an extra "labels" entry holding the token ids
        of the summaries (truncated to 128 tokens).
    """
    encoded = tokenizer(
        [prefix + article for article in examples["document"]],
        max_length=1024,
        truncation=True,
    )

    # Targets are tokenized separately so they get their own length limit.
    targets = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )
    encoded["labels"] = targets["input_ids"]
    return encoded

In [10]:
# Tokenize every split. batched=True hands preprocess_function a batch (dict of
# lists) per call, which is the shape that function iterates over.
tokenized_xsum = xsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/204045 [00:00<?, ? examples/s]

Map:   0%|          | 0/11332 [00:00<?, ? examples/s]

Map:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [11]:
# The splits now also carry 'input_ids', 'attention_mask' and 'labels'.
tokenized_xsum

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11334
    })
})

In [12]:
# Drop the raw text / id columns so only the model inputs remain.
# The cell rebinds `tokenized_xsum`, so on a second run the columns are already
# gone and an unconditional remove_columns call raises. Filtering against the
# columns that are still present keeps the cell safe to re-run.
raw_columns = ["document", "summary", "id"]
leftover_columns = [
    name for name in raw_columns if name in tokenized_xsum["train"].column_names
]
if leftover_columns:
    tokenized_xsum = tokenized_xsum.remove_columns(leftover_columns)

In [13]:
# Only 'input_ids', 'attention_mask' and 'labels' should be left in each split.
tokenized_xsum

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11334
    })
})

In [14]:
# Collator used by the Trainer to assemble seq2seq batches.
# NOTE(review): `checkpoint` is the string "t5-small" at this point, not a model
# instance (the model is only loaded in the next cell) — confirm that a string
# is what DataCollatorForSeq2Seq's `model` argument is meant to receive.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [15]:
# Load the pretrained base model that the LoRA adapter will be attached to.
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [16]:
# LoRA adapter configuration passed to get_peft_model in the next cell.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # Sequence-to-sequence language-modelling task
    inference_mode=False,  # Not inference-only: the adapter is going to be trained
    r=8,  # LoRA rank (inner dimension of the A/B matrices)
    lora_alpha=32,  # LoRA scaling factor
    lora_dropout=0.1  # Dropout probability applied in the LoRA layers
)

In [17]:
# Wrap the base model with the LoRA adapter described by peft_config.
peft_model = get_peft_model(model, peft_config)

In [25]:
import os

from huggingface_hub import login

# text-summarization-T5
# Never commit an access token to a notebook. The token is read from the
# HF_TOKEN environment variable; when it is unset, login(token=None) falls back
# to the interactive prompt.
# The token that was previously hardcoded in this cell must be treated as
# leaked and revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

In [27]:
# Training configuration for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./t5-small-peft-lora",
    # 8 samples per device x 8 accumulation steps = 64 samples per optimizer
    # step on each device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    num_train_epochs=2,
    # Log, evaluate and checkpoint on the same 100-step cadence, keeping at
    # most 3 checkpoints on disk.
    logging_steps=100,
    # NOTE(review): newer transformers releases spell this `eval_strategy` —
    # confirm against the installed version before renaming.
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=3,
    push_to_hub=True,
    # `hub_model_id` replaces the deprecated `push_to_hub_model_id`; a bare
    # repository name is created under the logged-in user's namespace.
    hub_model_id="text-summarization-T5",
    report_to="none",
)



In [28]:
# Wire the LoRA-wrapped model, the training arguments, the tokenized train and
# validation splits, the tokenizer and the collator into a Trainer.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_xsum["train"],
    eval_dataset=tokenized_xsum["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [29]:
# Run fine-tuning. Training and validation loss are reported every 100 steps
# (see the table in the output); the recorded run took about 24,378 s.
trainer.train()



Step,Training Loss,Validation Loss
100,3.8764,3.637598
200,3.6129,3.263052
300,3.3392,3.024847
400,3.207,2.92941
500,3.1548,2.872545
600,3.0969,2.833324
700,3.0718,2.801751
800,3.0476,2.780329
900,3.0431,2.765088
1000,3.0216,2.753767




TrainOutput(global_step=3188, training_loss=3.052526294512013, metrics={'train_runtime': 24377.7275, 'train_samples_per_second': 16.74, 'train_steps_per_second': 0.131, 'total_flos': 1.087759190902702e+17, 'train_loss': 3.052526294512013, 'epoch': 1.9998431741551008})

## **Save the Model to the Hugging Face Hub**

In [30]:
# Upload the training output to the Hub.
# NOTE(review): the positional argument ends up as the commit message (see
# commit_message in the CommitInfo output), not as the repository name.
trainer.push_to_hub("text-summarization-T5")

CommitInfo(commit_url='https://huggingface.co/danfarh2000/text-summarization-T5/commit/4e0794bfe43ccf4ed56b39303e67237798ee5771', commit_message='text-summarization-T5', commit_description='', oid='4e0794bfe43ccf4ed56b39303e67237798ee5771', pr_url=None, repo_url=RepoUrl('https://huggingface.co/danfarh2000/text-summarization-T5', endpoint='https://huggingface.co', repo_type='model', repo_id='danfarh2000/text-summarization-T5'), pr_revision=None, pr_num=None)

In [31]:
# Push the adapter weights and the tokenizer to the same Hub repository.
# In the recorded run the log reports "No files have been modified since last
# commit" for these uploads, so they created no new commit.
peft_model.push_to_hub("text-summarization-T5")
tokenizer.push_to_hub("text-summarization-T5")

README.md:   0%|          | 0.00/2.94k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/danfarh2000/text-summarization-T5/commit/4e0794bfe43ccf4ed56b39303e67237798ee5771', commit_message='Upload tokenizer', commit_description='', oid='4e0794bfe43ccf4ed56b39303e67237798ee5771', pr_url=None, repo_url=RepoUrl('https://huggingface.co/danfarh2000/text-summarization-T5', endpoint='https://huggingface.co', repo_type='model', repo_id='danfarh2000/text-summarization-T5'), pr_revision=None, pr_num=None)

## **Test the model**

In [36]:
# Reload tokenizer and model from the Hub repository pushed above. The download
# log shows adapter_config.json / adapter_model.safetensors being fetched, i.e.
# the repository holds the LoRA adapter.
# NOTE(review): this rebinds `checkpoint`, `tokenizer` and `model`; re-running
# the training cells after this one would pick up these new values.
checkpoint = "danfarh2000/text-summarization-T5"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/696 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

In [39]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Last expression of the cell, so the notebook prints the full module repr.
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): lora.Linear(
                (base_layer): Linear(in_features=512, out_features=512, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=512, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=512, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k): Linear(in_feature

In [42]:
def generate_summary(input_text, max_length=100, num_beams=4):
    """Summarize ``input_text`` with the notebook-level ``model`` and ``tokenizer``.

    The text is prefixed with the "summarize: " task prompt, tokenized
    (truncated to 1024 tokens), moved to ``device`` and decoded with
    beam search.

    Args:
        input_text: The document to summarize.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.

    Returns:
        The first generated sequence, decoded to a string with special
        tokens skipped.
    """
    prompt = "summarize: " + input_text
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    ).to(device)

    generated = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )

    # Only the first returned sequence is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [72]:
# Summarize the first article of the test split.
generate_summary(xsum["test"][0]['document'])

'Prison Link Cymru, a charity that helps homeless people in Wales, says it is a "chronic" need for accommodation.'

In [73]:
# Summarize the second article of the test split.
generate_summary(xsum["test"][1]['document'])

'Police have recovered three firearms, ammunition and a five-figure sum of money.'

In [55]:
# Hand-written sample that is not taken from XSum (biography-style text).
sample_text1 = """
Steve Jobs (1955-2011) was an American entrepreneur, inventor, and co-founder of Apple Inc. He was born in San Francisco, California, and was adopted by Paul and Clara Jobs. As a child, Jobs showed an early interest in electronics and technology, and he built his first computer with his friend Steve Wozniak while still in high school.
After dropping out of college, Jobs co-founded Apple Computer in 1976 with Wozniak and Ronald Wayne. Apple's first product was the Apple I personal computer, which was followed by the Apple II, which became a huge success and established Apple as a major player in the emerging personal computer industry.
"""

generate_summary(sample_text1)

'Steve Jobs was an American entrepreneur, inventor, and co-founder of Apple Inc.'

In [56]:
# Second hand-written sample that is not taken from XSum.
sample_text2 = """
Cristiano Ronaldo dos Santos Aveiro (born February 5, 1985) is a Portuguese professional footballer who currently plays as a forward for Manchester United and the Portugal national team. He is widely considered one of the greatest football players of all time, and has won numerous awards and titles throughout his career, including five Ballon d'Or awards. Ronaldo has also been a prolific scorer, and holds the record for most goals scored in the UEFA Champions League. He has also been involved in various philanthropic efforts, including donating millions of dollars to charity.
"""

generate_summary(sample_text2)

'Cristiano Ronaldo dos Santos Aveiro (born February 5, 1985) is a Portuguese professional footballer.'