In [None]:

!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score

In [2]:
import torch
import pprint
import evaluate
import numpy as np

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

In [3]:
import os
from huggingface_hub import login
# Login with your Hugging Face token
login("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")


In [10]:
# Load the tokenizer and base model
tokenizer=T5Tokenizer.from_pretrained("google-t5/t5-small")


In [11]:
# Load dataset
ds = load_dataset("antash420/text-summarization-alpaca-format")

# Sample 5000 rows from the train, validation, and test sets
train_sample = ds['train'].shuffle(seed=42).select(range(2000))
validation_sample = ds['validation'].shuffle(seed=42).select(range(300))  # Adjust if needed
test_sample = ds['test'].shuffle(seed=42).select(range(100))  # Adjust if needed


In [12]:
# Create a new DatasetDict
reduced_dataset = {
    'train': train_sample,
    'validation': validation_sample,
    'test': test_sample,
}


In [13]:
from datasets import get_dataset_config_names
from datasets import load_dataset,DatasetDict
# Tokenize the dataset
def preprocess_function(examples):
    inputs = tokenizer(examples['input'], truncation=True, padding="max_length", max_length=512)
    targets = tokenizer(examples['output'], truncation=True, padding="max_length", max_length=512)
    inputs['labels'] = targets['input_ids']
    return inputs

tokenized_datasets = DatasetDict({
    split: reduced_dataset[split].map(preprocess_function, batched=True)
    for split in ['train', 'validation', 'test']
})

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [14]:
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

60,506,624 total parameters.
60,506,624 training parameters.


In [15]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [16]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

In [18]:
BATCH_SIZE = 4
NUM_PROCS = 4
EPOCHS = 10
OUT_DIR = 'results_t5base'
MAX_LENGTH = 512
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=0.0001,
    dataloader_num_workers=4
)



In [19]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, Trainer
import torch
# Create Trainer instance with PEFT model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics)


In [20]:
# Train the model
history=trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
200,0.4367,0.374649,0.5762,0.3066,0.5253,82.2267
400,0.3003,0.307762,0.5946,0.3274,0.5438,83.1767
600,0.315,0.301873,0.5976,0.3305,0.5471,83.3167
800,0.309,0.300104,0.5994,0.3334,0.5486,83.3367
1000,0.2873,0.2989,0.5994,0.3354,0.5477,83.34
1200,0.2534,0.298391,0.6001,0.3341,0.5482,83.3367
1400,0.2825,0.298281,0.5989,0.333,0.5481,83.3433
1600,0.2788,0.298218,0.6027,0.3374,0.551,83.3433
1800,0.3025,0.298719,0.6002,0.3354,0.5483,83.3433
2000,0.2799,0.299093,0.5994,0.3347,0.5484,83.34




In [30]:
# Save the tokenizer explicitly
tokenizer.save_pretrained(OUT_DIR)

# Save the model explicitly
model.save_pretrained(OUT_DIR)


In [31]:
# Load the saved tokenizer and model
tokenizer = T5Tokenizer.from_pretrained(OUT_DIR)
model = T5ForConditionalGeneration.from_pretrained(OUT_DIR)


In [33]:
# Push model and tokenizer to Hugging Face Hub
model.push_to_hub("dheerajnarne/textsummarizer", use_auth_token=True)
tokenizer.push_to_hub("dheerajnarne/textsummarizer", use_auth_token=True)



model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dheerajnarne/textsummarizer/commit/d6813d28d9b24a53832f0eb71fd88fd3e4053bc0', commit_message='Upload tokenizer', commit_description='', oid='d6813d28d9b24a53832f0eb71fd88fd3e4053bc0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dheerajnarne/textsummarizer', endpoint='https://huggingface.co', repo_type='model', repo_id='dheerajnarne/textsummarizer'), pr_revision=None, pr_num=None)

In [36]:
# Ensure the model is on the correct device (cuda or cpu)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [38]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Use the pipeline as a high-level helper
pipe = pipeline("text2text-generation", model="dheerajnarne/textsummarizer")

# Example text to summarize
text = """
Yemen Sanctions Kerala Nurse's Death Sentence For Murder, India RespondsWith Yemen President Rashad al-Alimi also rejecting Nimisha Priya's appeal, her release depended on securing forgiveness from the victim's family and their tribal leaders and paying them blood money.
Written by:
Sanstuti Nath
World News
Dec 31, 2024 11:38 am IST
Published On
Dec 31, 2024 09:37 am IST
Last Updated On
Dec 31, 2024 11:38 am IST
Read Time:
4 mins
Share
TwitterWhatsAppFacebookRedditEmail

Yemen Sanctions Kerala Nurse's Death Sentence For Murder, India Responds
New Delhi:
Yemen President Rashad al-Alimi has approved the death sentence for Indian nurse Nimisha Priya, who has been serving a prison sentence since 2017 for the murder of a Yemeni national. The sentence can be executed in a month's time, according to media reports.

Reacting to the development, the Ministry of External Affairs (MEA) on Monday said, India is aware of the sentencing of Nimisha Priya in Yemen.

"We understand that the family of Priya is exploring relevant options. The government is extending all possible help in the matter," MEA Spokesperson, Randhir Jaiswal, said in a statement.

The Yemeni President's decision came as a shock to the family back home that has been making efforts to save the 36-year-old from death row. Her mother Prema Kumari, 57, reached Sana'a, Yemen's capital, earlier this year and has since been reportedly staying there to secure a waiver of the death penalty and negotiate the blood money with the victim's family.

Nimisha Priya Case
Nimisha Priya was found guilty of killing Talal Abdo Mahdi, a Yemeni national, in 2017. A year later, she was sentenced to death by a trial court in Yemen. Since then, her family has been fighting for her release. They approached the Yemini Supreme Court against the trial court's order, but their appeal was rejected in 2023. Now, with the country's President also rejecting Priya's appeal, her release depended on securing forgiveness from the victim's family and their tribal leaders.

Her mother, Prema Kumari has been trying to negotiate the blood money with the victim's family, but talks with the victim's family had come to an abrupt halt in September after Abdullah Ameer, the lawyer appointed by the Indian Embassy, demanded a pre-negotiation fee of $20,000 (approximately Rs16.6 lakh), according to a report by Manorama online.

MEA has already provided $19,871 to Ameer in July, but he insisted on a total fee of $40,000, payable in two instalments before he would resume talks.

The Save Nimisha Priya International Action Council succeeded in raising the first instalment of Ameer's fee through crowdfunding. However, later they reportedly faced challenges in ensuring transparency to donors about how the funds were being used.

About Nimisha Priya
Nimisha Priya, a native of Palakkad, is a trained nurse who worked in private hospitals in Yemen for a few years. Her husband and minor daughter returned to India in 2014 because of financial reasons. The same year, Yemen was gripped by civil war, and they could not go back, as the country stopped issuing new visas.

Later in 2015, Priya sought Mahdi's support to set up her clinic in Sana'a, as under Yemen's law, only nationals are allowed to set up clinics and business firms.

Per her appeal plea in Yemani Supreme Court, in 2015, Mahdi accompanied Priya to Kerala when she came for a month-long holiday. During the visit, he stole her wedding photograph, which he later manipulated to claim that he was married to her.

Upon their return, when Priya started the clinic, Mahadi started cornering all the revenue. He also manipulated the ownership documents of the clinic. When Nimisha Priya questioned him about the embezzlement, he became hostile towards her.

He also began to take money out of her monthly earnings after telling everyone that Priya was his wife and even morphed their pictures to show they were married. In her plea, Priya alleged that soon the harassment turned into physical torture and Mahdi also seized her passport.

According to her plea, Priya even approached the police in Sanaa regarding the matter, but instead of taking action against Mahdi, the police arrested her and put her in jail for six days.

In July 2017, Priya approached the warden of a jail near her clinic, where Mahdi was previously jailed under various charges.

Comments
The warden suggested that she should try to sedate him, and then convince him to give her passport. However, sedation did not affect Mahdi, who was a substance abuser. She tried sedating him again, using a stronger sedative to retrieve her passport but he died within a few minutes due to a drug overdose.
"""

# Run summarization using the pipeline
summary = pipe(text, max_length=100, num_beams=4, early_stopping=True)  # Limiting output length
print("Summarized Text:", summary[0]['generated_text'])

# Alternatively, you can load the model and tokenizer manually
tokenizer = AutoTokenizer.from_pretrained("dheerajnarne/textsummarizer")
model = AutoModelForSeq2SeqLM.from_pretrained("dheerajnarne/textsummarizer")

# Tokenize the input text
inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

# Generate the summary with max_length control
summary_ids = model.generate(
    inputs['input_ids'],
    max_length=100,  # Limit the summary length
    num_beams=4,     # Beam search for better summary quality
    early_stopping=True
)

# Decode the summary
summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Summarized Text (Manual):", summary_text)


config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (1167 > 512). Running this sequence through the model will result in indexing errors


Summarized Text: Yemeni President Rashad al-Alimi has approved the death sentence for Indian nurse Nimisha Priya . The 36-year-old has been serving a prison sentence since 2017 for the murder of a Yemeni national . She has since been reportedly staying there to secure a waiver of the death penalty and negotiate the blood money with the victim's family .
Summarized Text (Manual): Yemen Sanctions Kerala Nurse's Death Sentence For Murder, India RespondsWith Yemen President Rashad al-Alimi also rejecting Nimisha Priya's appeal, her release depended on securing forgiveness from the victim's family and their tribal leaders and paying them blood money. She has been serving a prison sentence since 2017 for the murder of a Yemeni national.
