In [45]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, DatasetDict
import pandas as pd
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
import numpy as np
from peft import PeftModel, PeftConfig


In [None]:
from datasets import Dataset


def load_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)
    
    df = df.dropna()
    df = df[df['abstract'].str.strip().astype(bool)]  
    
    train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
    
    return DatasetDict({
        'train': Dataset.from_pandas(train_df),
        'val': Dataset.from_pandas(val_df),
        'test': Dataset.from_pandas(test_df)
    })

In [None]:

model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:

def preprocess_function(examples):
    inputs = ["summarize: " + doc[:5000] for doc in examples["article"]] 
    targets = [abs[:1000] for abs in examples["abstract"]]  
    
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=256,
            truncation=True,
            padding="max_length"
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


dataset = load_and_preprocess_data("train.csv")

In [None]:

tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["article", "abstract"]
)


Map: 100%|██████████| 93785/93785 [03:58<00:00, 393.48 examples/s]
Map: 100%|██████████| 11723/11723 [00:29<00:00, 393.46 examples/s]
Map: 100%|██████████| 11724/11724 [00:29<00:00, 399.54 examples/s]


In [12]:
# 5. Configure LoRA
peft_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q", "v"],  # T5 attention matrices
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)


In [13]:
# 6. Create PEFT model
model = get_peft_model(model, peft_config)
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

Trainable parameters: 3538944


In [14]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-summarizer",
    evaluation_strategy="steps",
    max_steps=30000,
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    learning_rate=3e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    report_to="tensorboard",
    logging_steps=100,
    push_to_hub=False,
)



In [15]:
# 8. Create trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
)

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
# 9. Start training
trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,2.6645,2.402045
1000,2.4757,2.250269
1500,2.3886,2.187856
2000,2.3572,2.142895
2500,2.3211,2.113314
3000,2.2613,2.08597
3500,2.2342,2.070204
4000,2.2011,2.054226
4500,2.2081,2.042915
5000,2.1987,2.033538


TrainOutput(global_step=30000, training_loss=4.636019141133627, metrics={'train_runtime': 48321.2446, 'train_samples_per_second': 14.9, 'train_steps_per_second': 0.621, 'total_flos': 4.463032919402742e+17, 'train_loss': 4.636019141133627, 'epoch': 7.678395496129486})

In [20]:
# 13. Inference function
def generate_research_summary(article, model, tokenizer):
    inputs = tokenizer(
        "summarize: " + article[:5000],
        max_length=512,
        truncation=True,
        return_tensors="pt"
    ).to(model.device)
    
    outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=256,
            num_beams=4,
            early_stopping=True
        )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [46]:

def load_saved_model(model_path):
    base_model = T5ForConditionalGeneration.from_pretrained("t5-base")
    
    model = PeftModel.from_pretrained(base_model, model_path)
    
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    
    return model, tokenizer

def evaluate_summarization(model, tokenizer, dataset, max_samples=3):
    rouge = evaluate.load('rouge')
    bleu = evaluate.load('bleu')
    
    generated_summaries = []
    reference_summaries = []
    
    for example in dataset.select(range(max_samples)):
        inputs = tokenizer(
            "summarize: " + example["Abstract"][:5000],
            max_length=512,
            truncation=True,
            return_tensors="pt"
        ).to(model.device)
        
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=256,
            num_beams=4,
            early_stopping=True
        )

        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_summaries.append(generated)
        reference_summaries.append([example["Abstract"]])  # Wrap in list for BLEU

    results = {}
    
    rouge_scores = rouge.compute(
        predictions=generated_summaries,
        references=[ref[0] for ref in reference_summaries],
        use_stemmer=True
    )
    results.update(rouge_scores)

    bleu_scores = bleu.compute(
        predictions=generated_summaries,
        references=reference_summaries,
        max_order=4
    )
    results.update(bleu_scores)
    
    return results

In [None]:

model_path = "./checkpoint-8500"
model, tokenizer = load_saved_model(model_path)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
    
dataset = load_dataset('csv', data_files='Brain_Dead_CompScholar_Dataset.csv')['train']

test_results = evaluate_summarization(model, tokenizer, dataset, max_samples=300)

print("\nFinal Evaluation Results:")
print(f"ROUGE-1: {test_results['rouge1']:.4f}")
print(f"ROUGE-2: {test_results['rouge2']:.4f}")
print(f"ROUGE-L: {test_results['rougeL']:.4f}")
print(f"BLEU: {test_results['bleu']:.4f}")
print(f"BLEU-1: {test_results['precisions'][0]:.4f}")
print(f"BLEU-2: {test_results['precisions'][1]:.4f}")
print(f"BLEU-3: {test_results['precisions'][2]:.4f}")
print(f"BLEU-4: {test_results['precisions'][3]:.4f}")


Final Evaluation Results:
ROUGE-1: 0.6030
ROUGE-2: 0.5850
ROUGE-L: 0.5962
BLEU: 0.2361
BLEU-1: 0.9810
BLEU-2: 0.9495
BLEU-3: 0.9244
BLEU-4: 0.9016


In [44]:
sample_article = dataset[2]["Abstract"]
summary = generate_research_summary(sample_article, model, tokenizer)
print("\nGenerated Summary:", summary)


Generated Summary: Abstractive Text Summarization (ATS), which is the task of constructing summary sentences by merging facts from different source sentences and condensing them into a shorter representation while preserving information content and overall meaning. In this paper, we propose an LSTM-CNN based ATS framework (ATSDL) that can construct new sentences by exploring more fine-grained fragments than sentences, namely, semantic phrases. Experimental results on the datasets CNN and DailyMail show that our ATSDL framework outperforms the state-the-art models in terms of both semantics and syntactic structure, and achieves competitive results on manual linguistic quality evaluation.
