In [None]:
# Install gdown if needed
# !pip install gdown

# import gdown
# gdown.download_folder('https://drive.google.com/drive/folders/YOUR_FOLDER_ID')


In [None]:
# Install dependencies if needed
# !pip install transformers datasets evaluate rouge-score
# !pip install sumy
# !pip install streamlit gradio
# !pip install nltk
# !pip install --upgrade transformers

In [None]:
from datasets import load_dataset

# Load configuration "3.0.0" of the CNN/DailyMail dataset; later cells index
# its "train", "validation" and "test" splits.
dataset = load_dataset("cnn_dailymail", name="3.0.0")

# Show the dataset summary followed by the first training record.
print(dataset, dataset["train"][0], sep="\n")

In [None]:
RANDOM_SEED = 63

# Work on fixed-size random subsets so that training is achievable:
# 25,000 training and 2,000 validation examples, shuffled with the same seed.
small_train = (
    dataset["train"]
    .shuffle(seed=RANDOM_SEED)
    .select(range(25000))
)
small_val = (
    dataset["validation"]
    .shuffle(seed=RANDOM_SEED)
    .select(range(2000))
)

In [None]:
from transformers import pipeline

# Sanity check with the pretrained `facebook/bart-large-cnn` checkpoint before
# any fine-tuning is done.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

text = dataset["test"][0]["article"]
print("Original", text[:500], "...")
print("Reference: ", dataset["test"][0]["highlights"])

# truncation=True makes the pipeline cut the article down to the model's
# maximum input length instead of feeding an over-long sequence to the model.
summary = summarizer(
    text,
    max_length=130,
    min_length=30,
    do_sample=False,
    truncation=True,
)
print("Generated:", summary[0]['summary_text'])  # Sanity check


In [None]:
# If needed:
# !pip install sumy
import nltk
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.text_rank import TextRankSummarizer

# Presumably the tokenizer data that sumy's English Tokenizer relies on —
# TODO confirm against the installed nltk/sumy versions.
nltk.download('punkt_tab')

# Same article as the abstractive sanity check: first record of the test split.
text = dataset["test"][0]["article"]

# Parse the article and create a TextRank summarizer.
# NOTE(review): this rebinds `summarizer`, which the previous cell bound to the
# transformers pipeline, and no summary is actually generated in this cell.
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = TextRankSummarizer()

In [None]:
!pip install evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments
import evaluate
import numpy as np

checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Tokenization
def preprocess(batch):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        batch: Mapping with an "article" list (source texts) and a
            "highlights" list (target summaries), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer's encoding of the articles (truncated to 512 tokens)
        with an added "labels" entry holding the token ids of the highlights
        (truncated to 128 tokens).

    No padding is applied here on purpose. Padding at this stage would write
    pad-token ids into ``labels``, and those positions would then count
    towards the training loss, which only ignores the value -100. Padding is
    left to the ``DataCollatorForSeq2Seq`` used by the trainer, which pads
    each batch on the fly.
    """
    inputs = tokenizer(
        batch["article"],
        max_length=512,
        truncation=True,
    )

    # `text_target=` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=batch["highlights"],
        max_length=128,
        truncation=True,
    )
    inputs["labels"] = labels["input_ids"]
    return inputs

# Dataset.map - run `preprocess` over both subsets in batched mode and drop the
# original text/id columns so only the tokenized fields remain.
tokenized_train, tokenized_val = (
    subset.map(
        preprocess,
        batched=True,
        remove_columns=["article", "highlights", "id"],  # Remove original columns
    )
    for subset in (small_train, small_val)
)



In [None]:
#!pip install rouge_score
#!pip install evaluate
import os
import gc
import glob
import numpy as np
import torch
import evaluate
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)

# Might be useful for VRAM management. Set before any CUDA call in this cell.
# NOTE(review): presumably this only takes effect if the CUDA allocator has not
# been initialised yet in this kernel — confirm; restart the runtime if unsure.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Cleaning GPU (working on Colab): collect Python garbage first so that memory
# held by unreachable objects is freed, then release the cached CUDA blocks.
gc.collect()
torch.cuda.empty_cache()

# Checkpoints are written to Google Drive, so it has to be mounted first.
from google.colab import drive
drive.mount('/content/drive')

# ROUGE metric used by compute_metrics. The defensive handling in the functions
# below exists because decoding overflowed without it.
rouge = evaluate.load("rouge")


def _decode_token_ids(token_ids, what):
    """Decode an array of token ids to text, returning empty strings on failure.

    Args:
        token_ids: Array-like of token ids; may contain the -100 sentinel.
        what: Label used in the error message ("predictions" or "labels").

    Returns:
        A list with one decoded string per row of ``token_ids``.
    """
    token_ids = np.asarray(token_ids)

    # Replace the -100 sentinel with the pad token id BEFORE clipping; clipping
    # first would turn -100 into token id 0 and make this replacement a no-op.
    token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)

    # Clip to the valid token id range, then make sure the ids are int32.
    max_token_id = getattr(tokenizer, "vocab_size", 50001) - 1
    token_ids = np.clip(token_ids, 0, max_token_id).astype(np.int32)

    try:
        return tokenizer.batch_decode(token_ids, skip_special_tokens=True)
    except (OverflowError, ValueError) as e:
        print(f"Error decoding {what}: {e}")
        # Fallback: one empty string per row so the lengths still line up.
        return [""] * len(token_ids)


def compute_metrics(eval_pred):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be
            token ids, 3-D logits (reduced with argmax over the last axis),
            or a tuple whose first element is one of those.

    Returns:
        Dict with the keys "rouge1", "rouge2" and "rougeL". All three are 0.0
        when nothing could be decoded or when the ROUGE computation fails.
    """
    predictions, labels = eval_pred

    # Some models return a tuple of outputs; the predictions come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # Logits of shape (batch_size, seq_len, vocab_size) -> token ids.
    if predictions.ndim == 3:
        predictions = np.argmax(predictions, axis=-1)

    decoded_preds = _decode_token_ids(predictions, "predictions")
    decoded_labels = _decode_token_ids(labels, "labels")

    zero_scores = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}

    # Compute ROUGE only if we have valid decoded text on both sides.
    if not (any(decoded_preds) and any(decoded_labels)):
        print("No valid decoded text found, returning zero scores")
        return zero_scores

    try:
        result = rouge.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            use_stemmer=True
        )
    except Exception as e:
        # Deliberate best-effort: a metric failure must not abort training.
        print(f"Error computing ROUGE: {e}")
        return zero_scores

    # Return selected ROUGE scores
    return {key: result[key] for key in zero_scores}


# Data collator: batches the tokenized examples for the trainer (padding=True).
collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, padding=True
)


# Training arguments
# - Checkpoints go to Google Drive every 500 steps; at most 3 are kept.
# - Evaluation runs on the same 500-step schedule and generates summaries
#   (predict_with_generate=True) so that compute_metrics can score text.
# - 2 examples per device x 4 accumulation steps = 8 examples per optimizer step.
# - The best checkpoint by "rouge1" is reloaded when training ends.
args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/model_checkpoints",
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    eval_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=16,
    predict_with_generate=True,
    generation_num_beams=2,
    generation_max_length=256,
    num_train_epochs=2,
    learning_rate=3e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # Mixed precision only when a CUDA device is present; a hard-coded True
    # makes the arguments fail to construct on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    logging_strategy="steps",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    greater_is_better=True,
    report_to=["tensorboard"],
    logging_dir="logs",
)

# Trainer
# NOTE(review): newer transformers releases appear to prefer `processing_class=`
# over `tokenizer=` — confirm against the installed version before changing.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

# Resuming from checkpoint (in my case, 3600).
# Placeholder path: replace NUMBER with the step of the checkpoint to resume.
resume_from = "/content/checkpoint-NUMBER"

# Resume only if the checkpoint exists; otherwise train from the loaded weights.
if os.path.exists(resume_from):
    print(f"Using checkpoint: {resume_from}")
    trainer.train(resume_from_checkpoint=resume_from)
else:
    print("Checkpoint not found, starting fresh")
    trainer.train()


# Save the model and its tokenizer side by side so the directory can be
# reloaded with from_pretrained.
trainer.save_model("final_model")
tokenizer.save_pretrained("final_model")

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Directory of the checkpoint to load (placeholder path — edit before running).
checkpoint_path = "/content/your-checkpoint-path"

print("Loading model from checkpoint...")
# Rebinds the notebook-level `model` and `tokenizer` to the checkpoint's
# versions; the model is loaded first, then the tokenizer.
model, tokenizer = (
    loader.from_pretrained(checkpoint_path)
    for loader in (AutoModelForSeq2SeqLM, AutoTokenizer)
)

print("Model loaded successfully!")


In [None]:
# Test function for news summarization
def test_summarization(article_text, max_length=128):
    """Generate a summary of ``article_text`` with the notebook's model.

    Args:
        article_text: Text to summarize; truncated to 512 tokens.
        max_length: Maximum length of the generated sequence, in tokens.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Local import: this cell may be run after only the checkpoint-loading
    # cell, which does not import torch.
    import torch

    inputs = tokenizer(article_text, return_tensors="pt", truncation=True, max_length=512)
    # The input tensors must live on the same device as the model (it is on
    # the GPU after training, on the CPU after a plain from_pretrained).
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            num_beams=4,
            early_stopping=True,
            do_sample=False,
            length_penalty=1.0
        )

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

def _print_summary_report(banner, article):
    """Print the banner, article word count, generated summary and its word count.

    Returns the generated summary so that the caller can keep it.
    """
    print(banner)
    print("Original article length:", len(article.split()))
    print("\nGenerated summary:")
    generated = test_summarization(article)
    print(generated)
    print(f"\nSummary length: {len(generated.split())} words")
    return generated


# Test 1: Technology News
test_article_1 = """
Apple announced today that it will be releasing a major software update for its iPhone lineup next month. The iOS 18.2 update includes several new artificial intelligence features, improved battery management, and enhanced security protocols. The company's CEO stated that this update represents the most significant advancement in iPhone software in the past three years. Early beta testers have reported improvements in app performance and faster charging speeds. The update will be available for iPhone 12 and newer models, with older devices receiving a limited version of the features. Apple's stock price rose 3% following the announcement, as investors showed confidence in the company's continued innovation in the mobile technology sector.
"""

summary_1 = _print_summary_report("=== TEST 1: Technology News ===", test_article_1)

# Test 2: Politics/Economy News
test_article_2 = """
The Federal Reserve announced a 0.25% interest rate cut yesterday, marking the third reduction this year as officials attempt to stimulate economic growth amid concerns about global trade tensions. Fed Chair Jerome Powell explained that the decision was made to support continued expansion and maintain price stability. Economists had mixed reactions to the announcement, with some arguing that further cuts may be necessary while others warned about potential inflationary pressures. The stock market responded positively, with major indices gaining over 2% in after-hours trading. Small businesses are expected to benefit from lower borrowing costs, while savers may see reduced returns on deposits. The next Federal Open Market Committee meeting is scheduled for December, where additional rate changes will be considered based on economic data and employment figures.
"""

summary_2 = _print_summary_report("\n\n=== TEST 2: Politics/Economy News ===", test_article_2)

def clean_summary(text):
    """Normalize the spacing of a generated summary.

    Whitespace directly in front of one of ``. , ! ? ; :`` is removed, every
    remaining run of whitespace becomes a single space, and leading/trailing
    whitespace is stripped.
    """
    import re

    space_before_punct = re.compile(r'\s+([.,!?;:])')
    whitespace_run = re.compile(r'\s+')

    # Pull punctuation back onto the preceding word first, then collapse gaps.
    tightened = space_before_punct.sub(r'\1', text)
    return whitespace_run.sub(' ', tightened).strip()

# Clean up the spacing of the second generated summary (summary_2) with
# clean_summary and print the cleaned text.
clean_summary_2 = clean_summary(summary_2)
print("Cleaned:", clean_summary_2)
