Step 1: Install & Imports

In [None]:
!pip install -q sentence-transformers datasets rouge-score nltk scikit-learn huggingface_hub

import os
import random

import nltk
import numpy as np
import torch
from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, InputExample, losses, models, util
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Fetch the tokenizer data used by `sent_tokenize`.
nltk.download('punkt')
# 'punkt_tab' is sometimes needed as well (e.g. in Colab). Keep it best-effort,
# but report a failure instead of silently discarding it, and do not trap
# KeyboardInterrupt/SystemExit the way a bare `except:` would.
try:
    nltk.download('punkt_tab')
except Exception as exc:
    print(f"Optional NLTK resource 'punkt_tab' could not be downloaded: {exc}")



  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Step 2: Advanced Data Prep (Train/Test Split)

In [None]:
# 1. Load Data
# Draw a fixed-size random subset of the CNN/DailyMail train split and hold
# part of it out for evaluation: 15,000 articles -> 12,750 train / 2,250 test.
NUM_ARTICLES = 15000
TEST_FRACTION = 0.15
SPLIT_SEED = 42

print("Loading dataset...")
ds = load_dataset("cnn_dailymail", "3.0.0", split="train")
ds = ds.shuffle(seed=SPLIT_SEED).select(range(NUM_ARTICLES))

# 2. Split into Train & Test (85% Train, 15% Test)
# Materialise the dataset as a list of dicts for easier splitting.
data_list = list(ds)
train_data, test_data = train_test_split(
    data_list, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)
# Every sampled article must land in exactly one of the two partitions.
assert len(train_data) + len(test_data) == NUM_ARTICLES

print(f"Training Samples: {len(train_data)}")
print(f"Testing Samples: {len(test_data)}")

# 3. Helper: Triplet Generator
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def build_triplets(data_subset, max_sentences=50, pos_threshold=0.20,
                   neg_threshold=0.15, seed=42):
    """
    Build (anchor, positive, negative) training triplets from articles.

    For each item the reference summary is the anchor. Every one of the first
    `max_sentences` article sentences is scored against the summary with
    ROUGE-L F1 (module-level `scorer`). The best-scoring sentence becomes the
    positive; the negative is drawn at random from the remaining sentences
    whose score is below `neg_threshold`.

    Args:
        data_subset: iterable of dicts with "article" and "highlights" keys.
        max_sentences: how many leading sentences of each article to score.
        pos_threshold: the best sentence must score strictly above this.
        neg_threshold: negative candidates must score strictly below this.
        seed: seed for the private RNG that picks the negative, so repeated
            runs yield the same triplets. Pass None for OS-seeded randomness.

    Returns:
        list of InputExample with texts=[summary, positive, negative].

    Raises:
        ValueError: if `max_sentences` is smaller than 1.
    """
    if max_sentences < 1:
        raise ValueError("max_sentences must be at least 1")

    # A private generator keeps negative sampling reproducible and independent
    # of whatever else has consumed the global `random` state.
    rng = random.Random(seed)
    examples = []
    skipped = 0

    for item in tqdm(data_subset, desc="Building Triplets"):
        article = item["article"]
        summary = item["highlights"]  # Anchor

        sentences = sent_tokenize(article)
        # Articles with fewer than 5 sentences are not used.
        if len(sentences) < 5:
            skipped += 1
            continue

        # Only the leading sentences are considered (Lead Bias).
        candidates = sentences[:max_sentences]

        # Whitespace-only sentences get a score of 0 without calling the scorer.
        scores = [
            scorer.score(summary, sent)["rougeL"].fmeasure if sent.strip() else 0.0
            for sent in candidates
        ]

        # Positive: the best-matching sentence.
        best_idx = int(np.argmax(scores))
        best_score = scores[best_idx]
        positive_sent = candidates[best_idx]

        # Negative pool: clearly unrelated sentences, never the positive itself.
        neg_candidates = [
            sent for j, (sent, s) in enumerate(zip(candidates, scores))
            if s < neg_threshold and j != best_idx
        ]

        # Keep the triplet only if the positive is good enough and a negative exists.
        if best_score > pos_threshold and neg_candidates:
            negative_sent = rng.choice(neg_candidates)
            examples.append(InputExample(texts=[summary, positive_sent, negative_sent]))
        else:
            skipped += 1

    print(f"Skipped {skipped} poor samples.")
    return examples

# 4. Generate Training Triplets ONLY
# Only `train_data` is passed in, so held-out articles contribute no triplets.
train_examples = build_triplets(train_data)
print(f"Final Training Triplets: {len(train_examples)}")

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

3.0.0/validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

3.0.0/test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Training Samples: 12750
Testing Samples: 2250


Building Triplets:   0%|          | 0/12750 [00:00<?, ?it/s]

Skipped 747 poor samples.
Final Training Triplets: 12003


Step 3: Train (SBERT on Triplets with MultipleNegativesRankingLoss)

In [None]:
# 0. Keep the run non-interactive
# When Weights & Biases is installed, training otherwise stops at an
# interactive login prompt, which breaks "Restart & Run All". Default to
# disabled; a WANDB_MODE value set before this cell still takes precedence.
os.environ.setdefault("WANDB_MODE", "disabled")

# 1. Load Base Model (MPNet - Best balance of speed/performance)
model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name)

# 2. DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# 3. Loss Function
# NOTE: this is MultipleNegativesRankingLoss, not losses.TripletLoss. It is fed
# the (anchor, positive, negative) triplets built earlier in the notebook.
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# 4. Train
num_epochs = 1
# Warm up the learning rate over the first 10% of all training steps.
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)
output_path = "/content/SBERT_Summarization_v2"

print("Starting Training...")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=output_path,
    show_progress_bar=True
)
print("Training Complete.")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Starting Training...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Step,Training Loss
500,0.0933


Training Complete.


Step 4: Evaluate Performance (K-Means)



In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

# Load the just-trained model
# It is read back from `output_path` (the directory handed to `model.fit`)
# rather than reusing the in-memory `model`, so the evaluation below runs
# against what was saved there.
eval_model = SentenceTransformer(output_path)

def generate_summary(article_text, model, num_sentences=3):
    """
    Generates an extractive summary using K-Means Clustering + SBERT.

    The article is split into sentences, each sentence is embedded with
    `model`, the embeddings are grouped into `num_sentences` clusters, and the
    sentence nearest to each cluster centre is kept. Kept sentences are
    returned in their original article order, joined by single spaces.

    Args:
        article_text: raw article text.
        model: object exposing `encode(list_of_sentences)`.
        num_sentences: number of clusters, i.e. the maximum summary length.

    Returns:
        The summary string. Articles with at most `num_sentences` sentences are
        returned whole. The result can hold fewer than `num_sentences`
        sentences when several cluster centres share one nearest sentence.
    """
    sentences = sent_tokenize(article_text)
    if len(sentences) <= num_sentences:
        return " ".join(sentences)

    # Encode
    embeddings = model.encode(sentences)

    # Clustering (fixed random_state keeps the summary reproducible)
    kmeans = KMeans(n_clusters=num_sentences, n_init=10, random_state=42)
    kmeans.fit(embeddings)

    # Index of the sentence closest to each cluster centre.
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, embeddings)

    # Two centres can resolve to the same sentence (e.g. when the article
    # repeats a sentence); de-duplicate so it is not emitted twice, then
    # restore article order.
    ordered = sorted({int(idx) for idx in closest})

    return " ".join(sentences[idx] for idx in ordered)

# Run Evaluation on 100 test samples (Full test set takes too long for quick check)
# The slice below takes the FIRST 100 items of `test_data`; the source
# articles were shuffled (`ds.shuffle`) before the split was made.
print("Running Evaluation on Test Set...")
# NOTE: 'rouge2' is computed by this scorer but is not collected or reported below.
rouge_scorer_eval = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Per-article F1 values, averaged once the loop is done.
rouge1_scores = []
rougeL_scores = []

# Evaluate on first 100 test samples
for item in tqdm(test_data[:100]):
    article = item["article"]
    reference = item["highlights"]

    # Generate Summary (K-Means extractive, 3 sentences)
    generated = generate_summary(article, eval_model, num_sentences=3)

    # Calculate Score: reference highlights vs. generated summary
    scores = rouge_scorer_eval.score(reference, generated)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

print("\n--- FINAL RESULTS ---")
print(f"Avg ROUGE-1 F1: {np.mean(rouge1_scores):.4f}")
print(f"Avg ROUGE-L F1: {np.mean(rougeL_scores):.4f}")
print("(Typical Extractive Scores: ROUGE-1 ~0.30-0.40 is considered Good)")

Running Evaluation on Test Set...


  0%|          | 0/100 [00:00<?, ?it/s]


--- FINAL RESULTS ---
Avg ROUGE-1 F1: 0.3129
Avg ROUGE-L F1: 0.2044
(Typical Extractive Scores: ROUGE-1 ~0.30-0.40 is considered Good)


Step 4 (continued): Evaluate Performance (MMR)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

def generate_summary_mmr(article_text, model, num_sentences=3, lambda_param=0.5):
    """
    Generates an extractive summary using MMR + SBERT embeddings.

    Sentences are picked greedily. At each step the unselected sentence with
    the highest score

        lambda_param * relevance - (1 - lambda_param) * redundancy

    is added, where relevance is the cosine similarity to the mean of all
    sentence embeddings (the "article vector") and redundancy is the highest
    cosine similarity to any sentence already picked (0 for the first pick).
    Ties go to the sentence that appears earliest in the article.

    Args:
        article_text: raw article text.
        model: object exposing `encode(list_of_sentences)`.
        num_sentences: number of sentences to select.
        lambda_param: 1.0 (pure relevance), 0.0 (pure diversity). 0.5 is balanced.

    Returns:
        The selected sentences in original article order, joined by single
        spaces. Articles with at most `num_sentences` sentences are returned whole.
    """
    sentences = sent_tokenize(article_text)
    if len(sentences) <= num_sentences:
        return " ".join(sentences)

    # 1. Encode sentences and the "article context"
    # The mean of all sentence embeddings acts as the article vector.
    sentence_embeddings = np.asarray(model.encode(sentences))
    article_embedding = np.mean(sentence_embeddings, axis=0).reshape(1, -1)

    # Both similarity tables are computed once up front, instead of one
    # cosine_similarity call per (candidate, selected) pair inside the loop.
    relevance = cosine_similarity(sentence_embeddings, article_embedding).flatten()
    pairwise_sims = cosine_similarity(sentence_embeddings)

    # 2. Iteratively select sentences using MMR
    selected_indices = []
    already_selected = np.zeros(len(sentences), dtype=bool)
    # Running "max similarity to anything selected so far" for every sentence.
    redundancy = np.zeros(len(sentences))

    for _ in range(num_sentences):
        mmr_scores = lambda_param * relevance - (1 - lambda_param) * redundancy
        # Sentences that are already in the summary cannot be picked again.
        mmr_scores[already_selected] = -np.inf
        # argmax returns the first maximum, i.e. the earliest sentence on ties.
        best_idx = int(np.argmax(mmr_scores))

        if selected_indices:
            redundancy = np.maximum(redundancy, pairwise_sims[best_idx])
        else:
            # After the first pick the initial zeros must be replaced, not
            # max-ed with, because cosine similarities may be negative.
            redundancy = pairwise_sims[best_idx].astype(float)

        selected_indices.append(best_idx)
        already_selected[best_idx] = True

    # 3. Sort by original appearance order for readability
    selected_indices.sort()
    return " ".join([sentences[idx] for idx in selected_indices])

# --- EVALUATION BLOCK (same procedure as the K-Means evaluation, but calling MMR) ---
print("Running MMR Evaluation on Test Set...")
# NOTE: 'rouge2' is computed by this scorer but is not collected or reported below.
rouge_scorer_eval = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Per-article F1 values, averaged once the loop is done.
mmr_rouge1 = []
mmr_rougeL = []

# Same first-100 slice of `test_data` as the K-Means evaluation uses.
for item in tqdm(test_data[:100]):
    article = item["article"]
    reference = item["highlights"]

    # Generate Summary using MMR (Balanced lambda 0.5)
    generated = generate_summary_mmr(article, eval_model, num_sentences=3, lambda_param=0.5)

    # Calculate Score: reference highlights vs. generated summary
    scores = rouge_scorer_eval.score(reference, generated)
    mmr_rouge1.append(scores['rouge1'].fmeasure)
    mmr_rougeL.append(scores['rougeL'].fmeasure)

print("\n--- MMR FINAL RESULTS ---")
print(f"Avg ROUGE-1 F1: {np.mean(mmr_rouge1):.4f}")
print(f"Avg ROUGE-L F1: {np.mean(mmr_rougeL):.4f}")

Running MMR Evaluation on Test Set...


100%|██████████| 100/100 [00:20<00:00,  4.97it/s]


--- MMR FINAL RESULTS ---
Avg ROUGE-1 F1: 0.2954
Avg ROUGE-L F1: 0.1857





Step 5: HuggingFace Login

In [None]:
# Login to Hugging Face to save the model later
# This is an interactive step (it renders a login widget in the notebook).
# NOTE(review): the upload step presumably needs a token with write access — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Step 6: Save & Upload

In [None]:
# 1. Your existing repo ID
repo_id = "DarkKnight001/SBERT-Summarization-v2"

# 2. Upload using push_to_hub
# exist_ok=True is passed so that pushing to the already-existing repo does
# not fail with a 409 Conflict error.
# NOTE(review): the commit message says "13k" triplets, while the recorded
# training run built 12,003 — consider updating the message.
eval_model.push_to_hub(
    repo_id,
    commit_message="Updated model trained on 13k CNN/DailyMail triplets",
    exist_ok=True  # reuse the existing repo instead of raising a conflict
)

print(f"Model successfully updated in your existing repo: https://huggingface.co/{repo_id}")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...bk3ui4a/model.safetensors:   0%|          |  558kB /  438MB            

Model successfully updated in your existing repo: https://huggingface.co/DarkKnight001/SBERT-Summarization-v2


In [None]:
# --- FINAL INFERENCE TEST ---

def test_model_on_news(news_text):
    """
    Print a side-by-side qualitative check of both summarizers on one article.

    Shows the first 300 characters of `news_text`, then the 3-sentence K-Means
    summary and the 3-sentence MMR summary (lambda 0.5), both produced with the
    module-level `eval_model`. Nothing is returned.
    """
    divider = "="*50
    preview = news_text[:300] + "..."

    # Header with a short preview of the input.
    print("\n" + divider)
    print("ORIGINAL ARTICLE (First 300 chars):")
    print(preview)
    print(divider)

    # K-Means picks one sentence per cluster (thematic diversity).
    kmeans_summary = generate_summary(news_text, eval_model, num_sentences=3)
    # MMR trades relevance against redundancy.
    mmr_summary = generate_summary_mmr(news_text, eval_model, num_sentences=3, lambda_param=0.5)

    sections = (
        ("\n[K-MEANS SUMMARY]:", kmeans_summary),
        ("\n[MMR SUMMARY]:", mmr_summary),
    )
    for heading, summary_text in sections:
        print(heading)
        print(summary_text)
    print(divider)

# Sample News Article for Testing
# The triple-quoted literal opens and closes with a line break, so the text
# handed to the summarizers starts and ends with "\n".
sample_news = """
In the early hours of January 2, 2026, record-breaking monsoon rains triggered catastrophic flooding across Jakarta, displacing over 200,000 residents within twelve hours. The National Disaster Mitigation Agency (BNPB) reported that water levels in the Ciliwung River reached an unprecedented 950cm, breaching several key levees. Emergency services are struggling to reach isolated districts in North Jakarta, where power outages and communication failures have hampered rescue efforts. President Prabowo Subianto has declared a state of emergency, authorizing the immediate release of 500 billion Rupiah in relief funds. While urban planners warn that this event highlights the city's sinking infrastructure, the immediate focus remains on the "Golden Hour" of search and rescue as rain is forecasted to continue through the weekend.
"""

# Qualitative smoke test: prints both summaries for the sample article.
test_model_on_news(sample_news)


ORIGINAL ARTICLE (First 300 chars):

In the early hours of January 2, 2026, record-breaking monsoon rains triggered catastrophic flooding across Jakarta, displacing over 200,000 residents within twelve hours. The National Disaster Mitigation Agency (BNPB) reported that water levels in the Ciliwung River reached an unprecedented 950cm,...

[K-MEANS SUMMARY]:
The National Disaster Mitigation Agency (BNPB) reported that water levels in the Ciliwung River reached an unprecedented 950cm, breaching several key levees. President Prabowo Subianto has declared a state of emergency, authorizing the immediate release of 500 billion Rupiah in relief funds. While urban planners warn that this event highlights the city's sinking infrastructure, the immediate focus remains on the "Golden Hour" of search and rescue as rain is forecasted to continue through the weekend.

[MMR SUMMARY]:

In the early hours of January 2, 2026, record-breaking monsoon rains triggered catastrophic flooding across Jakarta,