In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cnnfiltered2/cnn_filtered2.csv


In [2]:
# Install required libraries
!pip install transformers torch evaluate rouge-score pandas numpy tqdm nlpaug sacremoses nltk

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (se

In [3]:
# Imports
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm import tqdm
from evaluate import load
from sklearn.model_selection import train_test_split
import re
import os
import shutil

In [4]:
# 1. Load Model & Tokenizer
checkpoint_dir = "/kaggle/working/best_bartlarge_model"
tokenizer_dir = "/kaggle/working/best_bartlarge_tok"
if os.path.exists(checkpoint_dir) and os.path.exists(tokenizer_dir):
    print("Loading from checkpoint (e.g., ROUGE-L = 0.305)...")
    model = BartForConditionalGeneration.from_pretrained(checkpoint_dir)
    tokenizer = BartTokenizer.from_pretrained(tokenizer_dir)
else:
    print("Starting from pretrained model...")
    model_name = "facebook/bart-large-cnn"
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)
model.config.dropout = 0.3



Starting from pretrained model...


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
# 2. Preprocessing & Augmentation
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [6]:
# 3. Dataset Class
class NewsSummaryDataset(Dataset):
    def __init__(self, data, tokenizer, max_input=512, max_output=160):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input = max_input
        self.max_output = max_output

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        article = preprocess_text(self.data.iloc[idx]["article"])
        summary = preprocess_text(self.data.iloc[idx]["highlights"])

        inputs = self.tokenizer(
            article,
            max_length=self.max_input,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        targets = self.tokenizer(
            summary,
            max_length=self.max_output,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": targets["input_ids"].squeeze(0)
        }

In [7]:
# 4. Load Pure CNN Dataset (35K)
df = pd.read_csv("/kaggle/input/cnnfiltered2/cnn_filtered2.csv")
df = df.dropna(subset=["article", "highlights"])

if len(df) < 35000:
    print(f"Warning: Dataset has {len(df)} samples, duplicating to 35,000...")
    df = pd.concat([df] * ((35000 // len(df)) + 1)).iloc[:35000]
else:
    df = df.sample(n=35000, random_state=42)

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)  # 28K train
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)  # 3.5K val/test
print(f"Dataset size: {len(df)} samples")
print(f"Train dataset size: {len(train_df)}")
print(f"Val dataset size: {len(val_df)}")
print(f"Test dataset size: {len(test_df)}")

Dataset size: 35000 samples
Train dataset size: 28000
Val dataset size: 3500
Test dataset size: 3500


In [8]:
train_dataset = NewsSummaryDataset(train_df, tokenizer)
val_dataset = NewsSummaryDataset(val_df, tokenizer)
test_dataset = NewsSummaryDataset(test_df, tokenizer)

In [9]:
# 5. DataLoader
BATCH_SIZE = 4
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [10]:
# 6. Training Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
NUM_EPOCHS = 4
TOTAL_STEPS = len(train_loader) * NUM_EPOCHS  # ~35,000 steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=TOTAL_STEPS)
scaler = torch.cuda.amp.GradScaler()
accumulation_steps = 4

  scaler = torch.cuda.amp.GradScaler()


In [11]:
# Load ROUGE metric
rouge = load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [12]:
# 7. Training Loop
best_rougeL = 0.0
patience = 2

for epoch in range(NUM_EPOCHS):
    model.train()
    total_train_loss = 0
    for step, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1} Train")):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(**batch)
            loss = outputs.loss / accumulation_steps
        scaler.scale(loss).backward()
        if (step + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()
        total_train_loss += loss.item() * accumulation_steps

    model.eval()
    total_val_loss = 0
    val_summaries = []
    val_references = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} Val"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            total_val_loss += outputs.loss.item()
            preds = model.generate(
                batch["input_ids"],
                max_length=160,  # ~110 words
                min_length=90,  # ~70 words
                length_penalty=1.2,
                num_beams=8,
                early_stopping=True
            )
            summaries = tokenizer.batch_decode(preds, skip_special_tokens=True)
            references = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)
            val_summaries.extend(summaries)
            val_references.extend(references)

    avg_train_loss = total_train_loss / len(train_loader)
    avg_val_loss = total_val_loss / len(val_loader)
    rouge_scores = rouge.compute(predictions=val_summaries, references=val_references)
    print(f"Epoch {epoch+1}:")
    print(f"  Train Loss: {avg_train_loss:.3f}")
    print(f"  Val Loss: {avg_val_loss:.3f}")
    print(f"  ROUGE-1: {rouge_scores['rouge1']:.3f}, ROUGE-2: {rouge_scores['rouge2']:.3f}, ROUGE-L: {rouge_scores['rougeL']:.3f}")

    if rouge_scores['rougeL'] > best_rougeL:
        best_rougeL = rouge_scores['rougeL']
        model.save_pretrained("/kaggle/working/best_bartlarge_model")
        tokenizer.save_pretrained("/kaggle/working/best_bartlarge_tok")
        print(f"Best model saved with ROUGE-L: {best_rougeL:.3f}")
    elif epoch > 1 and rouge_scores['rougeL'] < best_rougeL - 0.01:
        print(f"Early stopping at epoch {epoch+1}")
        break

Epoch 1 Train: 100%|██████████| 7000/7000 [1:15:13<00:00,  1.55it/s]
Epoch 1 Val: 100%|██████████| 875/875 [1:21:30<00:00,  5.59s/it]


Epoch 1:
  Train Loss: 1.285
  Val Loss: 0.973
  ROUGE-1: 0.469, ROUGE-2: 0.227, ROUGE-L: 0.309




Best model saved with ROUGE-L: 0.309


Epoch 2 Train: 100%|██████████| 7000/7000 [1:15:03<00:00,  1.55it/s]
Epoch 2 Val: 100%|██████████| 875/875 [1:18:53<00:00,  5.41s/it]


Epoch 2:
  Train Loss: 0.927
  Val Loss: 0.968
  ROUGE-1: 0.469, ROUGE-2: 0.228, ROUGE-L: 0.310
Best model saved with ROUGE-L: 0.310


Epoch 3 Train: 100%|██████████| 7000/7000 [1:15:09<00:00,  1.55it/s]
Epoch 3 Val: 100%|██████████| 875/875 [1:14:49<00:00,  5.13s/it]


Epoch 3:
  Train Loss: 0.827
  Val Loss: 0.978
  ROUGE-1: 0.470, ROUGE-2: 0.228, ROUGE-L: 0.311
Best model saved with ROUGE-L: 0.311


Epoch 4 Train: 100%|██████████| 7000/7000 [1:15:14<00:00,  1.55it/s]
Epoch 4 Val: 100%|██████████| 875/875 [1:19:42<00:00,  5.47s/it]


Epoch 4:
  Train Loss: 0.740
  Val Loss: 1.000
  ROUGE-1: 0.474, ROUGE-2: 0.231, ROUGE-L: 0.312
Best model saved with ROUGE-L: 0.312


In [13]:
# 8. Test Set Evaluation
model.eval()
test_summaries = []
test_references = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Test Evaluation"):
        batch = {k: v.to(device) for k, v in batch.items()}
        preds = model.generate(
            batch["input_ids"],
            max_length=160,
            min_length=90,
            length_penalty=1.2,
            num_beams=8,
            early_stopping=True
        )
        summaries = tokenizer.batch_decode(preds, skip_special_tokens=True)
        references = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)
        test_summaries.extend(summaries)
        test_references.extend(references)

test_rouge = rouge.compute(predictions=test_summaries, references=test_references)
print("Test Set Results:")
print(f"  ROUGE-1: {test_rouge['rouge1']:.3f}, ROUGE-2: {test_rouge['rouge2']:.3f}, ROUGE-L: {test_rouge['rougeL']:.3f}")

Test Evaluation: 100%|██████████| 875/875 [1:15:42<00:00,  5.19s/it]


Test Set Results:
  ROUGE-1: 0.473, ROUGE-2: 0.230, ROUGE-L: 0.310


In [14]:
# 9. Final Save and Zip
model.save_pretrained("/kaggle/working/final_bartlarge_model")
tokenizer.save_pretrained("/kaggle/working/final_bartlarge_tok")
shutil.make_archive('/kaggle/working/best_bartlarge_model', 'zip', '/kaggle/working/best_bartlarge_model')
shutil.make_archive('/kaggle/working/best_bartlarge_tok', 'zip', '/kaggle/working/best_bartlarge_tok')
print("Model and tokenizer zipped successfully.")

Model and tokenizer zipped successfully.
