In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [2]:
!kaggle datasets download -d gowrishankarp/newspaper-text-summarization-cnn-dailymail


Dataset URL: https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail
License(s): CC0-1.0
Downloading newspaper-text-summarization-cnn-dailymail.zip to /content
 76% 383M/503M [00:02<00:01, 115MB/s]
100% 503M/503M [00:02<00:00, 200MB/s]


In [3]:
import zipfile
import os

zip_path = "newspaper-text-summarization-cnn-dailymail.zip"
extract_path = "/content/cnn_dailymail_data"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

os.listdir(extract_path)


['cnn_dailymail']

In [4]:
import pandas as pd
import os

base_path = "/content/cnn_dailymail_data/cnn_dailymail"

train_df = pd.read_csv(os.path.join(base_path, "train.csv"), engine='python', on_bad_lines='skip')
val_df   = pd.read_csv(os.path.join(base_path, "validation.csv"), engine='python', on_bad_lines='skip')
test_df  = pd.read_csv(os.path.join(base_path, "test.csv"), engine='python', on_bad_lines='skip')

print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)

train_df.head()


Train shape: (287113, 3)
Validation shape: (13368, 3)
Test shape: (11490, 3)


Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [6]:
!pip install transformers datasets torch evaluate

import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration
from tqdm import tqdm


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [7]:
base_path = "/content/cnn_dailymail_data/cnn_dailymail"

train_df = pd.read_csv(f"{base_path}/train.csv", engine='python', on_bad_lines='skip')
val_df   = pd.read_csv(f"{base_path}/validation.csv", engine='python', on_bad_lines='skip')
test_df  = pd.read_csv(f"{base_path}/test.csv", engine='python', on_bad_lines='skip')

print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)


Train shape: (287113, 3)
Validation shape: (13368, 3)
Test shape: (11490, 3)


In [8]:
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [9]:
max_input_length = 512
max_output_length = 128

def encode(batch):
    inputs = tokenizer(batch['article'], truncation=True, padding="max_length", max_length=max_input_length, return_tensors="pt")
    targets = tokenizer(batch['highlights'], truncation=True, padding="max_length", max_length=max_output_length, return_tensors="pt")
    return {'input_ids': inputs.input_ids.squeeze(), 'attention_mask': inputs.attention_mask.squeeze(), 'labels': targets.input_ids.squeeze()}

subset_size = 1000
train_data = [encode(row) for i, row in train_df.head(subset_size).iterrows()]
val_data   = [encode(row) for i, row in val_df.head(200).iterrows()]

train_loader = DataLoader(train_data, batch_size=4, shuffle=True)
val_loader   = DataLoader(val_data, batch_size=4)


In [10]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
model.train()

for epoch in range(3):
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=loss.item())


Epoch 1: 100%|██████████| 250/250 [01:32<00:00,  2.71it/s, loss=1.26]
Epoch 2: 100%|██████████| 250/250 [01:31<00:00,  2.73it/s, loss=0.897]
Epoch 3: 100%|██████████| 250/250 [01:31<00:00,  2.74it/s, loss=1.09]


In [11]:
model.eval()
sample_articles = test_df['article'].head(3).tolist()

for article in sample_articles:
    inputs = tokenizer(article, return_tensors="pt", truncation=True, max_length=max_input_length).to(device)
    summary_ids = model.generate(inputs['input_ids'], max_length=128, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print("\nArticle:\n", article[:500], "...\n")
    print("Generated Summary:\n", summary)



Article:
 Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by t ...

Generated Summary:
 Increasing space on planes is not only uncomfortable - it's putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans .
In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer represe

In [12]:
!pip install rouge-score
from rouge_score import rouge_scorer


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=167a5010b4c73e64b44c9a1d5cd1be58ff83f4b865e1246c4fed453a5956640f
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [13]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

model.eval()
all_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

for batch in tqdm(val_loader, desc="Evaluating"):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)


    summaries_ids = model.generate(input_ids, max_length=80, num_beams=4, early_stopping=True)
    preds = [tokenizer.decode(g, skip_special_tokens=True) for g in summaries_ids]
    refs = [tokenizer.decode(l, skip_special_tokens=True) for l in labels]

    for pred, ref in zip(preds, refs):
        scores = scorer.score(ref, pred)
        all_scores['rouge1'].append(scores['rouge1'].fmeasure)
        all_scores['rouge2'].append(scores['rouge2'].fmeasure)
        all_scores['rougeL'].append(scores['rougeL'].fmeasure)

# Compute average ROUGE scores
avg_rouge1 = sum(all_scores['rouge1']) / len(all_scores['rouge1'])
avg_rouge2 = sum(all_scores['rouge2']) / len(all_scores['rouge2'])
avg_rougeL = sum(all_scores['rougeL']) / len(all_scores['rougeL'])

print(f"ROUGE-1: {avg_rouge1:.4f}")
print(f"ROUGE-2: {avg_rouge2:.4f}")
print(f"ROUGE-L: {avg_rougeL:.4f}")


Evaluating: 100%|██████████| 50/50 [01:26<00:00,  1.73s/it]

ROUGE-1: 0.4095
ROUGE-2: 0.1856
ROUGE-L: 0.2826





In [14]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

examples = val_df[['article', 'highlights']].head(5)

for i, row in examples.iterrows():
    article = row['article']
    reference = row['highlights']

    inputs = tokenizer(article, return_tensors="pt", max_length=1024, truncation=True).to(device)
    summary_ids = model.generate(
        inputs.input_ids,
        num_beams=4,
        max_length=100,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    print(f"\n--- Example {i+1} ---")
    print("Original Article:\n", article[:500], "...")  # show first 500 chars
    print("\nReference Summary:\n", reference)
    print("\nGenerated Summary:\n", summary)



--- Example 1 ---
Original Article:
 Sally Forrest, an actress-dancer who graced the silver screen throughout the '40s and '50s in MGM musicals and films such as the 1956 noir While the City Sleeps died on March 15 at her home in Beverly Hills, California. Forrest, whose birth name was Katherine Feeney, was 86 and had long battled cancer. Her publicist, Judith Goffin, announced the news Thursday. Scroll down for video . Actress: Sally Forrest was in the 1951 Ida Lupino-directed film 'Hard, Fast and Beautiful' (left) and the 1956 Fr ...

Reference Summary:
 Sally Forrest, an actress-dancer who graced the silver screen throughout the '40s and '50s in MGM musicals and films died on March 15 .
Forrest, whose birth name was Katherine Feeney, had long battled cancer .
A San Diego native, Forrest became a protege of Hollywood trailblazer Ida Lupino, who cast her in starring roles in films .

Generated Summary:
 Sally Forrest, an actress-dancer who graced the silver screen throughout the '40s