In [None]:
# Install necessary libraries (run only once)
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
import pandas as pd
from datasets import Dataset

# Load the dataset
file_path = "/content/combined_sorted_all_final7.csv"
df = pd.read_csv(file_path)

# Combine title and content for summarization.
# Missing cells are replaced with empty strings first: in pandas, NaN + str
# evaluates to NaN, so a single missing title or content would otherwise turn
# the whole 'text' value for that row into NaN.
title_text = df['title'].fillna("").astype(str)
content_text = df['content'].fillna("").astype(str)
df['text'] = title_text + "\n\n" + content_text

# Optional: Convert to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(df[['text']])

print(f"Dataset loaded and preprocessed. Total rows: {len(df)}")

Dataset loaded and preprocessed. Total rows: 752


In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
import torch

# Checkpoint shared by the tokenizer and the model
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)

# Prefer the GPU when CUDA is available; otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the weights and place them on the selected device in one step
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

print("Model and tokenizer loaded.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Model and tokenizer loaded.


In [None]:
def _prompt_field(value):
    """Return ``value`` as prompt text, mapping missing values (None / float NaN) to ''."""
    # NaN is the only float that compares unequal to itself. Formatting it
    # directly would put the literal text "nan" into the prompt.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def summarize_batch_with_prompt(texts, max_length=150, max_input_length=1024, num_beams=4):
    """
    Summarizes a batch of texts using prompt engineering.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Iterable of dictionaries, each with 'title' and 'content' keys.
        max_length: Maximum length of each generated summary, forwarded to
            ``model.generate``.
        max_input_length: Token limit applied to each prompt by the tokenizer;
            longer prompts are truncated.
        num_beams: Beam width forwarded to ``model.generate``.

    Returns:
        A list with one decoded summary string per input, in input order.
        An empty input returns an empty list without calling the tokenizer
        or the model.

    Raises:
        KeyError: If an input dictionary lacks 'title' or 'content'.
    """
    texts = list(texts)
    if not texts:
        # Nothing to summarize; avoid handing an empty batch to the tokenizer.
        return []

    # NOTE(review): the instruction sentence sits at the end of the prompt, so
    # for articles longer than max_input_length it is presumably the first
    # thing lost to truncation - confirm the tokenizer's truncation side.
    prompts = [
        f"Title: {_prompt_field(text['title'])}\n\n"
        f"Content: {_prompt_field(text['content'])}\n\n"
        "Summarize the above article."
        for text in texts
    ]

    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_input_length
    ).to(device)

    outputs = model.generate(
        inputs["input_ids"],
        # Prompts in a batch are padded to a common length; pass the mask
        # explicitly so padding positions are not attended to.
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# Summarize the whole DataFrame in fixed-size chunks
batch_size = 16  # Adjust based on system memory
summaries = []

print("Starting summarization...")
for start in range(0, len(df), batch_size):
    # Slice out the next chunk and turn its rows into {'title', 'content'} records
    chunk = df.iloc[start:start + batch_size]
    batch_texts = chunk[['title', 'content']].to_dict(orient="records")

    # Summarize the chunk and append the results in row order
    summaries += summarize_batch_with_prompt(batch_texts)
    print(f"Processed {start + len(batch_texts)} of {len(df)} rows...")

# Attach the collected summaries as a new column
df['summary'] = summaries

print("Summarization completed.")

# Write the summarized frame to disk
output_file_path = "summarized_with_prompts.csv"
df.to_csv(output_file_path, index=False)

print(f"Summarized dataset saved to {output_file_path}")

Starting summarization...
Processed 16 of 752 rows...
Processed 32 of 752 rows...
Processed 48 of 752 rows...
Processed 64 of 752 rows...
Processed 80 of 752 rows...
Processed 96 of 752 rows...
Processed 112 of 752 rows...
Processed 128 of 752 rows...
Processed 144 of 752 rows...
Processed 160 of 752 rows...
Processed 176 of 752 rows...
Processed 192 of 752 rows...
Processed 208 of 752 rows...
Processed 224 of 752 rows...
Processed 240 of 752 rows...
Processed 256 of 752 rows...
Processed 272 of 752 rows...
Processed 288 of 752 rows...
Processed 304 of 752 rows...
Processed 320 of 752 rows...
Processed 336 of 752 rows...
Processed 352 of 752 rows...
Processed 368 of 752 rows...
Processed 384 of 752 rows...
Processed 400 of 752 rows...
Processed 416 of 752 rows...
Processed 432 of 752 rows...
Processed 448 of 752 rows...
Processed 464 of 752 rows...
Processed 480 of 752 rows...
Processed 496 of 752 rows...
Processed 512 of 752 rows...
Processed 528 of 752 rows...
Processed 544 of 752 ro

In [None]:
# Write the summarized DataFrame to an absolute path, without the index column
output_file_path = "/content/summarized_with_prompts.csv"
df.to_csv(path_or_buf=output_file_path, index=False)

print(f"Summarized dataset saved to {output_file_path}")

Summarized dataset saved to /content/summarized_with_prompts.csv


In [None]:
!pip install rouge_score

In [None]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calculate ROUGE for a small sample (up to the first 5 rows).
# The count is clamped to the frame length so a DataFrame with fewer rows
# does not raise IndexError.
num_samples = min(5, len(df))
for i in range(num_samples):
    original_text = df['text'].iloc[i]
    generated_summary = df['summary'].iloc[i]
    print(f"Original: {original_text}")
    print(f"Summary: {generated_summary}")
    # The article itself is passed as the first (target) argument, so the
    # scores measure overlap with the source text, not with a reference summary.
    scores = scorer.score(original_text, generated_summary)
    print(f"ROUGE Scores: {scores}\n")