In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, TrainingArguments, Trainer

In [None]:
%%capture
!pip install datasets
import datasets

In [None]:
from datasets import load_dataset

# Load the "ccdv/cnn_dailymail" dataset in its '3.0.0' configuration.
# The returned object is indexed by split name in the cells that follow.
dataset = load_dataset(path="ccdv/cnn_dailymail", name='3.0.0')

Downloading builder script:   0%|          | 0.00/9.27k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Pull the three predefined splits out of the loaded dataset, in a fixed order.
train_data, validation_data, test_data = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [None]:
# Convert each dataset split to a pandas DataFrame.
# Dataset.to_pandas() is the library's own conversion; handing the Dataset to the
# DataFrame constructor instead makes pandas iterate over it example by example,
# which is needlessly slow for the large train split.
train_df = train_data.to_pandas()
validation_df = validation_data.to_pandas()
test_df = test_data.to_pandas()

In [None]:
import re


def clean_text(text):
    """Strip punctuation from `text` and squeeze repeated spaces.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed first; afterwards each run of space characters is collapsed to a
    single space. Other whitespace (tabs, newlines) is left as it is.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    without_symbols = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return re.sub(' +', ' ', without_symbols)

In [None]:
# Store a cleaned counterpart next to each raw text column of the training frame.
train_df['article_cleaned'] = train_df['article'].map(clean_text)
train_df['highlights_cleaned'] = train_df['highlights'].map(clean_text)

In [None]:
# Display the column labels of train_df (includes the two *_cleaned columns added above).
train_df.columns


Index(['article', 'highlights', 'id', 'article_cleaned', 'highlights_cleaned'], dtype='object')

In [None]:
# Column order used to show each raw text field next to its cleaned version.
selected_columns = [
    'article',
    'article_cleaned',
    'highlights',
    'highlights_cleaned',
]


In [None]:
# Pass train_df through the DataFrame constructor and bind the result to `df`,
# the name the display cells below work with.
df = pd.DataFrame(train_df)

In [None]:
# Display the frame: raw columns, id, and the cleaned text columns.
df

Unnamed: 0,article,highlights,id,article_cleaned,highlights_cleaned
0,It's official: U.S. President Barack Obama wan...,Syrian official: Obama climbed to the top of t...,0001d1afc246a7964130f43ae940af6bc6c57f01,Its official US President Barack Obama wants l...,Syrian official Obama climbed to the top of th...
1,(CNN) -- Usain Bolt rounded off the world cham...,Usain Bolt wins third gold of world championsh...,0002095e55fcbd3a2f366d9bf92a95433dc305ef,CNN Usain Bolt rounded off the world champions...,Usain Bolt wins third gold of world championsh...
2,"Kansas City, Missouri (CNN) -- The General Ser...",The employee in agency's Kansas City office is...,00027e965c8264c35cc1bc55556db388da82b07f,Kansas City Missouri CNN The General Services ...,The employee in agencys Kansas City office is ...
3,Los Angeles (CNN) -- A medical doctor in Vanco...,NEW: A Canadian doctor says she was part of a ...,0002c17436637c4fe1837c935c04de47adb18e9a,Los Angeles CNN A medical doctor in Vancouver ...,NEW A Canadian doctor says she was part of a t...
4,(CNN) -- Police arrested another teen Thursday...,Another arrest made in gang rape outside Calif...,0003ad6ef0c37534f80b55b4235108024b407f0b,CNN Police arrested another teen Thursday the ...,Another arrest made in gang rape outside Calif...
...,...,...,...,...,...
287108,Tiger Woods’s frustration at the lamentable st...,"Woods said: ’Guys, give me a little f***ing sp...",fffdfb56fdf1a12d364562cc2b9b1d4de7481dee,Tiger Woodss frustration at the lamentable sta...,Woods said Guys give me a little fing space to...
287109,By . Mark Duell . Last updated at 4:07 PM on 2...,13 sailors died in 1804 after explosives ship ...,fffeecb8690b85de8c3faed80adbc7a978f9ae2a,By Mark Duell Last updated at 407 PM on 23rd A...,13 sailors died in 1804 after explosives ship ...
287110,"Suicide: Troll victim Hannah Smith, 14, killed...",Hannah Smith's father says Ask.fm's safety cha...,ffff5231e4c71544bc6c97015cdb16c60e42b3f4,Suicide Troll victim Hannah Smith 14 killed he...,Hannah Smiths father says Askfms safety change...
287111,By . Victoria Woollaston and Mark Prigg . PUBL...,A test version of Windows 8.1 is available to ...,ffff924b14a8d82058b6c1c5368ff1113c1632af,By Victoria Woollaston and Mark Prigg PUBLISHE...,A test version of Windows 81 is available to d...


In [None]:
# First seven rows, restricted to the raw/cleaned comparison columns.
table = df.loc[:, selected_columns].head(7)

In [None]:
# Display the seven-row comparison table.
table

Unnamed: 0,article,article_cleaned,highlights,highlights_cleaned
0,It's official: U.S. President Barack Obama wan...,Its official US President Barack Obama wants l...,Syrian official: Obama climbed to the top of t...,Syrian official Obama climbed to the top of th...
1,(CNN) -- Usain Bolt rounded off the world cham...,CNN Usain Bolt rounded off the world champions...,Usain Bolt wins third gold of world championsh...,Usain Bolt wins third gold of world championsh...
2,"Kansas City, Missouri (CNN) -- The General Ser...",Kansas City Missouri CNN The General Services ...,The employee in agency's Kansas City office is...,The employee in agencys Kansas City office is ...
3,Los Angeles (CNN) -- A medical doctor in Vanco...,Los Angeles CNN A medical doctor in Vancouver ...,NEW: A Canadian doctor says she was part of a ...,NEW A Canadian doctor says she was part of a t...
4,(CNN) -- Police arrested another teen Thursday...,CNN Police arrested another teen Thursday the ...,Another arrest made in gang rape outside Calif...,Another arrest made in gang rape outside Calif...
5,(CNN) -- Thousands on Saturday fled the area i...,CNN Thousands on Saturday fled the area in sou...,"Humanitarian groups expect 4,000 refugees in o...",Humanitarian groups expect 4000 refugees in on...
6,(CNN) -- Four groups that advocate for immigra...,CNN Four groups that advocate for immigrant ri...,NEW: 4 groups announce legal challenge in Phoe...,NEW 4 groups announce legal challenge in Phoen...


In [None]:
# Count the null entries in every column of the training frame and report them.
missing_data = train_df.isna().sum()
print("Missing Data:", missing_data, sep="\n")

Missing Data:
article               0
highlights            0
id                    0
article_cleaned       0
highlights_cleaned    0
dtype: int64


In [None]:
# Length of every article measured in whitespace-separated tokens.
train_df['article_len'] = train_df['article'].str.split().str.len()

In [None]:
# How often each distinct article length occurs in the training frame.
# Note: article_len is a word count, not a categorical target; the printed
# heading is kept as it was.
category_counts = train_df.article_len.value_counts()
print("Train Dataset - Category Counts:", category_counts, sep="\n")

Train Dataset - Category Counts:
531     432
475     431
520     426
460     423
574     421
       ... 
1933      1
1955      1
1921      1
1987      1
1943      1
Name: article_len, Length: 1923, dtype: int64


In [None]:
%%capture
!pip install rouge_score

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_metric
from rouge_score import rouge_scorer

In [None]:
# Checkpoint identifier of the BART summarisation model
model_name = "facebook/bart-large-cnn"

# Instantiate the tokenizer first, then the seq2seq model, from that checkpoint.
# Both are bound to the notebook-level names the summary helper reads.
tokenizer, model = (
    BartTokenizer.from_pretrained(model_name),
    BartForConditionalGeneration.from_pretrained(model_name),
)

In [None]:
# Function to generate summaries
def generate_summary(article_text, max_length=100, min_length=30, num_beams=4,
                     max_input_length=1024, prefix="summarize: "):
    """Summarise one article with the notebook-level `tokenizer` and `model`.

    Args:
        article_text: Raw article text.
        max_length: Upper bound on the generated sequence length, in tokens.
        min_length: Lower bound on the generated sequence length, in tokens.
        num_beams: Beam width used for beam search.
        max_input_length: Inputs longer than this many tokens are truncated.
        prefix: Text put in front of the article before tokenising. The
            default reproduces the string this notebook has always used.
            NOTE(review): "summarize: " looks like the T5 task prefix; confirm
            whether this checkpoint expects it, and pass prefix="" if not.

    Returns:
        The decoded summary as a string, with special tokens removed.
    """
    encoded = tokenizer(
        prefix + article_text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Keep the inputs on the same device as the model weights so the function
    # also works once the model has been moved to a GPU.
    encoded = encoded.to(model.device)
    summary_ids = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Function to calculate ROUGE scores
# The metric object is loaded on first use and then reused: loading it inside
# every call repeats the same setup work for each article in an evaluation loop.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package; confirm the installed version.
_rouge_metric = None


def calculate_rouge_scores(original_summary, generated_summary):
    """Compute ROUGE for one generated summary against one reference.

    Args:
        original_summary: The reference summary text.
        generated_summary: The model-produced summary text.

    Returns:
        Whatever the "rouge" metric's `compute` returns for this single
        prediction/reference pair.
    """
    global _rouge_metric
    if _rouge_metric is None:
        _rouge_metric = load_metric("rouge")
    return _rouge_metric.compute(
        predictions=[generated_summary],
        references=[original_summary],
    )

In [None]:
# Run the summariser on the first test article and score it against its reference
article, original_summary = test_df.iloc[0][['article', 'highlights']]
generated_summary = generate_summary(article)
rouge_scores = calculate_rouge_scores(original_summary, generated_summary)

# Show reference and generated text one after the other
print("Original Summary:", original_summary, sep="\n")
print("\nGenerated Summary:", generated_summary, sep="\n")

Original Summary:
James Best, who played the sheriff on "The Dukes of Hazzard," died Monday at 88 .
"Hazzard" ran from 1979 to 1985 and was among the most popular shows on TV .

Generated Summary:
James Best was best known for his portrayal of bumbling sheriff Rosco P. Coltrane on "The Dukes of Hazzard" He died in hospice in Hickory, North Carolina, of complications from pneumonia, a friend says.


In [None]:
# Calculate ROUGE scores
# The scorer gets its own name: assigning it to `rouge_scorer` would replace the
# imported module with an instance, and this cell could not be run a second time.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'])
# RougeScorer.score takes (target, prediction): the reference goes first.
# With the arguments reversed, precision and recall are reported swapped.
rouge_scores = scorer.score(original_summary, generated_summary)

# Print ROUGE scores line by line
for metric, scores in rouge_scores.items():
    print(f"{metric}:")
    print(f"Precision: {scores.precision}")
    print(f"Recall: {scores.recall}")
    print(f"F1 Score: {scores.fmeasure}")
    print()

rouge1:
Precision: 0.36666666666666664
Recall: 0.3235294117647059
F1 Score: 0.34375

rouge2:
Precision: 0.1724137931034483
Recall: 0.15151515151515152
F1 Score: 0.16129032258064518

rougeL:
Precision: 0.3333333333333333
Recall: 0.29411764705882354
F1 Score: 0.3125

rougeLsum:
Precision: 0.3333333333333333
Recall: 0.29411764705882354
F1 Score: 0.3125



In [None]:
# Function to calculate word overlap accuracy
def calculate_accuracy(original_summary, generated_summary):
    """Share of the reference's distinct tokens that also occur in the generated text.

    Both summaries are split on whitespace and reduced to sets, so repeated
    tokens count once and comparison is exact (case and attached punctuation
    matter).

    Args:
        original_summary: The reference summary text.
        generated_summary: The model-produced summary text.

    Returns:
        A float between 0 and 1; 0.0 when the reference contains no tokens.
    """
    reference_vocab = set(original_summary.split())
    candidate_vocab = set(generated_summary.split())

    # Guard against dividing by zero for an empty reference
    if not reference_vocab:
        return 0.0
    return len(reference_vocab & candidate_vocab) / len(reference_vocab)

# Word-overlap score of the current generated summary, reported as a percentage
accuracy = calculate_accuracy(
    original_summary=original_summary,
    generated_summary=generated_summary,
)
print(f"Word Overlap Accuracy: {accuracy:.2%}")

Word Overlap Accuracy: 31.03%


In [None]:
# Mean F1 over every ROUGE variant currently held in rouge_scores
average_f1_score = sum(
    rouge_scores[metric_name].fmeasure for metric_name in rouge_scores
) / len(rouge_scores)
print(f"Average ROUGE F1 Score: {average_f1_score:.4f}")

Average ROUGE F1 Score: 0.2825


In [None]:
# Accumulators for the references, the model outputs and the per-article ROUGE results
original_summaries, generated_summaries, rouge_scores_list = [], [], []

# Evaluate the first 10 rows of the test frame
for i in range(10):
    article, original_summary = test_df.iloc[i][['article', 'highlights']]

    # Summarise, then score against the reference
    generated_summary = generate_summary(article)
    rouge_scores = calculate_rouge_scores(original_summary, generated_summary)

    # Keep everything for later inspection
    original_summaries.append(original_summary)
    generated_summaries.append(generated_summary)
    rouge_scores_list.append(rouge_scores)

    # Per-article report; one item per output line
    print(
        f"Article {i+1}",
        "Original Summary:",
        original_summary,
        "\nGenerated Summary:",
        generated_summary,
        "\nROUGE Scores:",
        rouge_scores,
        "-----------------------------------",
        sep="\n",
    )


Article 1
Original Summary:
James Best, who played the sheriff on "The Dukes of Hazzard," died Monday at 88 .
"Hazzard" ran from 1979 to 1985 and was among the most popular shows on TV .

Generated Summary:
James Best was best known for his portrayal of bumbling sheriff Rosco P. Coltrane on "The Dukes of Hazzard" He died in hospice in Hickory, North Carolina, of complications from pneumonia, a friend says.

ROUGE Scores:
{'rouge1': AggregateScore(low=Score(precision=0.3235294117647059, recall=0.36666666666666664, fmeasure=0.34375), mid=Score(precision=0.3235294117647059, recall=0.36666666666666664, fmeasure=0.34375), high=Score(precision=0.3235294117647059, recall=0.36666666666666664, fmeasure=0.34375)), 'rouge2': AggregateScore(low=Score(precision=0.15151515151515152, recall=0.1724137931034483, fmeasure=0.16129032258064518), mid=Score(precision=0.15151515151515152, recall=0.1724137931034483, fmeasure=0.16129032258064518), high=Score(precision=0.15151515151515152, recall=0.172413793103

Pegasus

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [None]:
!pip install transformers
!pip install sentencepiece
!git clone https://github.com/huggingface/transformers

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Cloning into 'transformers'...
remote: Enumerating objects: 174651, done.[K
remote: Counting objects: 100% (21721/21721), done.[K
remote: Compressing objects: 100% (940/940), done.[K
remote: Total 174651 (delta 21266), reused 20805 (delta 20770), pack-reused 152930[K
Receiving objects: 100% (174651/174651), 173.31 MiB | 23.34 MiB/s, done.
Resolving deltas: 100% (132666/132666), done.


In [None]:
from transformers import AutoTokenizer

In [None]:
# Checkpoint identifier of the Pegasus model
model_name = "google/pegasus-xsum"

# Preferred compute device: GPU when one is available, CPU otherwise.
# NOTE(review): this cell does not move the model onto `device`.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# AutoTokenizer is used here in place of the explicit PegasusTokenizer class.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [None]:
# Function to generate summaries with the currently loaded tokenizer/model pair
def generate_summary(article_text, max_length=100, min_length=30, num_beams=4,
                     max_input_length=512, prefix="summarize: "):
    """Summarise one article with the notebook-level `tokenizer` and `model`.

    Args:
        article_text: Raw article text.
        max_length: Upper bound on the generated sequence length, in tokens.
        min_length: Lower bound on the generated sequence length, in tokens.
        num_beams: Beam width used for beam search.
        max_input_length: Inputs longer than this many tokens are truncated.
        prefix: Text put in front of the article before tokenising. The
            default reproduces the string this notebook has always used.
            NOTE(review): "summarize: " looks like the T5 task prefix; confirm
            whether this checkpoint expects it, and pass prefix="" if not.

    Returns:
        The decoded summary as a string, with special tokens removed.
    """
    # Tokenize, truncating over-long articles
    encoded = tokenizer(
        prefix + article_text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Keep the inputs on the same device as the model weights so the function
    # also works once the model has been moved to a GPU.
    encoded = encoded.to(model.device)
    summary_ids = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Function to calculate ROUGE scores
def calculate_rouge_scores(original_summary, generated_summary):
    """Score a generated summary against its reference with ROUGE-1/2/L/Lsum.

    Args:
        original_summary: The reference summary text.
        generated_summary: The model-produced summary text.

    Returns:
        A dict keyed by 'rouge1', 'rouge2', 'rougeL' and 'rougeLsum'; each
        value exposes `precision`, `recall` and `fmeasure`.
    """
    # Local import keeps this function independent of whatever the
    # notebook-level name `rouge_scorer` happens to be bound to.
    from rouge_score import rouge_scorer

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'])

    # RougeScorer.score takes (target, prediction): the reference goes first.
    # With the arguments reversed, precision and recall come back swapped.
    return scorer.score(original_summary, generated_summary)

In [None]:
# Run the summariser on the first test article and score it against its reference
article, original_summary = test_df.iloc[0][['article', 'highlights']]
generated_summary = generate_summary(article)
rouge_scores = calculate_rouge_scores(original_summary, generated_summary)

# Show reference and generated text one after the other
print("Original Summary:", original_summary, sep="\n")
print("\nGenerated Summary:", generated_summary, sep="\n")

Original Summary:
James Best, who played the sheriff on "The Dukes of Hazzard," died Monday at 88 .
"Hazzard" ran from 1979 to 1985 and was among the most popular shows on TV .

Generated Summary:
"Hazzard" star James Best, best known for his role as bumbling sheriff Rosco P. Coltrane on TV's "The Dukes of Hazzard," died Monday after a brief illness.


In [None]:
# Score the current generated summary against its reference
rouge_scores = calculate_rouge_scores(original_summary, generated_summary)

# Report each ROUGE variant on four lines followed by a blank line
for metric, score in rouge_scores.items():
    print(
        f"{metric}:",
        f"Precision: {score.precision:.4f}",
        f"Recall: {score.recall:.4f}",
        f"F1 Score: {score.fmeasure:.4f}",
        "",
        sep="\n",
    )

rouge1:
Precision: 0.4000
Recall: 0.4286
F1 Score: 0.4138

rouge2:
Precision: 0.2414
Recall: 0.2593
F1 Score: 0.2500

rougeL:
Precision: 0.3333
Recall: 0.3571
F1 Score: 0.3448

rougeLsum:
Precision: 0.4000
Recall: 0.4286
F1 Score: 0.4138



In [None]:
# Function to calculate word overlap accuracy
def calculate_accuracy(original_summary, generated_summary):
    """Return the fraction of distinct reference tokens found in the generated summary.

    Tokens are whitespace-separated and matched exactly; duplicates are
    ignored because both texts are reduced to sets.

    Args:
        original_summary: The reference summary text.
        generated_summary: The model-produced summary text.

    Returns:
        Overlap ratio as a float, or 0.0 if the reference has no tokens.
    """
    ref_tokens, gen_tokens = (
        set(text.split()) for text in (original_summary, generated_summary)
    )
    # Number of reference tokens that the generated summary also contains
    shared_count = sum(1 for token in ref_tokens if token in gen_tokens)
    return shared_count / len(ref_tokens) if ref_tokens else 0.0

# Word-overlap score of the current generated summary, reported as a percentage
accuracy = calculate_accuracy(
    original_summary=original_summary,
    generated_summary=generated_summary,
)
print(f"Word Overlap Accuracy: {accuracy:.2%}")

Word Overlap Accuracy: 37.93%


In [None]:
# Mean F1 over every ROUGE variant currently held in rouge_scores
average_f1_score = sum(
    rouge_scores[variant].fmeasure for variant in rouge_scores.keys()
) / len(rouge_scores)
print(f"Average ROUGE F1 Score: {average_f1_score:.4f}")

Average ROUGE F1 Score: 0.3556


In [None]:
# Accumulators for the references, the model outputs and the per-article ROUGE results
original_summaries, generated_summaries, rouge_scores_list = [], [], []

# Evaluate the first 10 rows of the test frame
for i in range(10):
    article, original_summary = test_df.iloc[i][['article', 'highlights']]

    # Summarise, then score against the reference
    generated_summary = generate_summary(article)
    rouge_scores = calculate_rouge_scores(original_summary, generated_summary)

    # Keep everything for later inspection
    original_summaries.append(original_summary)
    generated_summaries.append(generated_summary)
    rouge_scores_list.append(rouge_scores)

    # Per-article report; one item per output line
    print(
        f"Article {i+1}",
        "Original Summary:",
        original_summary,
        "\nGenerated Summary:",
        generated_summary,
        "\nROUGE Scores:",
        rouge_scores,
        "-----------------------------------",
        sep="\n",
    )


Article 1
Original Summary:
James Best, who played the sheriff on "The Dukes of Hazzard," died Monday at 88 .
"Hazzard" ran from 1979 to 1985 and was among the most popular shows on TV .

Generated Summary:
"Hazzard" star James Best, best known for his role as bumbling sheriff Rosco P. Coltrane on TV's "The Dukes of Hazzard," died Monday after a brief illness.

ROUGE Scores:
{'rouge1': Score(precision=0.4, recall=0.42857142857142855, fmeasure=0.4137931034482759), 'rouge2': Score(precision=0.2413793103448276, recall=0.25925925925925924, fmeasure=0.25), 'rougeL': Score(precision=0.3333333333333333, recall=0.35714285714285715, fmeasure=0.3448275862068965), 'rougeLsum': Score(precision=0.4, recall=0.42857142857142855, fmeasure=0.4137931034482759)}
-----------------------------------
Article 2
Original Summary:
A lawyer for Dr. Anthony Moschetto says the charges against him are baseless .
Moschetto, 54, was arrested for selling drugs and weapons, prosecutors say .
Authorities allege Moschet