# Loading the News Article and Reference Summary:

In [12]:
import os
import random
import pandas as pd
from bert_score import score

In [78]:
articles_dir = r'BBC News Summary\News Articles\business'
summaries_dir = r'BBC News Summary\Summaries\business'

In [79]:
article_files = sorted([f for f in os.listdir(articles_dir) if f.endswith('.txt')])
summary_files = sorted([f for f in os.listdir(summaries_dir) if f.endswith('.txt')])

In [80]:
file_pairs = list(zip(article_files, summary_files))

In [81]:
random.seed(42)  
selected_pairs = random.sample(file_pairs, 20)

In [82]:
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# 1. PEGASUS Model :

In [8]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
pegasus_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
pegasus_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def generate_summary_pegasus(article, max_length=100):
    inputs = pegasus_tokenizer.encode(article, return_tensors="pt", truncation=True)
    summary_ids = pegasus_model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
    return pegasus_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [83]:
pegasus_precision_scores = []
pegasus_recall_scores = []
pegasus_f1_scores = []

# Iterate over the selected pairs
for article_file, summary_file in selected_pairs:
    article_path = os.path.join(articles_dir, article_file)
    summary_path = os.path.join(summaries_dir, summary_file)
    
    # Read article and reference summary
    article = read_file(article_path)
    reference_summary = read_file(summary_path)
    
    # Generate summary using Pegasus
    generated_summary = generate_summary_pegasus(article)
    
    # Evaluate the summary using BERTScore
    P, R, F1 = score([generated_summary], [reference_summary], lang='en', verbose=False)
    
    # Store the individual scores
    pegasus_precision_scores.append(P.item())
    pegasus_recall_scores.append(R.item())
    pegasus_f1_scores.append(F1.item())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

In [84]:
pegasus_business = pd.DataFrame({
    'Precision': pegasus_precision_scores,
    'Recall': pegasus_recall_scores,
    'F1 Score': pegasus_f1_scores
})

print(pegasus_business)

    Precision    Recall  F1 Score
0    0.957410  0.862893  0.907697
1    0.933025  0.844105  0.886340
2    0.959055  0.885500  0.920811
3    0.940711  0.861259  0.899233
4    0.956425  0.878585  0.915854
5    0.908054  0.850337  0.878248
6    0.896424  0.837705  0.866070
7    0.900026  0.846090  0.872225
8    0.849548  0.850108  0.849828
9    0.880596  0.840135  0.859890
10   0.951864  0.871208  0.909752
11   0.926327  0.896395  0.911115
12   0.954179  0.861950  0.905722
13   0.951534  0.838982  0.891720
14   0.948883  0.860580  0.902577
15   0.908829  0.846684  0.876656
16   0.979966  0.893668  0.934830
17   0.965993  0.861489  0.910753
18   0.964793  0.906781  0.934888
19   0.944252  0.888565  0.915563


In [85]:
pegasus_business.to_excel('pegasus_business.xlsx', index=False)

# 2. BART Model:

In [23]:
from transformers import BartForConditionalGeneration, BartTokenizer
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")

In [26]:
def generate_summary_bart(article, max_length=100):
    inputs = bart_tokenizer.encode(article, return_tensors="pt", truncation=True)
    summary_ids = bart_model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [86]:
bart_precision_scores = []
bart_recall_scores = []
bart_f1_scores = []

# Iterate over the selected pairs
for article_file, summary_file in selected_pairs:
    article_path = os.path.join(articles_dir, article_file)
    summary_path = os.path.join(summaries_dir, summary_file)
    
    # Read article and reference summary
    article = read_file(article_path)
    reference_summary = read_file(summary_path)
    
    # Generate summary using BART
    generated_summary = generate_summary_bart(article)
    
    # Evaluate the summary using BERTScore
    P, R, F1 = score([generated_summary], [reference_summary], lang='en', verbose=False)
    
    # Store the individual scores
    bart_precision_scores.append(P.item())
    bart_recall_scores.append(R.item())
    bart_f1_scores.append(F1.item())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

In [87]:
bart_business = pd.DataFrame({
    'Precision': bart_precision_scores,
    'Recall': bart_recall_scores,
    'F1 Score': bart_f1_scores
})

print(bart_business)

    Precision    Recall  F1 Score
0    0.864749  0.875710  0.870195
1    0.914950  0.884959  0.899704
2    0.888840  0.868622  0.878615
3    0.901507  0.899700  0.900603
4    0.894263  0.879567  0.886854
5    0.900553  0.861527  0.880608
6    0.852954  0.859692  0.856310
7    0.861058  0.844869  0.852887
8    0.863347  0.885605  0.874334
9    0.919045  0.890211  0.904398
10   0.914039  0.885249  0.899414
11   0.897617  0.875244  0.886290
12   0.913847  0.902328  0.908051
13   0.897739  0.863924  0.880507
14   0.688211  0.805876  0.742410
15   0.840859  0.817367  0.828946
16   0.834498  0.845945  0.840183
17   0.883742  0.868690  0.876151
18   0.680873  0.796287  0.734071
19   0.921112  0.899490  0.910172


In [88]:
bart_business.to_excel('bart_business.xlsx', index=False)

# 3. T5 Model:

In [30]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
t5_tokenizer = T5Tokenizer.from_pretrained("t5-large")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-large")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [46]:
def generate_summary_t5(article, max_length=100):
    inputs = t5_tokenizer.encode("summarize: " + article, return_tensors="pt", truncation=True)
    summary_ids = t5_model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
    return t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [89]:
t5_precision_scores = []
t5_recall_scores = []
t5_f1_scores = []

# Iterate over the selected pairs
for article_file, summary_file in selected_pairs:
    article_path = os.path.join(articles_dir, article_file)
    summary_path = os.path.join(summaries_dir, summary_file)
    
    # Read article and reference summary
    article = read_file(article_path)
    reference_summary = read_file(summary_path)
    
    # Generate summary using BART
    generated_summary = generate_summary_bart(article)
    
    # Evaluate the summary using BERTScore
    P, R, F1 = score([generated_summary], [reference_summary], lang='en', verbose=False)
    
    # Store the individual scores
    t5_precision_scores.append(P.item())
    t5_recall_scores.append(R.item())
    t5_f1_scores.append(F1.item())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

In [90]:
t5_business = pd.DataFrame({
    'Precision': t5_precision_scores,
    'Recall': t5_recall_scores,
    'F1 Score': t5_f1_scores
})

print(t5_business)

    Precision    Recall  F1 Score
0    0.864749  0.875710  0.870195
1    0.914950  0.884959  0.899704
2    0.888840  0.868622  0.878615
3    0.901507  0.899700  0.900603
4    0.894263  0.879567  0.886854
5    0.900553  0.861527  0.880608
6    0.852954  0.859692  0.856310
7    0.861058  0.844869  0.852887
8    0.863347  0.885605  0.874334
9    0.919045  0.890211  0.904398
10   0.914039  0.885249  0.899414
11   0.897617  0.875244  0.886290
12   0.913847  0.902328  0.908051
13   0.897739  0.863924  0.880507
14   0.688211  0.805876  0.742410
15   0.840859  0.817367  0.828946
16   0.834498  0.845945  0.840183
17   0.883742  0.868690  0.876151
18   0.680873  0.796287  0.734071
19   0.921112  0.899490  0.910172


In [91]:
t5_business.to_excel('t5_business.xlsx', index=False)

# Sample News Article and Reference Summary

In [96]:
file_path = r'BBC News Summary\News Articles\tech\007.txt' 

with open(file_path, 'r') as file:
    text = file.read()

print(text)


Microsoft releases bumper patches

Microsoft has warned PC users to update their systems with the latest security fixes for flaws in Windows programs.

In its monthly security bulletin, it flagged up eight "critical" security holes which could leave PCs open to attack if left unpatched. The number of holes considered "critical" is more than usual. They affect Windows programs, including Internet Explorer (IE), media player and instant messaging. Four other important fixes were also released. These were considered to be less critical, however. If not updated, either automatically or manually, PC users running the programs could be vulnerable to viruses or other malicious attacks designed to exploit the holes. Many of the flaws could be used by virus writers to take over computers remotely, install programs, change, and delete or see data.

One of the critical patches Microsoft has made available is an important one that fixes some IE flaws. Stephen Toulouse, a Microsoft security manager

In [97]:
file_path = r'BBC News Summary\Summaries\tech\007.txt' 

with open(file_path, 'r') as file:
    reference_summary = file.read()

print(reference_summary)


Microsoft has warned PC users to update their systems with the latest security fixes for flaws in Windows programs.One of the critical patches Microsoft has made available is an important one that fixes some IE flaws.In its monthly security bulletin, it flagged up eight "critical" security holes which could leave PCs open to attack if left unpatched.Often, when a critical flaw is announced, spates of viruses follow because home users and businesses leave the flaw unpatched.The most important ones are those which are classed as "critical".The number of holes considered "critical" is more than usual.


# Model Summaries

## PEGASUS:

In [98]:
pegasus_summary = generate_summary_pegasus(text)
print(pegasus_summary)

Microsoft releases bumper patches Microsoft has warned PC users to update their systems with the latest security fixes for flaws in Windows programs. Its latest releases came the week that the company announced it was to buy security software maker Sybari Software as part of Microsoft's plans to make its own security programs.


## BART:

In [99]:
bart_summary = generate_summary_bart(text)
print(bart_summary)

Microsoft releases bumper patches for 'critical' security holes in Windows programsImage copyright Getty Images Image caption Microsoft has warned PC users to update their systems with the latest security fixes for flaws in Windows productsMicrosoft has warned PCs running its Windows operating system that they could be vulnerable to viruses or other malicious attacks designed to exploit the holes.In its monthly security bulletin, it flagged up eight "critical" security holes which could leave PCs open to attack if left unpatched. They affect Windows programs,


## T5:

In [100]:
t5_summary = generate_summary_t5(text)
print(t5_summary)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Microsoft has warned PC users to update their systems with the latest security fixes for flaws in windows programs. it flagged up eight "critical" security holes which could leave PCs open to attack if left unpatched. many of the flaws could be used by virus writers to take over computers remotely, install programs, change, and delete or see data.
