In [1]:
from datasets import load_dataset
from transformers import pipeline
from rouge_score import rouge_scorer
import pandas as pd

In [2]:
# Load the "xsum" dataset, requesting version 1.2.0.
xsum_dataset = load_dataset(
    "xsum",
    version="1.2.0",
)

In [3]:
# Keep only the first five rows of the "train" split as a small working sample.
xsum_sample = xsum_dataset["train"].select(range(5))
# Convert the sample to a pandas DataFrame so the notebook renders it as a table.
display(xsum_sample.to_pandas())

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984


In [4]:
# Summarization pipeline backed by the "t5-small" checkpoint.
summarizer_t5 = pipeline(task="summarization", model="t5-small")

In [5]:
# Summarize every sample document with T5; the length limits and input
# truncation are passed per call here.
results = summarizer_t5(
    xsum_sample["document"],
    min_length=20,
    max_length=40,
    truncation=True,
)


In [6]:
# Put each generated summary next to its reference summary and source document.
# The two frames are combined with an index-aligned join, then narrowed to the
# three columns used downstream.
opt_result = (
    pd.DataFrame.from_dict(results)
    .rename(columns={"summary_text": "generated_summary"})
    .join(pd.DataFrame.from_dict(xsum_sample))
)[["generated_summary", "summary", "document"]]
display(opt_result.head())

Unnamed: 0,generated_summary,summary,document
0,the full cost of damage in Newton Stewart is s...,Clean-up operations are continuing across the ...,"The full cost of damage in Newton Stewart, one..."
1,a fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,A fire alarm went off at the Holiday Inn in Ho...
2,Sebastian Vettel will start third ahead of tea...,Lewis Hamilton stormed to pole position at the...,Ferrari appeared in a position to challenge un...
3,the 67-year-old is accused of committing the o...,A former Lincolnshire Police officer carried o...,"John Edward Bates, formerly of Spalding, Linco..."
4,a man receiving psychiatric treatment at the c...,An armed man who locked himself into a room at...,Patients and staff were evacuated from Cerahpa...


In [7]:
# Print the first example in full: model output, reference summary, source text.
first_example = opt_result.iloc[0]
for label, column in (
    ("Generated Summary : ", "generated_summary"),
    ("Summary : ", "summary"),
    ("Document : ", "document"),
):
    print(label, first_example[column])

Generated Summary :  the full cost of damage in Newton Stewart is still being assessed . many roads in peeblesshire remain badly affected by standing water . a flood alert remains in place across the
Summary :  Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.
Document :  The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.
Jeanette Tate, who owns the Cinn

In [8]:
def calculate_rouge(data):
    """Add ROUGE-1, ROUGE-2 and ROUGE-L f-score columns to ``data``.

    Each row's ``generated_summary`` is scored against that row's ``summary``
    (passed to the scorer as the reference) using a ``RougeScorer`` with
    stemming enabled.

    Args:
        data: DataFrame with ``summary`` and ``generated_summary`` columns.

    Returns:
        The same DataFrame object that was passed in. It is modified in place
        and gains three float columns: ``r1_fscore``, ``r2_fscore`` and
        ``rl_fscore``.
    """
    column_by_metric = {
        "rouge1": "r1_fscore",
        "rouge2": "r2_fscore",
        "rougeL": "rl_fscore",
    }
    scorer = rouge_scorer.RougeScorer(list(column_by_metric), use_stemmer=True)

    # Score every (reference, generated) pair exactly once. A single call
    # already yields all requested metrics, so there is no need to re-score
    # the row for each output column.
    row_scores = [
        scorer.score(reference, generated)
        for reference, generated in zip(data["summary"], data["generated_summary"])
    ]

    for metric, column in column_by_metric.items():
        # Position 2 of each per-metric score is the f-score component.
        data[column] = pd.Series(
            [scores[metric][2] for scores in row_scores],
            index=data.index,
            dtype=float,
        )

    return data

In [9]:
# Score the T5 summaries. calculate_rouge writes the f-score columns onto
# opt_result itself and returns that same DataFrame, so score_ret and
# opt_result refer to one object.
score_ret=calculate_rouge(opt_result)

In [10]:
# Report the mean f-score of each ROUGE variant over the T5 summaries.
for label, column in (
    ("ROUGE - 1 : ", "r1_fscore"),
    ("ROUGE - 2 : ", "r2_fscore"),
    ("ROUGE - L : ", "rl_fscore"),
):
    print(label, score_ret[column].mean())

ROUGE - 1 :  0.21093822113349553
ROUGE - 2 :  0.03740640713172167
ROUGE - L :  0.1288910963823506


In [11]:
# Summarization pipeline backed by the "facebook/bart-large-cnn" checkpoint.
# The length limits and truncation flag are passed to the pipeline constructor
# here rather than at call time.
summarizer_bart_cnn = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    truncation=True,
    min_length=20,
    max_length=40,
)

In [12]:
# Summarize the same sample documents with the BART pipeline. No generation
# arguments are passed here; they were given when the pipeline was constructed.
results_bart = summarizer_bart_cnn(xsum_sample["document"])

In [13]:
import pandas as pd

# Put each BART-generated summary next to its reference summary and source
# document (index-aligned join), keeping only the three columns used downstream.
opt_result_bart = (
    pd.DataFrame.from_dict(results_bart)
    .rename(columns={"summary_text": "generated_summary"})
    .join(pd.DataFrame.from_dict(xsum_sample))
)[["generated_summary", "summary", "document"]]



In [14]:
# Score the BART summaries. calculate_rouge writes the f-score columns onto
# opt_result_bart itself and returns that same DataFrame.
score_ret_bart=calculate_rouge(opt_result_bart)

In [15]:
# Report the mean f-score of each ROUGE variant over the BART summaries.
for label, column in (
    ("ROUGE - 1 : ", "r1_fscore"),
    ("ROUGE - 2 : ", "r2_fscore"),
    ("ROUGE - L : ", "rl_fscore"),
):
    print(label, score_ret_bart[column].mean())

ROUGE - 1 :  0.15316964796210078
ROUGE - 2 :  0.027547169811320753
ROUGE - L :  0.1211003838928367
