In [1]:
# !pip install rouge-metric
# !pip install evaluate

## Import packages

In [1]:
from rouge_metric import PyRouge
import pandas as pd

## Read file

In [2]:
df = pd.read_csv("extractive_summary.csv")
df = df[['url', 'bert_extractive', 'gold_truth']]
df.shape

(221, 3)

In [3]:
df.head()

Unnamed: 0,url,bert_extractive,gold_truth
0,https://edition.cnn.com/2023/02/28/sport/los-a...,during the lakers' 27-point comeback victory o...,LeBron James' injury adds to the Los Angeles L...
1,https://edition.cnn.com/2023/02/16/sport/lesle...,it's while running in the moors and hills of h...,Triathlete and screenwriter Lesley Paterson's ...
2,https://edition.cnn.com/2023/02/27/sport/damia...,damian lillard set an nba record in his monste...,
3,https://edition.cnn.com/2023/02/27/football/su...,betrayal has formed part of european politics ...,The European Super League (ESL) was a proposed...
4,https://edition.cnn.com/2023/02/22/football/pa...,"""nakba"" means catastrophe in arabic, evoking a...",The Israeli-Palestinian conflict has cast a sh...


Since we only produced gold truth summaries for 99 articles (1 was violating chatgpt rules), our dataframe will have nulls. While we have 99 gold truth articles, BERT did not manage to provide extractive summaries for the dataset, which is why after dropping, there are only 92 rows.

In [4]:
df.dropna(inplace=True)
df.shape

(92, 3)

## ROGUE evaluation

In [23]:
rouge = PyRouge(rouge_n=(1, 2, 3, 4), rouge_l=True, rouge_w=True,
                rouge_w_weight=1.2, rouge_s=True, rouge_su=True, skip_gap=4)
scores = rouge.evaluate(df["bert_extractive"], df["gold_truth"])

In [24]:
scores

{'rouge-1': {'r': 0.08947609327822567,
  'p': 0.000871344480681971,
  'f': 0.001725881817235716},
 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
 'rouge-3': {'r': 0.0, 'p': 0.0, 'f': 0.0},
 'rouge-4': {'r': 0.0, 'p': 0.0, 'f': 0.0},
 'rouge-l': {'r': 0.08947609327822567,
  'p': 0.000871344480681971,
  'f': 0.001725881817235716},
 'rouge-w-1.2': {'r': 0.13070249344486135,
  'p': 0.00131211396307712,
  'f': 0.0025981453117238248},
 'rouge-s4': {'r': 0.0, 'p': 0.0, 'f': 0.0},
 'rouge-su4': {'r': 0.0, 'p': 0.0, 'f': 0.0}}

## BERT score evaluation

In [8]:
# !pip install evaluate

In [5]:
# https://huggingface.co/spaces/evaluate-metric/bertscore/blob/main/README.md

from evaluate import load
bertscore = load("bertscore")
results = bertscore.compute(predictions=df['gold_truth'], references=df['bert_extractive'], lang="en")

In [12]:
results

{'precision': [0.8548375368118286,
  0.8423986434936523,
  0.842686653137207,
  0.8637030124664307,
  0.8535655736923218,
  0.8320750594139099,
  0.860188901424408,
  0.8893088102340698,
  0.8630494475364685,
  0.8482155799865723,
  0.8893137574195862,
  0.8820285797119141,
  0.8916323184967041,
  0.8869955539703369,
  0.8773421049118042,
  0.8591261506080627,
  0.8486756086349487,
  0.8835134506225586,
  0.888674795627594,
  0.9014855623245239,
  0.8856498003005981,
  0.8705019950866699,
  0.8862748742103577,
  0.7850698232650757,
  0.8167484402656555,
  0.8623031377792358,
  0.9159845113754272,
  0.8731406927108765,
  0.8102579116821289,
  0.8244129419326782,
  0.8759710788726807,
  0.8868055939674377,
  0.8700794577598572,
  0.8321923613548279,
  0.8916469216346741,
  0.8761851191520691,
  0.8672460913658142,
  0.8945640921592712,
  0.8566836714744568,
  0.8437748551368713,
  0.8431178331375122,
  0.8398867845535278,
  0.9025480151176453,
  0.8543885946273804,
  0.8268226385116577,


Calculating average precision

In [9]:
total = sum(results["precision"])
length = len(results["precision"])
print(total/length)

0.869330588242282


Calculating average recall

In [13]:
total = sum(results["recall"])
length = len(results["recall"])
print(total/length)

0.8564417964738348


Calculating average f1

In [14]:
total = sum(results["f1"])
length = len(results["f1"])
print(total/length)

0.8626157602538234
