In [25]:
def _read_summaries(path):
    """Return the lines of `path` as a list, each stripped and lower-cased.

    One list entry is produced per line of the file, in file order.
    """
    # Explicit encoding so decoding does not depend on the platform default.
    # NOTE(review): assumes the summary files are UTF-8 — confirm.
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip().lower() for line in f]


gen_sums = _read_summaries("./bart.part.txt")
gt_sums = _read_summaries("./target.part.txt")
corrected_sums = _read_summaries("./corrected.part.txt")

# The three lists are compared position by position later on. A length
# mismatch would otherwise be silently truncated by zip() and yield averages
# over misaligned or partial data, so fail loudly here instead.
if not (len(gen_sums) == len(gt_sums) == len(corrected_sums)):
    raise ValueError(
        "Summary files have different line counts: "
        f"bart={len(gen_sums)}, target={len(gt_sums)}, "
        f"corrected={len(corrected_sums)}"
    )


In [29]:
from rouge_score import rouge_scorer
import numpy as np

# ROUGE-L scorer with stemming enabled.
rouge_l_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# Per-example ROUGE-L F-measures of each system against the ground truth.
bart_rouge_scores = []
corrected_rouge_scores = []
for reference, *candidates in zip(gt_sums, gen_sums, corrected_sums):
    # candidates == [bart summary, corrected summary]; score each one against
    # the same reference and store it in the matching result list.
    for bucket, candidate in zip((bart_rouge_scores, corrected_rouge_scores), candidates):
        bucket.append(
            rouge_l_scorer.score(reference, candidate)['rougeL'].fmeasure
        )

# Reference values labelled "paper" in the report below.
paper_bart = 38.63
paper_corrected = 36.62
# Mean F-measure scaled by 100 to match the scale of the reference values.
our_bart = 100 * np.mean(bart_rouge_scores)
our_corrected = 100 * np.mean(corrected_rouge_scores)

print("Rouge Scores")
print("\n".join(
    f"{label}, paper: {paper}, ours: {ours:.2f}, diff: {(paper - ours):.2f}"
    for label, paper, ours in (
        ("BART-large", paper_bart, our_bart),
        ("Corrected", paper_corrected, our_corrected),
    )
))

Rouge Scores
BART-large, paper: 38.63, ours: 37.80, diff: 0.83
Corrected, paper: 36.62, ours: 35.81, diff: 0.81


In [30]:
from bert_score import score
import numpy as np

def evaluate_bert_score(gt_summaries, generated_summaries):
    """Score generated summaries against ground-truth summaries with BERTScore.

    Calls ``score`` with the generated summaries as the first argument and the
    ground-truth summaries as the second, using ``lang="en"``. The means of the
    returned precision, recall and F values are printed with six decimals.

    Args:
        gt_summaries: Ground-truth (reference) summaries.
        generated_summaries: Summaries to evaluate.

    Returns:
        The F values converted with ``.tolist()`` (presumably one value per
        summary pair — confirm against the bert_score documentation).
    """
    # return_hash=True makes score() return a (scores, hash) pair; the hash is
    # not used here.
    scores, _model_hash = score(
        generated_summaries,
        gt_summaries,
        lang="en",
        return_hash=True,
    )
    precision, recall, f_measure = scores

    mean_p = precision.mean().item()
    mean_r = recall.mean().item()
    mean_f = f_measure.mean().item()
    print(f"P={mean_p:.6f} R={mean_r:.6f} F={mean_f:.6f}")

    return f_measure.tolist()

# F scores of each system's summaries against the ground truth.
bart_bert_scores = evaluate_bert_score(gt_sums, gen_sums)
corrected_bert_scores = evaluate_bert_score(gt_sums, corrected_sums)

# Reference values labelled "paper" in the report below. These names are also
# assigned in the ROUGE cell, so running this cell overwrites those values.
paper_bart = 91.61
paper_corrected = 91.10
# Mean F score scaled by 100 to match the scale of the reference values.
our_bart = 100 * np.mean(bart_bert_scores)
our_corrected = 100 * np.mean(corrected_bert_scores)

print("BERT Scores")
print("\n".join(
    f"{label}, paper: {paper}, ours: {ours:.2f}, diff: {(paper - ours):.2f}"
    for label, paper, ours in (
        ("BART-large", paper_bart, our_bart),
        ("Corrected", paper_corrected, our_corrected),
    )
))

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: 