# Automatic ROUGE evaluation

In [1]:
import os

import numpy as np
from rouge_score import rouge_scorer
from tqdm import tqdm

In [2]:
# Directory whose files are the reference paper abstracts; every entry in it is scored.
ABSTRACT_PATH = 'abstracts'
# Root directory of generated summaries; each experiment has its own sub-directory
# containing one summary per abstract, stored under the same filename as the abstract.
GENERATED_SUMMARY_PATH = 'gen_summaries'

def evaluate_summaries_using_rouge(experiment_name: str):
    """Score every generated summary of one experiment against its reference abstract.

    For each file in ``ABSTRACT_PATH`` the summary with the same filename is read
    from ``GENERATED_SUMMARY_PATH/<experiment_name>/`` and compared to the abstract
    with ROUGE-1, ROUGE-2 and ROUGE-L (no stemming). The mean and standard
    deviation of the F-measure of each metric are printed.

    Args:
        experiment_name: Name of the sub-directory of ``GENERATED_SUMMARY_PATH``
            holding the summaries to evaluate; also used in the printed report.

    Returns:
        A tuple ``(rouge_1_scores, rouge_2_scores, rouge_l_scores)`` of float
        arrays of shape ``(number_of_abstracts, 3)``. The columns are
        precision, recall and F-measure; row ``i`` belongs to the ``i``-th
        abstract filename in sorted order.

    Raises:
        FileNotFoundError: If an abstract has no matching generated summary
            (or one of the directories does not exist).
    """
    # Column of the score matrices that holds the F-measure.
    fmeasure_col = 2

    # List the directory once and sort it: os.listdir order is arbitrary, and a
    # stable order keeps row i tied to the same document across runs/experiments.
    filenames = sorted(os.listdir(ABSTRACT_PATH))
    no_files = len(filenames)
    rouge_1_scores = np.zeros([no_files, 3])
    rouge_2_scores = np.zeros([no_files, 3])
    rouge_l_scores = np.zeros([no_files, 3])

    if no_files == 0:
        # Avoid NaN results and numpy RuntimeWarnings from averaging empty arrays.
        print(f'No abstracts found in {ABSTRACT_PATH!r}; nothing to evaluate for {experiment_name} experiment.')
        return rouge_1_scores, rouge_2_scores, rouge_l_scores

    # The scorer is stateless with respect to the texts, so build it once
    # instead of once per file.
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)

    # Wrapping the list (not the enumerate object) lets tqdm see the length and
    # show a real progress bar with percentage and ETA.
    for i, filename in enumerate(tqdm(filenames)):

        # Read paper abstract
        with open(os.path.join(ABSTRACT_PATH, filename), 'r') as abstract_file:
            paper_abstract = abstract_file.read()

        # Read generated summary
        with open(os.path.join(GENERATED_SUMMARY_PATH, experiment_name, filename), 'r') as summary_file:
            generated_summary = summary_file.read()

        # Compute rouge scores (reference first, prediction second) and store
        # them with an explicit precision / recall / F-measure column order.
        rouge_scores = rouge.score(paper_abstract, generated_summary)
        for metric, score_matrix in (
            ('rouge1', rouge_1_scores),
            ('rouge2', rouge_2_scores),
            ('rougeL', rouge_l_scores),
        ):
            score = rouge_scores[metric]
            score_matrix[i] = [score.precision, score.recall, score.fmeasure]

    # Display average results (F-measure only)
    for label, score_matrix in (
        ('rouge-1', rouge_1_scores),
        ('rouge-2', rouge_2_scores),
        ('rouge-l', rouge_l_scores),
    ):
        f_measures = score_matrix[:, fmeasure_col]
        print(f'Average {label} score for {experiment_name} experiment: {np.mean(f_measures)}, std:{np.std(f_measures)}')

    # Return results
    return rouge_1_scores, rouge_2_scores, rouge_l_scores

## Baseline (plain TF-IDF) evaluation

In [3]:
# Score the baseline TF-IDF summaries against the reference abstracts.
(
    rouge_1_scores_tfidf,
    rouge_2_scores_tfidf,
    rouge_l_scores_tfidf,
) = evaluate_summaries_using_rouge('tfidf_2')

# Persist each score matrix so later analysis does not need to re-run the evaluation.
for npy_path, score_matrix in (
    ('rouge_1_tfidf.npy', rouge_1_scores_tfidf),
    ('rouge_2_tfidf.npy', rouge_2_scores_tfidf),
    ('rouge_l_tfidf.npy', rouge_l_scores_tfidf),
):
    np.save(npy_path, score_matrix)

10148it [07:28, 22.62it/s]

Average rouge-1 score for tfidf experiment: 0.18711283359480235, std:0.08476868473976243
Average rouge-2 score for tfidf experiment: 0.0325325831507373, std:0.025866923845545307
Average rouge-l score for tfidf experiment: 0.09540082029241616, std:0.04058176539174261





In [48]:
# Mean of column 0 of the baseline ROUGE-L matrix (precision, given
# rouge_score's Score(precision, recall, fmeasure) field order).
rouge_l_scores_tfidf[:, 0].mean()

0.07668224193579935

## TF-IDF + WordNet evaluation

In [4]:
# Score the TF-IDF + WordNet summaries against the reference abstracts.
(
    rouge_1_scores_tfidf_wordnet,
    rouge_2_scores_tfidf_wordnet,
    rouge_l_scores_tfidf_wordnet,
) = evaluate_summaries_using_rouge('tfidf_wordnet_2')

10148it [07:18, 23.13it/s]

Average rouge-1 score for tfidf_wordnet experiment: 0.18805354329981144, std:0.08556898434223188
Average rouge-2 score for tfidf_wordnet experiment: 0.032891348870105565, std:0.02605473677146605
Average rouge-l score for tfidf_wordnet experiment: 0.09567986842854938, std:0.04074743424021038





In [6]:
# Persist the TF-IDF + WordNet score matrices, one .npy file per ROUGE variant.
for npy_path, score_matrix in (
    ('rouge_1_tfidf_wordnet.npy', rouge_1_scores_tfidf_wordnet),
    ('rouge_2_tfidf_wordnet.npy', rouge_2_scores_tfidf_wordnet),
    ('rouge_l_tfidf_wordnet.npy', rouge_l_scores_tfidf_wordnet),
):
    np.save(npy_path, score_matrix)

In [51]:
# Mean and standard deviation of the ROUGE-L F-measure column (index 2)
# for the TF-IDF + WordNet experiment.
print(rouge_l_scores_tfidf_wordnet[:, 2].mean())
print(rouge_l_scores_tfidf_wordnet[:, 2].std())

0.09567986842854938
0.04074743424021038


## WordNet Lesk evaluation

In [5]:
# Score the summaries of the 'wordnet' experiment against the reference abstracts.
(
    rouge_1_scores_wordnet,
    rouge_2_scores_wordnet,
    rouge_l_scores_wordnet,
) = evaluate_summaries_using_rouge('wordnet')

10148it [08:00, 21.13it/s]

Average rouge-1 score for wordnet experiment: 0.2195510926243785, std:0.10520157981929261
Average rouge-2 score for wordnet experiment: 0.05392322966068278, std:0.04270303097268596
Average rouge-l score for wordnet experiment: 0.10904514059906768, std:0.04899882559568424





In [7]:
# Persist the 'wordnet' experiment score matrices under the *_lesk.npy filenames.
for npy_path, score_matrix in (
    ('rouge_1_lesk.npy', rouge_1_scores_wordnet),
    ('rouge_2_lesk.npy', rouge_2_scores_wordnet),
    ('rouge_l_lesk.npy', rouge_l_scores_wordnet),
):
    np.save(npy_path, score_matrix)

In [60]:
# Mean and standard deviation of the ROUGE-L F-measure column (index 2)
# for the 'wordnet' experiment.
print(rouge_l_scores_wordnet[:, 2].mean())
print(rouge_l_scores_wordnet[:, 2].std())

0.10904514059906768
0.04899882559568424
