In [1]:
# Goal: recreate the evaluation setup of Lebanoff et al. (2018).
# Reference implementation this notebook is adapted from:
# https://github.com/ucfnlp/multidoc_summarization/blob/ae30c9ee039d4ad5ff64fd2245faafc5a62c4dd7/decode.py

# Prerequisite: install pyrouge. If the installation fails on Ubuntu, see:
# https://stackoverflow.com/questions/45894212/installing-pyrouge-gets-error-in-ubuntu

In [2]:
import pyrouge
from pathlib import Path
import json
import os
import logging

import spacy
nlp = spacy.load("en_core_web_sm")


In [3]:
# pass over files and format output for pyrouge, using output heuristics from lebanoff et al

def write_for_rouge(all_reference_sents, decoded_words, ex_index):
    """Write one example's references and system output as pyrouge input files.

    Relies on the notebook-level names ``nlp`` (spaCy pipeline),
    ``make_html_safe``, ``rouge_dec_dir`` and ``rouge_ref_dir``.

    Files written (each holds a single lower-cased line):
        * ``<rouge_dec_dir>/<ex_index:06d>_decoded.txt``
        * ``<rouge_ref_dir>/<ex_index:06d>_reference.<letter>.txt`` -- one per
          reference, lettered A, B, C, ... in input order.

    Args:
        all_reference_sents: list of strings, one raw reference abstract per
            entry. Each abstract is sentence-split and tokenized with spaCy
            here.
        decoded_words: list of strings, the already-tokenized system output.
        ex_index: int, the index with which to label the files.
    """
    # TODO: we need to tokenize hypothesis to match their logic

    # First, divide decoded output into sentences: a sentence ends at (and
    # includes) each "." token. Walk an offset through the list instead of
    # re-slicing the remainder on every iteration.
    decoded_sents = []
    num_words = len(decoded_words)
    start = 0
    while start < num_words:
        try:
            period_idx = decoded_words.index(".", start)
        except ValueError:  # there is text remaining that doesn't end in "."
            period_idx = num_words - 1
        decoded_sents.append(' '.join(decoded_words[start:period_idx + 1]))
        start = period_idx + 1

    # pyrouge calls a perl script that puts the data into HTML files.
    # Therefore we need to make our output HTML safe.
    decoded_sents = [make_html_safe(sent) for sent in decoded_sents]
    # Note the sentence splitting here: references are split and tokenized by
    # spaCy, and tokens are re-joined with single spaces.
    reference_sents = [
        [make_html_safe(' '.join(str(tok) for tok in sent)) for sent in nlp(abstract).sents]
        for abstract in all_reference_sents
    ]

    # Write to file. UTF-8 is set explicitly so the output does not depend on
    # the platform's default locale encoding.
    for ref_idx, ref_sents in enumerate(reference_sents):
        ref_file = os.path.join(rouge_ref_dir, "%06d_reference.%s.txt" % (
            ex_index, chr(ord('A') + ref_idx)))
        with open(ref_file, "w", encoding="utf-8") as f:
            f.write(' '.join(ref_sents).lower() + '\n')

    decoded_file = os.path.join(rouge_dec_dir, "%06d_decoded.txt" % ex_index)
    with open(decoded_file, "w", encoding="utf-8") as f:
        f.write(' '.join(decoded_sents).lower() + '\n')


In [4]:

def make_html_safe(s):
    """Return a copy of ``s`` with angled brackets replaced by HTML entities.

    pyrouge wraps the summaries in HTML before scoring, so literal ``<`` and
    ``>`` in the text must be escaped.

    Args:
        s: str, the text to escape.

    Returns:
        str, ``s`` with ``<`` replaced by ``&lt;`` and ``>`` by ``&gt;``.
    """
    # str.replace returns a new string (strings are immutable), so the result
    # must be returned rather than discarded.
    return s.replace("<", "&lt;").replace(">", "&gt;")


def rouge_eval(ref_dir, dec_dir):
    """Evaluate the files in ref_dir and dec_dir with pyrouge, returning results_dict.

    Args:
        ref_dir: directory holding the reference ("model") summaries, named
            ``<id>_reference.<LETTER>.txt``.
        dec_dir: directory holding the system summaries, named
            ``<id>_decoded.txt``.

    Returns:
        dict, the parsed ROUGE output as produced by
        ``pyrouge.Rouge155.output_to_dict``.
    """
    rouge = pyrouge.Rouge155()
#   rouge.model_filename_pattern = '#ID#_reference.txt'
    rouge.model_filename_pattern = '#ID#_reference.[A-Z].txt'
    # Raw string: '\d' is an invalid escape sequence in a normal string literal.
    rouge.system_filename_pattern = r'(\d+)_decoded.txt'
    rouge.model_dir = ref_dir
    rouge.system_dir = dec_dir
    logging.getLogger('global').setLevel(logging.WARNING) # silence pyrouge logging
    # NOTE(review): _data_dir is a private pyrouge attribute; there is no
    # public accessor used here, so this may break across pyrouge versions.
    rouge_args = ['-e', rouge._data_dir,
         '-c',
         '95',
         '-2', '4',        # This is the only one we changed (changed the max skip from -1 to 4)
         '-U',
         '-r', '1000',
         '-n', '4',
         '-w', '1.2',
         '-a',
         '-l', '100']      # score only the first 100 words of each system summary
    rouge_args = ' '.join(rouge_args)
    rouge_results = rouge.convert_and_evaluate(rouge_args=rouge_args)
    return rouge.output_to_dict(rouge_results)


# def rouge_log(results_dict, dir_to_write):
def rouge_log(results_dict):
    """Format ROUGE results as a report, log it at INFO level, and return it.

    The report covers ROUGE-1, -2, -l, -s4 and -su4; for each it lists the
    f_score, recall and precision together with the ``_cb``/``_ce``
    confidence-interval bounds stored under the same key prefix.

    Args:
        results_dict: the dictionary returned by pyrouge

    Returns:
        str, the formatted report (nothing is written to disk).
    """
    entry_template = "%s: %.4f with confidence interval (%.4f, %.4f)\n"
    pieces = []
    for metric in ("1", "2", "l", "s4", "su4"):
        pieces.append("\nROUGE-%s:\n" % metric)
        for stat in ("f_score", "recall", "precision"):
            key = "rouge_%s_%s" % (metric, stat)
            pieces.append(entry_template % (
                key,
                results_dict[key],
                results_dict[key + "_cb"],
                results_dict[key + "_ce"],
            ))
    log_str = "".join(pieces)
    logging.info(log_str) # log to screen
    return log_str
    


In [5]:
# Working directory for the files handed to pyrouge.
TEMP_EVAL_DIR = Path('rouge_evaluation_tempdir')
os.makedirs(TEMP_EVAL_DIR, exist_ok=True)

# GLOBALS (read by write_for_rouge)
rouge_dec_dir = TEMP_EVAL_DIR / 'rouge_dec_dir'  
rouge_ref_dir = TEMP_EVAL_DIR / 'rouge_ref_dir'
# END GLOBALS

# NOTE(review): absolute, machine-specific paths; consider a configurable DATA_DIR.
evaluation_dataset = '/home/chris/projects/aylien/dynamic-ensembles/data/DUC2004/DUC2004_test.jsonl'
system_hypotheses = '/home/chris/projects/aylien/dynamic-ensembles/data/DUC2004/system_hypotheses/eval_predicted_summaries.out'

rouge_dec_dir.mkdir(parents=True, exist_ok=True)
rouge_ref_dir.mkdir(parents=True, exist_ok=True)

# TODO: rm tempdir after eval


# Read each input file once, inside a context manager so the handle is closed.
with open(evaluation_dataset) as dataset_file:
    # One JSON object per line; skip blank lines (e.g. a trailing newline).
    dataset_rows = [json.loads(line) for line in dataset_file if line.strip()]

with open(system_hypotheses) as hyp_file:
    # One hypothesis per line. Blank lines are kept on purpose so that the
    # hypotheses stay aligned with the dataset rows.
    orig_system_hyps = [line.strip() for line in hyp_file]

# spaCy-tokenized version of every hypothesis, as a list of token strings.
system_hyp_tokens = [[str(t) for t in nlp(hyp)] for hyp in orig_system_hyps]

In [6]:
# Sanity check: the dataset and the hypotheses are paired by position below,
# so the two counts printed here should match.
print(f'rows in dataset: {len(dataset_rows)}, rows in system hyps: {len(system_hyp_tokens)}')

rows in dataset: 50, rows in system hyps: 50


In [7]:
# Number of tokenized system hypotheses.
len(system_hyp_tokens)

50

In [14]:
# First reference summary and raw hypothesis per example, collected for the
# comparison against our own ROUGE implementation.
all_summaries = []
all_hyps = []

# zip() stops silently at the shortest input; fail loudly instead of scoring
# misaligned or truncated data.
if not (len(dataset_rows) == len(system_hyp_tokens) == len(orig_system_hyps)):
    raise ValueError(
        'dataset rows and system hypotheses differ in length: '
        f'{len(dataset_rows)} rows, {len(system_hyp_tokens)} tokenized hyps, '
        f'{len(orig_system_hyps)} raw hyps'
    )

# write the rouge files
# Loop variables are deliberately not named `h`/`r`: those names hold the
# hypothesis/reference lists in another cell and must not be clobbered.
for idx, (row, hyp_tokens, orig_hyp) in enumerate(zip(dataset_rows, system_hyp_tokens, orig_system_hyps)):
    # A row holds either a single summary or a list of reference summaries.
    if isinstance(row['summary'], list):
        summaries = row['summary']
    else:
        summaries = [row['summary']]
    write_for_rouge(summaries, hyp_tokens, idx)
    all_summaries.append(summaries[0])
    all_hyps.append(orig_hyp)


log_report = rouge_log(rouge_eval(rouge_ref_dir, rouge_dec_dir))
print(log_report)
    


ROUGE-1:
rouge_1_f_score: 0.3321 with confidence interval (0.3200, 0.3442)
rouge_1_recall: 0.2676 with confidence interval (0.2571, 0.2784)
rouge_1_precision: 0.4409 with confidence interval (0.4241, 0.4584)

ROUGE-2:
rouge_2_f_score: 0.0806 with confidence interval (0.0721, 0.0904)
rouge_2_recall: 0.0649 with confidence interval (0.0577, 0.0727)
rouge_2_precision: 0.1074 with confidence interval (0.0961, 0.1201)

ROUGE-l:
rouge_l_f_score: 0.1911 with confidence interval (0.1837, 0.1997)
rouge_l_recall: 0.1539 with confidence interval (0.1480, 0.1608)
rouge_l_precision: 0.2538 with confidence interval (0.2427, 0.2653)

ROUGE-s4:
rouge_s4_f_score: 0.0698 with confidence interval (0.0639, 0.0763)
rouge_s4_recall: 0.0558 with confidence interval (0.0511, 0.0609)
rouge_s4_precision: 0.0939 with confidence interval (0.0859, 0.1027)

ROUGE-su4:
rouge_su4_f_score: 0.1147 with confidence interval (0.1080, 0.1221)
rouge_su4_recall: 0.0918 with confidence interval (0.0862, 0.0977)
rouge_su4_pre

In [9]:
# Inspect one raw dataset row (the second example) to check its structure.
dataset_rows[1]

{'articles': [{'title': '',
  {'title': '',
   'text': "Hurricane Mitch paused in its whirl through the western Caribbean on Wednesday to punish Honduras with 120-mph (205-kph) winds, topping trees, sweeping away bridges, flooding neighborhoods and killing at least 32 people. Mitch was drifting west at only 2 mph (3 kph) over the Bay Islands, Honduras' most popular tourist area. It also was only 30 miles (50 kms) off the coast, and hurricane-force winds stretched outward 105 miles (165 kms); tropical storm-force winds 175 miles (280 kms). That meant the Honduran coast had been under hurricane conditions for more than a day. ``The hurricane has destroyed almost everything,'' said Mike Brown, a resident of Guanaja Island which was within miles (kms) of the eye of the hurricane. ``Few houses have remained standing.'' At its, 4th graf pvs"},
  {'title': '',
   'text': "Hurricane Mitch cut through the Honduran coast like a ripsaw Thursday, its devastating winds whirling for a third day thro

In [10]:
from transformer_decoding.evaluate import evaluate_rouge, print_mean



In [11]:
# First reference summary of the first example.
all_summaries[0]

"Prospects were dim for resolution of the political crisis in Cambodia in October 1998. Prime Minister Hun Sen insisted that talks take place in Cambodia while opposition leaders Ranariddh and Sam Rainsy, fearing arrest at home, wanted them abroad. King Sihanouk declined to chair talks in either place. A U.S. House resolution criticized Hun Sen's regime while the opposition tried to cut off his access to loans. But in November the King announced a coalition government with Hun Sen heading the executive and Ranariddh leading the parliament. Left out, Sam Rainsy sought the King's assurance of Hun Sen's promise of safety and freedom for all politicians."

In [12]:
# Our implementation (transformer_decoding.evaluate), for comparison with the
# pyrouge report. Note that all_summaries holds only the first reference of
# each example, whereas the pyrouge run is given every available reference.
print_mean(*evaluate_rouge(all_hyps, all_summaries))

rouge-1 p: 0.455 r: 0.232 f: 0.307
rouge-2 p: 0.125 r: 0.063 f: 0.083
rouge-l p: 0.262 r: 0.133 f: 0.176


In [13]:
# Same comparison, but reading hypotheses and gold summaries straight from the
# files on disk (presumably one summary per line, aligned by position -- verify).
# Context managers make sure both file handles are closed.
with open('/home/chris/projects/aylien/dynamic-ensembles/data/DUC2004/system_hypotheses/eval_predicted_summaries.out') as hyp_file:
    h = [line.strip() for line in hyp_file]
with open('/home/chris/projects/aylien/dynamic-ensembles/data/DUC2004/system_hypotheses/eval_gold_summaries.out') as gold_file:
    r = [line.strip() for line in gold_file]
print_mean(*evaluate_rouge(h, r))

rouge-1 p: 0.455 r: 0.232 f: 0.307
rouge-2 p: 0.125 r: 0.063 f: 0.083
rouge-l p: 0.262 r: 0.133 f: 0.176


In [None]:
# Does the first hypothesis read from file match the one collected in the loop?
h[0] == all_hyps[0]

In [None]:
# Does the first gold summary read from file match the dataset's first reference?
r[0] == all_summaries[0]

In [None]:
# Compare every gold summary from file with the first reference taken from the
# dataset, printing True/False per example.
for r1, r2 in zip(r, all_summaries):
    print(r1 == r2)

In [None]:
# Inspect the first raw (untokenized) system hypothesis.
all_hyps[0]

In [None]:
# NOTE(review): same expression as the preceding cell; likely a leftover duplicate.
all_hyps[0]

In [None]:
# Re-display the pyrouge report computed earlier in the notebook.
print(log_report)