In [1]:
# Pinned to the version this notebook was originally run with (rouge 1.0.1) so the
# reported ROUGE-L numbers stay reproducible; %pip installs into the kernel's environment.
%pip install rouge==1.0.1

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [2]:
import argparse
import json
import subprocess
import tqdm
import random
import string
import re
import numpy as np
from collections import Counter
from rouge import Rouge

In [3]:
# Mount Google Drive for running on Colab
# NOTE(review): google.colab is only importable inside a Colab runtime; this cell
# will fail elsewhere.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Base directory that the later cells prepend to the "eval/*.json" file names.
# The trailing slash matters: file paths are built by plain string concatenation.
path = "/content/drive/My Drive/Senior Design/"

Mounted at /content/drive


In [4]:
# EVAL metrics and definitions from:
# https://github.com/facebookresearch/KILT/
def normalize_answer(s):
    """Canonicalise an answer string before token-level comparison.

    The steps are applied in this order:
      1. lowercase the text,
      2. drop every character listed in ``string.punctuation``,
      3. replace the standalone words "a", "an" and "the" with a space,
      4. collapse whitespace runs to single spaces and trim both ends.

    Args:
        s: The raw answer text (a ``str``).

    Returns:
        The normalised text as a single-space-separated string.
    """
    punctuation = set(string.punctuation)

    lowered = s.lower()
    without_punctuation = "".join(ch for ch in lowered if ch not in punctuation)
    # Articles are removed after punctuation so that e.g. "the," is also matched.
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation)
    return " ".join(without_articles.split())

# F1 score definition
def _f1_score(prediction, ground_truth):
    """Token-overlap F1 between a prediction and a reference answer.

    Both strings are passed through ``normalize_answer`` and split on whitespace;
    the overlap is the multiset intersection of the two token lists.

    Args:
        prediction: The generated answer text.
        ground_truth: The reference answer text.

    Returns:
        The harmonic mean of token precision and recall, or ``0`` when the two
        token lists share no token.
    """
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()

    # Counter intersection keeps, per token, the smaller of the two counts.
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if not overlap:
        return 0

    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)

# ROUGEL score definition
def _rougel_score(prediction, ground_truth):
    """ROUGE-L F-measure of a prediction against a reference answer.

    The strings are scored as given; unlike ``_f1_score`` no normalisation is
    applied first.

    Args:
        prediction: The generated answer text (the hypothesis).
        ground_truth: The reference answer text.

    Returns:
        The ``"f"`` value of the ``"rouge-l"`` entry from ``Rouge.get_scores``,
        or ``0.0`` when scoring raises ``ValueError``.
    """
    scorer = Rouge()
    try:
        result = scorer.get_scores(prediction, ground_truth, avg=True)
    except ValueError:
        # Raised for input the scorer rejects, e.g. "Hypothesis is empty."
        return 0.0
    else:
        return result["rouge-l"]["f"]

In [None]:
# Remove padding tokens.
def remove_padding(s):
  """Return the text that precedes the first "[SEP]" token.

  The input is split on whitespace, everything from the first token that is
  exactly "[SEP]" onwards is dropped, and the remaining tokens are re-joined
  with single spaces.

  Args:
    s: The reference text, possibly followed by "[SEP]" and padding tokens.

  Returns:
    The whitespace-normalised text before the first "[SEP]". If there is no
    "[SEP]" token, the whole whitespace-normalised text is returned; previously
    the function fell off the end and returned None, which made the scoring
    functions fail on such inputs.
  """
  tokens = s.split()
  if "[SEP]" in tokens:
    tokens = tokens[:tokens.index("[SEP]")]
  return " ".join(tokens)

In [7]:
# Parse eval/rt_generations_70.json (located under `path`) into aTA_gen_70.
# The name is first bound to an empty dict, so it exists even if the load below fails.
aTA_gen_70 = {}

with open(path + "eval/rt_generations_70.json", mode="r", encoding="utf-8") as f:
    aTA_gen_70 = json.loads(f.read())

In [8]:
# Parse eval/rt_generations_100.json (located under `path`) into aTA_gen_100.
# The name is first bound to an empty dict, so it exists even if the load below fails.
aTA_gen_100 = {}

with open(path + "eval/rt_generations_100.json", mode="r", encoding="utf-8") as f:
    aTA_gen_100 = json.loads(f.read())

In [9]:
# Parse eval/rt_generations_150.json (located under `path`) into aTA_gen_150.
# The name is first bound to an empty dict, so it exists even if the load below fails.
aTA_gen_150 = {}

with open(path + "eval/rt_generations_150.json", mode="r", encoding="utf-8") as f:
    aTA_gen_150 = json.loads(f.read())

# Number of entries in the 150 file; the evaluation cells use it as a denominator.
aTA_total_gen = len(aTA_gen_150)

In [10]:
# Parse eval/google_generations.json (located under `path`) into google_gen.
# The name is first bound to an empty dict, so it exists even if the load below fails.
google_gen = {}

with open(path + "eval/google_generations.json", mode="r", encoding="utf-8") as f:
    google_gen = json.loads(f.read())

# Number of entries in the file; used as the denominator when averaging scores.
google_total_gen = len(google_gen)

In [11]:
# Average F1 / ROUGE-L of the generations loaded from rt_generations_70.json.
aTA_gen = aTA_gen_70
total_f1 = 0
total_rougel = 0

for i in aTA_gen:
  generation = aTA_gen[i]['gen:']
  # NOTE(review): only '_' is replaced here, whereas the other evaluation cells
  # also replace '=' and '-' — confirm whether this difference is intentional.
  generation = generation.replace('_', ' ')
  actual = aTA_gen[i]['actual:']
  # The reference carries "[SEP]" + padding; keep only the text before it.
  actual = remove_padding(actual)
  total_f1 += _f1_score(generation, actual)
  total_rougel += _rougel_score(generation, actual)

# Average over the set that was actually scored. aTA_total_gen is
# len(aTA_gen_150) and is only correct here if both files have the same size.
num_generations = len(aTA_gen)
f1 = total_f1 / num_generations if num_generations else 0.0
rougel = total_rougel / num_generations if num_generations else 0.0

print(f"f1: {f1*100:.2f}\nrougel:{rougel*100:.2f}")

f1: 16.52
rougel:15.97


In [12]:
# Average F1 / ROUGE-L of the generations loaded from rt_generations_100.json.
aTA_gen = aTA_gen_100
total_f1 = 0
total_rougel = 0

for i in aTA_gen:
  generation = aTA_gen[i]['gen:']
  # Turn '_', '=' and '-' into spaces before scoring.
  generation = generation.replace('_', ' ')
  generation = generation.replace('=', ' ')
  generation = generation.replace('-', ' ')
  actual = aTA_gen[i]['actual:']
  # The reference carries "[SEP]" + padding; keep only the text before it.
  actual = remove_padding(actual)
  total_f1 += _f1_score(generation, actual)
  total_rougel += _rougel_score(generation, actual)

# Average over the set that was actually scored. aTA_total_gen is
# len(aTA_gen_150) and is only correct here if both files have the same size.
num_generations = len(aTA_gen)
f1 = total_f1 / num_generations if num_generations else 0.0
rougel = total_rougel / num_generations if num_generations else 0.0

print(f"f1: {f1*100:.2f}\nrougel:{rougel*100:.2f}")

f1: 17.53
rougel:16.37


In [13]:
# Average F1 / ROUGE-L of the generations loaded from rt_generations_150.json.
aTA_gen = aTA_gen_150
total_f1 = 0
total_rougel = 0

for i in aTA_gen:
  record = aTA_gen[i]
  # Turn '_', '=' and '-' into spaces before scoring.
  generation = record['gen:'].replace('_', ' ').replace('=', ' ').replace('-', ' ')
  # The reference carries "[SEP]" + padding; keep only the text before it.
  actual = remove_padding(record['actual:'])
  total_f1 += _f1_score(generation, actual)
  total_rougel += _rougel_score(generation, actual)

# aTA_total_gen was set to len(aTA_gen_150) when that file was loaded.
f1 = total_f1 / aTA_total_gen
rougel = total_rougel / aTA_total_gen

print(f"f1: {f1*100:.2f}\nrougel:{rougel*100:.2f}")

f1: 17.30
rougel:15.94


In [14]:
# Average F1 / ROUGE-L of the generations loaded from google_generations.json.
total_f1 = 0
total_rougel = 0

for i in google_gen:
  record = google_gen[i]
  # Turn '_', '=' and '-' into spaces before scoring.
  generation = record['gen:'].replace('_', ' ').replace('=', ' ').replace('-', ' ')

  # No padding in google gen actual, so the reference is used as stored.
  actual = record['actual:']
  total_f1 += _f1_score(generation, actual)
  total_rougel += _rougel_score(generation, actual)

# google_total_gen was set to len(google_gen) when the file was loaded.
f1 = total_f1 / google_total_gen
rougel = total_rougel / google_total_gen

print(f"f1: {f1*100:.2f}\nrougel:{rougel*100:.2f}")

f1: 19.16
rougel:16.50
