In [None]:
!pip install evaluate rouge_score bert_score

In [None]:
!pip install py-readability-metrics
!python -m nltk.downloader punkt

In [None]:
!pip install tabulate

In [None]:
!pip install -U 'spacy[cuda-autodetect]'

In [None]:
# Download spaCy's en_core_web_sm pipeline.
# NOTE(review): no visible cell loads en_core_web_sm - confirm it is still needed.
!python -m spacy download en_core_web_sm

In [None]:
# Download spaCy's en_core_web_lg pipeline, which the coherence pipeline loads with spacy.load.
!python -m spacy download en_core_web_lg

In [None]:
!pip install textdescriptives

In [2]:
from evaluate import load
import json
import math
from tabulate import tabulate

## Test metrics

In [3]:
# Generated summaries for the CNN sample from the "curie" run (a JSON list).
# The path has no placeholders, so it is a plain string rather than an f-string.
with open('results/cnn_curie.json') as f:
    cnn_pred = json.load(f)

In [13]:
# CNN sample records; later cells read the reference text from each record's 'summary' field.
# The path has no placeholders, so it is a plain string rather than an f-string.
with open('data/cnn_sample.json') as f:
    cnn_ref = json.load(f)

In [4]:
# First generated summary, as a sanity check on the loaded predictions.
cnn_pred[0]

'Photographer James Oatway captured the attack on Mozambican Emmanuel Sithole that left him dead in broad daylight. The attackers looked like hardened thugs, and wanted one thing and that was to kill Emmanuel. Oatway tried to get as close as possible, conscious that the attackers were aware of his presence, but they finally moved on and left Sithole alone. Oatway\'s series of images of the ordeal landed on the front page of South Africa\'s Sunday Times under the headline, "Kill thy neighbor: Alex attack brings home SA\'s shame." Seven people have been killed in the latest round of xenophobic violence against poorer immigrants, many from South Africa\'s neighbors. Xenophobic attacks: How did we get here?'

In [11]:
# Smoke-test the ROUGE metric on a toy pair of predictions and references.
rouge = load('rouge')

predictions = ["hello there", "general kenobi"]
references = ["hi there", "general Obii Van"]

results = rouge.compute(references=references, predictions=predictions)
print(results)

{'rouge1': 0.45, 'rouge2': 0.0, 'rougeL': 0.45, 'rougeLsum': 0.45}


In [12]:
# Smoke-test BERTScore: predictions and references are the same two strings.
bertscore = load("bertscore")

predictions = ["hello there", "general kenobi"]
references = list(predictions)

bertscore.compute(references=references, predictions=predictions, lang="en")

{'precision': [0.9999998807907104, 1.0],
 'recall': [0.9999998807907104, 1.0],
 'f1': [0.9999998807907104, 1.0],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.27.4)'}

In [15]:
%%time

bertscore.compute(predictions=[cnn_pred[0]], references=[cnn_ref[0]], lang="en")

CPU times: user 234 ms, sys: 0 ns, total: 234 ms
Wall time: 227 ms


{'precision': [0.7684018611907959],
 'recall': [0.8173803687095642],
 'f1': [0.792134702205658],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.27.4)'}

In [7]:
import spacy
import textdescriptives as td
# Blank English pipeline with only the readability component attached.
# NOTE(review): the "textdescriptives/readability" factory presumably becomes available
# through the `import textdescriptives` above - confirm before removing that import.
nlp_read = spacy.blank("en")
nlp_read.add_pipe("textdescriptives/readability")


<textdescriptives.components.readability.Readability at 0x7f6bceda5ac0>

In [10]:
# Load the en_core_web_lg pipeline and append the textdescriptives coherence component.
nlp_coh = spacy.load("en_core_web_lg")
nlp_coh.add_pipe("textdescriptives/coherence")
# Toy multi-sentence text to check that the component produces scores.
doc = nlp_coh("The world is changed. I feel it in the water. I feel it in the earth. I smell it in the air. Much that once was is lost, for none now live who remember it.")

# The coherence scores are exposed as a dict on the `._.coherence` extension attribute.
doc._.coherence



{'first_order_coherence': 0.7807351499795914,
 'second_order_coherence': 0.7494750618934631}

In [17]:
from transformers import pipeline

In [18]:
# Name the checkpoint and revision explicitly instead of relying on the library default,
# so the sentiment scores stay reproducible if that default changes.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)
data = ["I love you", "I hate you"]
sentiment_pipeline(data)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9998656511306763},
 {'label': 'NEGATIVE', 'score': 0.9991129040718079}]

In [26]:
# Sentiment of the first generated CNN summary; [0] takes the first entry of the pipeline's result.
sentiment_pipeline(cnn_pred[0])[0]

{'label': 'NEGATIVE', 'score': 0.9943841695785522}

In [24]:
# Sentiment labels for the 'summary' field of the first ten CNN sample records.
sentiment_pipeline([doc['summary'] for doc in cnn_ref[:10]])

[{'label': 'NEGATIVE', 'score': 0.9348874688148499},
 {'label': 'NEGATIVE', 'score': 0.9997125267982483},
 {'label': 'NEGATIVE', 'score': 0.9938111901283264},
 {'label': 'NEGATIVE', 'score': 0.9717812538146973},
 {'label': 'POSITIVE', 'score': 0.990036129951477},
 {'label': 'POSITIVE', 'score': 0.9938215017318726},
 {'label': 'POSITIVE', 'score': 0.9398972988128662},
 {'label': 'POSITIVE', 'score': 0.9674845933914185},
 {'label': 'NEGATIVE', 'score': 0.9985628724098206},
 {'label': 'NEGATIVE', 'score': 0.9922595620155334}]

## Run evaluation

In [27]:
def run_eval(dataset, experiment, metric):
    """Apply `metric` to one experiment's predictions and a dataset's references.

    Reads `data/{dataset}_sample.json`, a JSON list of records of which only the
    'summary' field is used, and `results/{experiment}.json`, whose content is
    passed through as loaded. Returns `metric(predictions, references)`.
    """
    reference_path = f'data/{dataset}_sample.json'
    prediction_path = f'results/{experiment}.json'

    with open(reference_path) as ref_file:
        samples = json.load(ref_file)
    references = []
    for sample in samples:
        references.append(sample['summary'])

    with open(prediction_path) as pred_file:
        predictions = json.load(pred_file)

    return metric(predictions, references)

In [None]:
from collections import defaultdict

# results[dataset][experiment][metric name] holds whatever the metric callable returned.
results = defaultdict(lambda: defaultdict(dict))

# Every scorer is called as scorer(predictions, references); readability, coherence
# and sentiment only look at the predictions.
metrics = {
    'rouge': lambda preds, refs: rouge.compute(predictions=preds, references=refs),
    'bertscore': lambda preds, refs: bertscore.compute(predictions=preds, references=refs, lang="en"),
    'readability': lambda preds, refs: [nlp_read(text)._.readability for text in preds],
    'coherence': lambda preds, refs: [nlp_coh(text)._.coherence for text in preds],
    'sentiment': lambda preds, refs: sentiment_pipeline(preds),
}

# Prediction files are named results/{dataset}_{exp}.json.
for dataset in ['cnn', 'xsum', 'newsroom']:
    for exp in ['eleuther1b', 'curie', 'style_curie', 'davinci', 'style_davinci', 'brio']:
        for m_name, metric in metrics.items():
            results[dataset][exp][m_name] = run_eval(dataset, f'{dataset}_{exp}', metric)

In [None]:
# Persist the raw metric outputs collected in `results`.
# NOTE(review): open() does not create directories, so eval/ must already exist.
with open('eval/eval.json', 'w') as f:
    json.dump(results, f)

In [68]:
# Read the saved metric outputs back from disk into `reload_results`.
with open('eval/eval.json') as f:
    reload_results = json.load(f)

In [72]:
# Collapse the raw metric outputs in `reload_results` into one number per sub-metric, in place:
#   readability / coherence: list of per-document dicts -> {submetric: mean of the non-NaN values}
#   bertscore: every list-valued entry -> its mean; the 'hashcode' entry is dropped
#   sentiment: list of {'label', 'score'} dicts -> {'positive_rate': share labelled 'POSITIVE'}
# Any other metric is left as loaded.
# Each branch only acts on entries that are still in their raw form, so running the cell
# again on already-aggregated data changes nothing instead of raising.
for dataset, dataset_dict in reload_results.items():
    for exp, exp_dict in dataset_dict.items():
        # Only existing keys of exp_dict are reassigned, which is safe while iterating it.
        for metric, metric_dict in exp_dict.items():
            if metric in ('readability', 'coherence'):
                if isinstance(metric_dict, list) and metric_dict:
                    # Transpose: list of {submetric: value} -> {submetric: [values]}.
                    metric_dict = {k: [dic[k] for dic in metric_dict] for k in metric_dict[0]}
                    for submetric, submetric_list in metric_dict.items():
                        not_nan_list = [e for e in submetric_list if not math.isnan(e)]
                        # With nothing left to average, the sub-metric is reported as 0.
                        metric_dict[submetric] = sum(not_nan_list) / len(not_nan_list) if not_nan_list else 0
                    exp_dict[metric] = metric_dict

            elif metric == 'bertscore':
                # pop() with a default tolerates an entry whose hashcode is already gone.
                metric_dict.pop('hashcode', None)
                for submetric, submetric_list in metric_dict.items():
                    if isinstance(submetric_list, list):
                        metric_dict[submetric] = sum(submetric_list) / len(submetric_list) if submetric_list else 0

            elif metric == 'sentiment':
                if isinstance(metric_dict, list) and metric_dict:
                    positive = sum(e['label'] == 'POSITIVE' for e in metric_dict)
                    exp_dict[metric] = {'positive_rate': positive / len(metric_dict)}

                

In [73]:
reload_results

{'cnn': {'eleuther1b': {'rouge': {'rouge1': 0.2648411725811938,
    'rouge2': 0.09381832710197169,
    'rougeL': 0.18282601747648564,
    'rougeLsum': 0.18325840373768898},
   'bertscore': {'precision': 0.8530396234989166,
    'recall': 0.8624498534202576,
    'f1': 0.8573568445444107},
   'readability': {'flesch_reading_ease': 70.36363053081172,
    'flesch_kincaid_grade': 8.882589694152609,
    'smog': 9.833532052208998,
    'gunning_fog': 11.778323965879077,
    'automated_readability_index': 11.438746698749439,
    'coleman_liau_index': 10.016228074391494,
    'lix': 44.222421996991024,
    'rix': 4.913555555555556},
   'coherence': {'first_order_coherence': 0.7992374857731618,
    'second_order_coherence': 0.7528428209088153},
   'sentiment': {'positive_rate': 0.42}},
  'curie': {'rouge': {'rouge1': 0.3208694843107881,
    'rouge2': 0.1265278551141452,
    'rougeL': 0.21450751041668298,
    'rougeLsum': 0.21543013315184642},
   'bertscore': {'precision': 0.8608401000499726,
    'r

In [74]:
# Persist the aggregated metrics held in `reload_results`.
# NOTE(review): open() does not create directories, so eval/ must already exist.
with open('eval/eval_agg.json', 'w') as f:
    json.dump(reload_results, f)

## Save results to tables

In [95]:
def save_table(dataset, header_exp='curie'):
    """Write the aggregated metrics of `dataset` to `eval/eval_table_{dataset}.html`.

    Reads the notebook-level `reload_results[dataset]`, a mapping of
    experiment -> metric -> submetric -> value. The table has one row per
    experiment and one column per submetric, preceded by a 'model' column.

    Args:
        dataset: key into `reload_results`; also used in the output file name.
        header_exp: experiment whose metrics define the columns and their order.
            If it is absent for this dataset, the first experiment is used.

    Each row is filled by looking the header's (metric, submetric) pairs up in
    that experiment, so values stay under the right column even when an
    experiment stores its metrics in a different order or lacks one of them
    (the cell is then left empty).
    """
    dataset_results = reload_results[dataset]

    header_source = dataset_results.get(header_exp)
    if header_source is None:
        header_source = next(iter(dataset_results.values()), {})

    # Column identity is the (metric, submetric) pair; only the submetric name is displayed.
    keys = [(metric, submetric)
            for metric, metric_dict in header_source.items()
            for submetric in metric_dict]
    cols = ['model'] + [submetric for _, submetric in keys]

    vals = [[exp] + [exp_dict.get(metric, {}).get(submetric) for metric, submetric in keys]
            for exp, exp_dict in dataset_results.items()]

    html = tabulate(vals, headers=cols, tablefmt="html")
    with open(f'eval/eval_table_{dataset}.html', 'w') as f:
        f.write(html)

In [96]:

# Write one HTML table per dataset.
for table_dataset in ('cnn', 'newsroom', 'xsum'):
    save_table(table_dataset)