In [2]:
import pandas as pd
import numpy as np
import scipy
from tqdm import tqdm

# from bert_score import BERTScorer

from easse.bleu import corpus_bleu
from easse.fkgl import corpus_fkgl
from easse.samsa import get_samsa_sentence_scores
from easse.sari import corpus_sari
from easse.bertscore import corpus_bertscore

from utils import (
    read_test_set,
    collect_references,
    sigmoid,
)

## Experimental Setting

Read the datasets with original sentences and references

In [3]:
# Each test set yields its original sentences together with their references.
asset_orig, asset_refs = read_test_set("asset_test", as_lists=True)
turk_orig, turk_refs = read_test_set("turkcorpus_test", as_lists=True)
hsplit_orig, hsplit_refs = read_test_set("hsplit_test", as_lists=True)

# Pooled "all" setting: the ASSET originals paired with the references of the
# three test sets concatenated in the order ASSET, TurkCorpus, HSplit.
all_orig = asset_orig
all_refs = asset_refs + turk_refs + hsplit_refs

# test-set name -> (original sentences, references, number of references)
EVAL_DATASETS = dict(
    asset=(asset_orig, asset_refs, 10),
    turk=(turk_orig, turk_refs, 8),
    hsplit=(hsplit_orig, hsplit_refs, 4),
    all=(all_orig, all_refs, 10 + 8 + 4),  # 22 references once pooled
)

Same pre-processing parameters for all metrics

In [4]:
# Pre-processing options shared by the metric calls in this notebook.
lowercase = False  # do not lowercase the text, i.e. scoring is presumably case-sensitive (confirm against EASSE's `lowercase` flag)
tokenizer = "moses"

## Compute metrics for the Simplicity-DA dataset

In [5]:
# Load the Simplicity-DA data. Later cells read its `sent_id`, `sys_name`,
# `orig_sent` and `simp_sent` columns.
df_simplicityDA = pd.read_csv("../data/simplicity_DA.csv")

In [6]:
# Sanity check: non-null count of every column, grouped by system.
df_simplicityDA.groupby("sys_name").count()  # the saved output shows 100 sentences for each system

Unnamed: 0_level_0,sent_id,orig_sent,simp_sent,sys_type,fluency,fluency_zscore,meaning,meaning_zscore,simplicity,simplicity_zscore
sys_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ACCESS,100,100,100,100,100,100,100,100,100,100
DMASS-DCSS,100,100,100,100,100,100,100,100,100,100
Dress-Ls,100,100,100,100,100,100,100,100,100,100
Hybrid,100,100,100,100,100,100,100,100,100,100
PBMT-R,100,100,100,100,100,100,100,100,100,100
SBMT-SARI,100,100,100,100,100,100,100,100,100,100


In [7]:
# bertscore_rescale = BERTScorer(lang="en", rescale_with_baseline=True)

In [8]:
# Weight of the BLEU-vs-references term in iBLEU; the BLEU-vs-original term
# is weighted by (1 - IBLEU_ALPHA).
IBLEU_ALPHA = 0.9

# One record per (sentence, test set) pair, in iteration order: for every row
# of df_simplicityDA, one entry for each key of EVAL_DATASETS.
metrics = []

# iterrows() is a generator without a length, so `total` is passed explicitly
# to let tqdm display a percentage and an ETA instead of a bare counter.
for _, row in tqdm(df_simplicityDA.iterrows(), total=len(df_simplicityDA)):
    sys_sents = [row["simp_sent"]]

    # FKGL of the system output does not depend on the test set, so it is
    # computed once per row rather than once per (row, test set) pair.
    fkgl_sys = corpus_fkgl(sys_sents, tokenizer=tokenizer)

    for test_set, (test_set_orig, test_set_refs, num_refs) in EVAL_DATASETS.items():
        # Original sentence and references for this sentence id in this test set.
        orig_sents, ref_sents = collect_references(
            [row["sent_id"]], test_set_orig, test_set_refs, num_refs
        )

        # BLEU of the system output against the references
        bleu_sys_refs = corpus_bleu(
            sys_sents,
            ref_sents,
            smooth_method="floor",
            tokenizer=tokenizer,
            lowercase=lowercase,
            effective_order=True,
        )

        # SARI
        sari_score = corpus_sari(
            orig_sents,
            sys_sents,
            ref_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
            use_f1_for_deletion=False,
        )

        # iBLEU: reference BLEU penalised by BLEU against the original sentence.
        # NOTE(review): unlike the reference BLEU above, this call does not set
        # smooth_method="floor" / effective_order=True and passes force=True;
        # confirm that the differing settings are intentional.
        bleu_sys_orig = corpus_bleu(
            sys_sents,
            [orig_sents],
            force=True,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )
        ibleu_score = IBLEU_ALPHA * bleu_sys_refs - (1 - IBLEU_ALPHA) * bleu_sys_orig

        # Arithmetic and geometric means of BLEU and SARI
        amean_bleu_sari = np.mean([bleu_sys_refs, sari_score])
        # A score of exactly 0 makes gmean take log(0); the result is still the
        # expected 0.0, so only the divide-by-zero RuntimeWarning is silenced.
        with np.errstate(divide="ignore"):
            gmean_bleu_sari = scipy.stats.gmean([bleu_sys_refs, sari_score])

        # FKBLEU: iBLEU scaled by sigmoid(FKGL(system) - FKGL(original))
        fkgl_orig = corpus_fkgl(orig_sents, tokenizer=tokenizer)
        fk_diff = sigmoid(fkgl_sys - fkgl_orig)
        fkbleu_score = ibleu_score * fk_diff

        # BERTScore (currently disabled)
        # ref_sents = [ref for [ref] in ref_sents]
        # bertscore_rescale_scores = bertscore_rescale.score([row["simp_sent"]], [ref_sents])
        # bertscores = corpus_bertscore([
        #     row["simp_sent"]],
        #     ref_sents,
        #     tokenizer=tokenizer,
        #     lowercase=lowercase
        # )

        metrics.append(
            {
                "sent_id": row["sent_id"],
                "sys_name": row["sys_name"],
                "test_set": test_set,
                "bleu": bleu_sys_refs,
                "sari": sari_score,
                "ibleu": ibleu_score,
                "amean_bleu_sari": amean_bleu_sari,
                "gmean_bleu_sari": gmean_bleu_sari,
                "fkgl": fkgl_sys,
                "fkbleu": fkbleu_score,
                # "bertscore_P": bertscore_rescale_scores[0].cpu().item(),
                # "bertscore_R": bertscore_rescale_scores[1].cpu().item(),
                # "bertscore_F1": bertscore_rescale_scores[2].cpu().item(),
                # "bertscore_P": bertscores[0],
                # "bertscore_R": bertscores[1],
                # "bertscore_F1": bertscores[2],
            }
        )

df_metrics_segment = pd.DataFrame(metrics)

  log_a = np.log(np.array(a, dtype=dtype))
600it [02:38,  3.78it/s]


### Compute SAMSA

In [12]:
# SAMSA is reference-less: it is computed once per (original, simplified) pair.
samsa_scores = get_samsa_sentence_scores(
    df_simplicityDA["orig_sent"],
    df_simplicityDA["simp_sent"],
    lowercase=lowercase,
    tokenizer=tokenizer,
)

# df_metrics_segment holds len(EVAL_DATASETS) consecutive rows per sentence
# (one per test set). Each SAMSA score is repeated that many times only so
# that it can sit in the same dataframe as the reference-based metrics.
df_metrics_segment["samsa"] = np.repeat(samsa_scores, len(EVAL_DATASETS))

Loading spaCy model 'en_core_web_md'... [38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


OSError: Failed to get spaCy model. Download it manually using `python -m spacy download en_core_web_md`.

### Compute Additional Averages

In [None]:
# Metric combinations that are averaged together with SAMSA.
samsa_combinations = (
    ("bleu", "samsa"),
    ("sari", "samsa"),
    ("bleu", "sari", "samsa"),
)

# Row-wise arithmetic means first, then geometric means, so the new columns
# are appended in the order amean_* followed by gmean_*.
for prefix, aggregate in (("amean", np.mean), ("gmean", scipy.stats.gmean)):
    for combination in samsa_combinations:
        column = prefix + "_" + "_".join(combination)
        df_metrics_segment[column] = aggregate(
            df_metrics_segment[list(combination)], axis=1
        )