# Semantic and Synthetic Evaluation Metrics
  - BERTScore
  - BELURT
  - COMET
  - LENS (only English)
  - BARTScore (only English)
  - BELU
  - ROUGE
  - SARI

In [1]:
import pandas as pd
import tensorflow as tf
import warnings
import re
import numpy as np
warnings.filterwarnings('ignore')

2024-05-13 21:17:51.239528: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-13 21:17:52.318267: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-13 21:17:52.324569: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv('standard_examples.csv', encoding= 'utf-8')


In [51]:
original_text= df['OriginalText'].to_list()
simple_user= df['SimpleTextUserEdited'].to_list()
model_gent1= df['model_gen1'].to_list()
model_gent2= df['model_gen2'].to_list()

print('number of samples:', len(original_text))

number of samples: 33


In [4]:
idx= 15
source= re.sub('\n', ' ',original_text[idx].rstrip())
user_simplified= re.sub('\n', ' ', simple_user[idx].rstrip())
model_gen1= re.sub('\n', ' ', model_gent1[idx].rstrip())
model_gen2= re.sub('\n', ' ', model_gent2[idx].rstrip())

In [53]:
! pip install --upgrade pip
! pip install evaluate
! pip install rouge_score
! pip install git+https://github.com/Unbabel/COMET.git

[0mCollecting git+https://github.com/Unbabel/COMET.git
  Cloning https://github.com/Unbabel/COMET.git to /tmp/pip-req-build-m9nn2o9l
  Running command git clone --filter=blob:none --quiet https://github.com/Unbabel/COMET.git /tmp/pip-req-build-m9nn2o9l
  Resolved https://github.com/Unbabel/COMET.git to commit fd3c2d9f72b69ed9035cf778f76721f6996efb35
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0m

In [54]:
from comet import download_model, load_from_checkpoint

model_path = download_model("Unbabel/wmt22-comet-da")
model = load_from_checkpoint(model_path)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`


# COMET

In [55]:
data = [
    { "src": source,
        "mt": model_gen1,
        "ref": user_simplified }
]

model_output = model.predict(data, batch_size=8, gpus=1)
print('\n')
print('COMET score:', model_output.scores) # sentence-level scores
#print(model_output)
#print(model_output.system_score) # system-level score

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it]




COMET score: [0.7249844670295715]


In [56]:
data = [
    { "src": source,
        "mt": model_gen2,
        "ref":  user_simplified}
]

model_output = model.predict(data, batch_size=8, gpus=1)
print('\n')
print('COMET score:', model_output.scores) # sentence-level scores

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 1/1 [00:03<00:00,  3.81s/it]




COMET score: [0.7623525857925415]


# BERTScore

In [57]:
import evaluate
from evaluate import load

origin= original_text[idx].splitlines()
ref = simple_user[idx].splitlines()
pred1= model_gent1[idx].splitlines()
pred2= model_gent2[idx].splitlines()

bertscore = load("bertscore")
references = ref
predictions = pred1
results = bertscore.compute(predictions=predictions, references=references, lang="de")

print("BERTscore precision:", round( np.mean(results['precision']), 3))
print("BERTscore recall:", round(np.mean(results['recall']), 3))
print("BERTscore f1:", round( np.mean(results['f1']),3))

BERTscore precision: 0.854
BERTscore recall: 0.828
BERTscore f1: 0.841


In [58]:
from evaluate import load

references = ref
predictions = pred2
results = bertscore.compute(predictions=predictions, references=references, lang="de")
print("BERTscore precision:", round( np.mean(results['precision']), 3))
print("BERTscore recall:", round(np.mean(results['recall']), 3))
print("BERTscore f1:", round( np.mean(results['f1']),3))

BERTscore precision: 0.873
BERTscore recall: 0.852
BERTscore f1: 0.862


# BLEURT

In [59]:
references = ref
predictions = pred1

bleurt = evaluate.load("bleurt", module_type="metric")
results = bleurt.compute(predictions=predictions, references=references)
print("BLEURT score:", np.mean(results['scores']))



BLEURT score: 0.029382428154349327


In [60]:
references = ref
predictions = pred2

bleurt = evaluate.load("bleurt", module_type="metric")
results = bleurt.compute(predictions=predictions, references=references)
print("BLEURT score:", np.mean(results['scores']))




BLEURT score: -0.03498249500989914


# SacreBLEU

In [61]:
references = ref
predictions = pred1

sacrebleu = evaluate.load("sacrebleu")
results = sacrebleu.compute(predictions=predictions,
                             references=references)

print("sacrebleu score" ,round(results["score"], 4))

sacrebleu score 8.6166


In [62]:
references = ref
predictions = pred2

sacrebleu = evaluate.load("sacrebleu")
results = sacrebleu.compute(predictions=predictions,
                             references=references)
#print(list(results.keys()))

print("sacrebleu score" , round(results["score"], 4))

sacrebleu score 8.6968


#BLEU

In [63]:
references = [user_simplified]
predictions = [model_gen1]

bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references)
print(results)

{'bleu': 0.0, 'precisions': [0.38461538461538464, 0.16666666666666666, 0.0, 0.0], 'brevity_penalty': 0.7939226578179512, 'length_ratio': 0.8125, 'translation_length': 13, 'reference_length': 16}


In [64]:
references = [user_simplified]
predictions = [model_gen2]

bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references)
print(results)

{'bleu': 0.0, 'precisions': [0.4166666666666667, 0.18181818181818182, 0.0, 0.0], 'brevity_penalty': 0.7165313105737893, 'length_ratio': 0.75, 'translation_length': 12, 'reference_length': 16}


# ROUGE

In [65]:
rouge = evaluate.load('rouge')

references = [user_simplified]
predictions = [model_gen1]

results = rouge.compute(predictions=predictions,references=references)
print(results)


{'rouge1': 0.5161290322580646, 'rouge2': 0.3448275862068966, 'rougeL': 0.5161290322580646, 'rougeLsum': 0.5161290322580646}


In [66]:
rouge = evaluate.load('rouge')

references = [user_simplified]
predictions = [model_gen2]

results = rouge.compute(predictions=predictions,references=references)
print(results)

{'rouge1': 0.5333333333333333, 'rouge2': 0.35714285714285715, 'rougeL': 0.5333333333333333, 'rougeLsum': 0.5333333333333333}


# SARI

In [67]:
! pip install sacremoses

[0m

In [68]:
from evaluate import load
sari = load("sari")
sources=[source]
predictions=[model_gen1]
references=[[user_simplified]]
sari_score = sari.compute(sources=sources, predictions=predictions, references=references)
print(sari_score)

{'sari': 63.97727272727273}


In [69]:
from evaluate import load
sari = load("sari")
sources=[source]
predictions=[model_gen2]
references=[[user_simplified]]
sari_score = sari.compute(sources=sources, predictions=predictions, references=references)
print(sari_score)

{'sari': 64.01334776334777}
