### [Bleu. The Bilingual Evaluation Understudy](https://aclanthology.org/P02-1040.pdf)

https://huggingface.co/spaces/evaluate-metric/bleu

https://pypi.org/project/sacrebleu/2.3.1/

https://machinelearningmastery.com/calculate-bleu-score-for-text-python/

In [None]:
import evaluate
from nltk.translate.bleu_score import sentence_bleu

# Candidate translations to be scored.
candidate_1 = "It is a guide to action which ensures that the military always obeys the commands of the party."
candidate_2 = "It is to insure the troops forever hearing the activity guidebook that party direct."

# Reference translations that every candidate is compared against.
reference_1 = "It is a guide to action that ensures that the military will forever heed Party commands."
reference_2 = "It is the guiding principle which guarantees the military forces always being under the command of the Party."
reference_3 = "It is the practical guide for the army always to heed the directions of the party."

# Each compute() call below scores exactly one prediction, so `references`
# holds a single entry: the list of all references for that prediction.
references = [[reference_1, reference_2, reference_3]]
candidates = [candidate_1, candidate_2]

bleu = evaluate.load("bleu")
sacrebleu = evaluate.load("sacrebleu")

print("bleu")
for hypothesis in candidates:
    result = bleu.compute(predictions=[hypothesis], references=references)
    print(result['bleu'])

print("sacrebleu")
for hypothesis in candidates:
    result = sacrebleu.compute(predictions=[hypothesis], references=references)
    print(result['score'])

print("nltk bleu")
# sentence_bleu is called with whitespace-split token lists; the references
# are the same for every candidate, so they are tokenized once up front.
tokenized_references = [text.split() for text in references[0]]
for hypothesis in candidates:
    print(sentence_bleu(tokenized_references, hypothesis.split()))

# Recorded output:
# bleu
# 0.5401725898595141
# 0.0
# sacrebleu
# 54.017258985951415
# 6.699559159060897
# nltk bleu
# 0.4969770530031034
# 5.7264676266231995e-155

## [Rouge. Recall-Oriented Understudy for Gisting Evaluation (2004)](https://aclanthology.org/W04-1013.pdf)

In [1]:
# This library is a wrapper around the original implementation by the ROUGE authors.
# https://github.com/li-plus/rouge-metric
from rouge_metric import PerlRouge

rouge = PerlRouge(
    rouge_n_max=3, rouge_l=True,
    rouge_w=True, rouge_w_weight=1.2,
    rouge_s=True, rouge_su=True, skip_gap=4,
)

# One hypothesis per document; sentences inside a document are separated by "\n".
hypotheses = [
    'how are you\ni am fine',                       # document 1: hypothesis
    'it is fine today\nwe won the football game',   # document 2: hypothesis
]
# One list of references per document (a single reference each here).
references = [
    ['how do you do\nfine thanks'],                 # document 1: reference 1
    ['it is sunny today\nlet us go for a walk'],    # document 2: reference 1
]

scores = rouge.evaluate(hypotheses, references)
# Print only the F-measure of every metric that was computed.
for metric_name, metric_scores in scores.items():
    print(f"{metric_name} = {metric_scores['f']}")
# Since we are dealing with short texts, the rouge-l metric should be trusted more.
# NOTE(review): the values recorded below differ from the output pasted right
# after this cell (rouge-1 = 0.40789, ...); they possibly come from a run with
# different inputs -- confirm.
# rouge-1 = 0.53622
# rouge-2 = 0.20346
# rouge-3 = 0.11765
# rouge-l = 0.53622
# rouge-w-1.2 = 0.39308
# rouge-s4 = 0.272
# rouge-su4 = 0.33382

rouge-1 = 0.40789
rouge-2 = 0.05882
rouge-3 = 0.0
rouge-l = 0.40789
rouge-w-1.2 = 0.30222
rouge-s4 = 0.14615
rouge-su4 = 0.19817


In [None]:
# This implementation is independent of the "official" ROUGE script.
# https://github.com/pltrdy/rouge
from rouge import Rouge

# Keep only the first reference of each document, giving one reference string
# per hypothesis.
reference2 = [doc_references[0] for doc_references in references]

rouge = Rouge()
scores = rouge.get_scores(hypotheses, reference2, avg=True)
# Print only the F-measure of every metric that was computed.
for metric_name, metric_scores in scores.items():
    print(f"{metric_name} = {metric_scores['f']}")
# As you can see, these metrics differ from the official wrapper's, given the same input data.
# rouge-1 = 0.4306220045969644
# rouge-2 = 0.05882352692041533
# rouge-l = 0.4306220045969644

In [None]:
# https://torchmetrics.readthedocs.io/en/stable/text/rouge_score.html
from torchmetrics.text.rouge import ROUGEScore

rouge = ROUGEScore(accumulate='avg')

scores = rouge(hypotheses, reference2)
# The result has several entries per metric; print only the F-measure ones.
for metric_name, metric_value in scores.items():
    if "fmeasure" not in metric_name:
        continue
    print(f"{metric_name} = {metric_value}")
# Again we see a match in some places and a difference in others.
# rouge1_fmeasure = 0.40789473056793213
# rouge2_fmeasure = 0.05882352963089943
# rougeL_fmeasure = 0.40789473056793213
# rougeLsum_fmeasure = 0.40789473056793213

In [None]:
# https://huggingface.co/spaces/evaluate-metric/rouge
import evaluate

rouge = evaluate.load('rouge')
results = rouge.compute(predictions=hypotheses, references=references)
# Print every entry of the returned result mapping.
for metric_name, metric_value in results.items():
    print(f"{metric_name} = {metric_value}")
# We see an almost perfect match, but only because this is an almost verbatim copy of the code from
# https://github.com/google-research/google-research/tree/master/rouge
# rouge1 = 0.40789473684210525
# rouge2 = 0.058823529411764705
# rougeL = 0.40789473684210525
# rougeLsum = 0.40789473684210525

## [Meteor. The Metric for Evaluation of Translation with Explicit ORdering](https://aclanthology.org/W05-0909.pdf)

In [None]:
# https://huggingface.co/spaces/evaluate-metric/meteor
import evaluate

meteor = evaluate.load('meteor')
# A single prediction scored against a single reference.
predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
references = ["It is a guide to action that ensures that the military will forever heed Party commands"]
results = meteor.compute(
    predictions=predictions,
    references=references,
)
# The result matches NLTK, because this library is itself a wrapper around NLTK.
print(results['meteor'])
# 0.6944444444444445