In [7]:
!pip -q install sacrebleu==2.4.2 nltk==3.9.1 bert-score==0.3.13 pycocoevalcap==1.2 pandas

In [10]:
def main():
    """Smoke-test the caption metrics on two hard-coded toy examples.

    Computes BLEU-4, METEOR, CIDEr and BERTScore for two tokenized
    reference/hypothesis pairs and prints one line per metric to stdout.
    Takes no arguments and returns nothing.
    """
    # Metric libraries are imported inside the function, so merely defining
    # main() does not require them to be installed.
    from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
    from nltk.translate.meteor_score import meteor_score
    from pycocoevalcap.cider.cider import Cider
    from bert_score import score as bert_score

    # One tokenized reference and one tokenized hypothesis per sample.
    refs = [["a", "dog", "is", "running"], ["a", "cat", "is", "sleeping"]]
    hyps = [["a", "dog", "runs"], ["a", "cat", "sleeps"]]

    # Space-joined forms, shared by the string-based metrics (CIDEr, BERTScore).
    refStrs = [" ".join(r) for r in refs]
    hypStrs = [" ".join(h) for h in hyps]

    # BLEU-4 (use smoothing as captions are short)
    smooth = SmoothingFunction().method4
    # corpus_bleu takes a *list of references* per hypothesis, hence [ref].
    bleu4 = corpus_bleu([[ref] for ref in refs], hyps, smoothing_function=smooth)
    print(f"BLEU-4: {bleu4:.4f}")

    # METEOR (expects tokenized inputs in this NLTK version)
    meteorScores = [meteor_score([ref], hyp) for ref, hyp in zip(refs, hyps)]
    # max(..., 1) keeps the mean well-defined for an empty score list.
    meteor = sum(meteorScores) / max(len(meteorScores), 1)
    print(f"METEOR: {meteor:.4f}")

    # CIDEr (COCO style): dicts mapping a sample id to a list of caption strings.
    gts = {i: [ref] for i, ref in enumerate(refStrs)}
    res = {i: [hyp] for i, hyp in enumerate(hypStrs)}
    ciderScore, _ = Cider().compute_score(gts, res)
    print(f"CIDEr: {ciderScore:.4f}")

    # BERTScore (use raw strings); candidates first, references second.
    P, R, F1 = bert_score(hypStrs, refStrs, lang="en", rescale_with_baseline=True, verbose=False)
    print(f"BERTScore - P: {P.mean().item():.4f}, R: {R.mean().item():.4f}, F: {F1.mean().item():.4f}")

if __name__ == "__main__":
    main()

BLEU-4: 0.1370
METEOR: 0.6553
CIDEr: 2.2393


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore - P: 0.8398, R: 0.7417, F: 0.7907


In [None]:
import json
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from pycocoevalcap.cider.cider import Cider
from bert_score import score as bert_score

# Load ground-truth and predicted captions. The encoding is passed explicitly
# so decoding does not depend on the platform's default locale.
# NOTE(review): later code iterates both objects and normalizes each element as a
# string, so both files presumably hold parallel lists of caption strings — confirm.
with open("youcook2Refs.json", "r", encoding="utf-8") as f:   # ground truth
    refsRaw = json.load(f)

with open("youcook2Preds.json", "r", encoding="utf-8") as f:  # predictions
    hypsRaw = json.load(f)


def normalize(s):
    """Canonicalize a caption string before metric computation.

    Applies Unicode NFKC normalization, lowercases, deletes every character
    in ``string.punctuation``, collapses each whitespace run to a single
    space, and trims leading/trailing whitespace.

    Args:
        s: Caption text (``str``).

    Returns:
        The normalized caption string.
    """
    import re, string, unicodedata

    punct_table = str.maketrans("", "", string.punctuation)
    s = unicodedata.normalize("NFKC", s).lower().translate(punct_table)
    # Trim *after* punctuation removal: deleting a leading or trailing
    # punctuation mark can expose whitespace that an earlier strip() would miss
    # (e.g. "hello !" -> "hello ").
    return re.sub(r"\s+", " ", s).strip()

# Normalized caption strings (CIDEr consumes them as strings).
# NOTE(review): assumes refsRaw / hypsRaw are parallel lists of caption strings — confirm.
refs = [normalize(x) for x in refsRaw]
hyps = [normalize(x) for x in hypsRaw]

# Fail early and explicitly: zip() below would silently truncate on a length
# mismatch, and the METEOR mean would divide by zero on empty input.
if not refs or len(refs) != len(hyps):
    raise ValueError(
        "Need the same non-zero number of references and predictions; "
        f"got {len(refs)} references and {len(hyps)} predictions"
    )

# Word-level tokens for the NLTK metrics. Passing a bare string would make NLTK
# iterate it character by character and score character n-grams instead of words.
refTokens = [r.split() for r in refs]
hypTokens = [h.split() for h in hyps]

# BLEU-4 (corpus level, one reference per hypothesis), scaled to 0-100
smooth = SmoothingFunction().method4
bleu4 = corpus_bleu([[r] for r in refTokens], hypTokens, smoothing_function=smooth) * 100

# METEOR (mean of per-sample scores), scaled to 0-100
meteorScores = [meteor_score([r], h) for r, h in zip(refTokens, hypTokens)]
meteor = 100 * (sum(meteorScores) / len(meteorScores))

# CIDEr: dicts mapping a sample id to a list of caption strings
gts = {i: [r] for i, r in enumerate(refs)}
res = {i: [h] for i, h in enumerate(hyps)}
ciderMean, _ = Cider().compute_score(gts, res)

# BERTScore on the raw (un-normalized) captions; candidates first, references second
P, R, F1 = bert_score(hypsRaw, refsRaw, lang="en", rescale_with_baseline=True, verbose=False)

# Note: BLEU4 and METEOR are multiplied by 100 above; CIDEr and the BERTScore
# values are reported exactly as returned by their libraries.
metrics = {
    "BLEU4": bleu4,
    "METEOR": meteor,
    "CIDEr": ciderMean,
    "PBERT": float(P.mean()),
    "RBERT": float(R.mean()),
    "FBERT": float(F1.mean())
}

print(metrics)