In [2]:
# %pip install evaluate rouge_score nltk scikit-learn transformers torch
import evaluate
import numpy as np

# Accuracy and F1 Score

In [3]:
# Toy corpus for the metric demos: reference sentences about the Japanese tea
# ceremony, each paired by position with a shortened, model-style prediction.
references = [
    "The Japanese tea ceremony is a profound cultural practice emphasizing harmony and respect.",
    "Matcha is carefully prepared using traditional methods in a tea ceremony.",
    "The tea master meticulously follows precise steps during the ritual.",
]

predictions = [
    "Japanese tea ceremony is a cultural practice of harmony and respect.",
    "Matcha is prepared using traditional methods in tea ceremonies.",
    "The tea master follows precise steps during the ritual.",
]

# Simulated binary classification (1 = "about the tea ceremony"): every sample
# is labelled positive, and the pretend model predicts every label correctly.
labels = [1] * len(references)
pred_labels = list(labels)

# Classification metrics, loaded by name through `evaluate.load`.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

accuracy = accuracy_metric.compute(references=labels, predictions=pred_labels)
f1 = f1_metric.compute(references=labels, predictions=pred_labels, average="weighted")

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Accuracy: {'accuracy': 1.0}
F1 Score: {'f1': 1.0}


# Perplexity

In [4]:
# Perplexity of each prediction text, scored with the small pre-trained `gpt2`
# model. The returned dict holds the per-text 'perplexities' and their
# 'mean_perplexity'.
perplexity_metric = evaluate.load("perplexity", module_type="metric")
perplexity = perplexity_metric.compute(model_id="gpt2", predictions=predictions)
print(f"Perplexity: {perplexity}")

Downloading builder script:   0%|          | 0.00/8.46k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Perplexity: {'perplexities': [115.2373046875, 324.3690185546875, 417.4598083496094], 'mean_perplexity': 285.68871053059894}


# ROUGE, BLEU and METEOR

In [5]:
# Text-overlap metrics. No LLM is loaded here: the pre-defined `predictions`
# list stands in for model outputs and is compared against `references`.

# ROUGE
rouge_metric = evaluate.load("rouge")
rouge_results = rouge_metric.compute(references=references, predictions=predictions)
print(f"ROUGE Scores: {rouge_results}")

# BLEU
bleu_metric = evaluate.load("bleu")
bleu_results = bleu_metric.compute(references=references, predictions=predictions)
print(f"BLEU Score: {bleu_results}")

# METEOR: each reference is wrapped in its own one-element list, so the
# references are passed as a list of lists.
meteor_metric = evaluate.load("meteor")
wrapped_references = [[reference] for reference in references]
meteor_results = meteor_metric.compute(
    references=wrapped_references,
    predictions=predictions,
)
print(f"METEOR Score: {meteor_results}")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE Scores: {'rouge1': 0.8602339181286549, 'rouge2': 0.6718162012279659, 'rougeL': 0.8602339181286549, 'rougeLsum': 0.8602339181286549}


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

BLEU Score: {'bleu': 0.5260257094802832, 'precisions': [0.9375, 0.7241379310344828, 0.5384615384615384, 0.391304347826087], 'brevity_penalty': 0.8553453273074225, 'length_ratio': 0.8648648648648649, 'translation_length': 32, 'reference_length': 37}


Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /home/rafael/nltk_data...
[nltk_data] Downloading package punkt_tab to /home/rafael/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /home/rafael/nltk_data...


METEOR Score: {'meteor': 0.8462650219142979}


# Exact Match

In [6]:
# Exact Match
def exact_match_compute(predictions, references):
    """Return the fraction of predictions that exactly match their reference.

    Predictions and references are paired by position. Leading and trailing
    whitespace is stripped from both sides before comparing; the comparison
    is otherwise case- and punctuation-sensitive.

    Args:
        predictions: Sequence of predicted strings.
        references: Sequence of reference strings, same length as
            ``predictions``.

    Returns:
        A float in [0.0, 1.0]: matches divided by the number of predictions.
        Returns 0.0 when both sequences are empty.

    Raises:
        ValueError: If ``predictions`` and ``references`` differ in length.
    """
    # zip() would silently drop the unpaired tail and skew the score, so a
    # length mismatch is reported explicitly instead.
    if len(predictions) != len(references):
        raise ValueError(
            "predictions and references must have the same length "
            f"(got {len(predictions)} and {len(references)})"
        )

    # Nothing to score: avoid dividing by zero.
    if len(predictions) == 0:
        return 0.0

    matches = sum(
        pred.strip() == ref.strip()
        for pred, ref in zip(predictions, references)
    )
    return matches / len(predictions)

# Score the sample predictions against their references. Each prediction is a
# shortened paraphrase that differs from its reference by at least one word,
# so no pair matches exactly and the printed score is 0.0.
em_score = exact_match_compute(predictions, references)
print("Exact Match Score:", em_score)

Exact Match Score: 0.0
