In [1]:
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


import error: No module named 'triton'


In [4]:
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

print(accuracy.description)


Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
 Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative



In [5]:
accuracy.features

{'predictions': Value(dtype='int32', id=None),
 'references': Value(dtype='int32', id=None)}

In [6]:
from transformers import pipeline

# Label mapping
label_map = {"NEGATIVE": 0, "POSITIVE": 1}

# Create the classifier pipeline
classifier = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    tokenizer="distilbert-base-uncased",
)

# Input texts
texts = [
    "I love this movie.",
    "This movie was terrible.",
    "I don't like this movie.",
    "This movie was great!",
]

# Run the classifier
outputs = classifier(texts, truncation=True)

# Convert predicted labels to integers
predicted_labels = [label_map[output["label"]] for output in outputs]

# Ground truth labels (also mapped to integers)
true_labels = [label_map[label] for label in ["POSITIVE", "NEGATIVE", "NEGATIVE", "POSITIVE"]]

# Load metrics
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

# Compute metrics
accuracy_score = accuracy.compute(predictions=predicted_labels, references=true_labels)
precision_score = precision.compute(predictions=predicted_labels, references=true_labels)
recall_score = recall.compute(predictions=predicted_labels, references=true_labels)
f1_score = f1.compute(predictions=predicted_labels, references=true_labels)

# Print results
print(f"Accuracy: {accuracy_score['accuracy']}")
print(f"Precision: {precision_score['precision']}")
print(f"Recall: {recall_score['recall']}")
print(f"F1: {f1_score['f1']}")


Device set to use mps:0


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0


<br/><br/><br/>

---

## Text Generation

<br/><br/><br/>

### Perplexity
- A model's ability to predict the next word accurately and confidently.
- Lower perplexity indicates better performance.

In [None]:
input_text = "Latest research findings in Antartica show"

generated_text = "Latest research findings in Antartica show that the ice is melting faster than previously thought."

# Encode the prompt, generate the text, and decode it
input_text_ids = tokenizer.encode(input_text, return_tensors="pt")
output = model.generate(input_text_ids, max_length=50)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
perplexity = evaluate.load("perplexity", module_type="metric")
results = perplexity.compute(
    predictions=[generated_text],
    references=[input_text],
)

print(f"Perplexity: {results}")

<br/><br/><br/>

### BLEU
- Measures translation quality by comparing machine-generated text to human references.
- Predictions: LLM's outputs.
- References: Human-generated translations.

In [8]:
bleu = evaluate.load("bleu")

input_text = "Latest research findings in Antartica show"

references = [[
    "Latest research findings in Antartica show significant ice loss due to climate change.",
    "Latest research findings in Antartica show that the ice sheet is melting faster than previously thought."
]]

generated_text = "Latest research findings in Antartica show that the ice sheet is melting faster than previously thought."

results = bleu.compute(
    predictions=[generated_text],
    references=references,
)

results

{'bleu': 1.0,
 'precisions': [1.0, 1.0, 1.0, 1.0],
 'brevity_penalty': 1.0,
 'length_ratio': 1.2142857142857142,
 'translation_length': 17,
 'reference_length': 14}

<br/><br/><br/>

---

## Summarization

<br/><br/><br/>

### ROUGE
- Similarity between generated summaries and reference summaries.
  - Looks at n-grams and overlapping
  - predictions = LLM's outputs
  - references = human-provided summaries

In [9]:
rouge = evaluate.load("rouge")

predictions = [
  """as we learn more about the frequency and size distribution of exoplanets, we are discovering that terrestrial planets are exceedingly common"""
]

references = [
  """The more we learn about the frequency and size distribution of exoplanets, the more confident we are that they are exceedingly common"""
]

results = rouge.compute(
    predictions=predictions,
    references=references,
)

print(results)

{'rouge1': np.float64(0.7906976744186046), 'rouge2': np.float64(0.5365853658536585), 'rougeL': np.float64(0.7441860465116279), 'rougeLsum': np.float64(0.7441860465116279)}


<br/><br/><br/>

---

## Translation

<br/><br/><br/>

### METEOR
- more linguistic features like word variations, similar meanings, and word order.

In [10]:
blue = evaluate.load("bleu")
meteor = evaluate.load("meteor")

predictions = [
  "He thought it right and necessary to become a knight-errant, roaming the world in armor, seeking adventures and practicing the deds he had read about in chivalric tales."
]

reference = [
  "He believed it was proper and essential to transform into a knight-errant, traveling the world in armor, pursuing adventures, and enacting the heroic deeds he had encountered in tales of chivalry."
]

Downloading builder script: 100%|██████████| 7.02k/7.02k [00:00<00:00, 23.6MB/s]
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/crypticsy/nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/crypticsy/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/crypticsy/nltk_data...


In [13]:
results_bleu = bleu.compute(predictions=predictions, references=reference)
results_meteor = meteor.compute(predictions=predictions, references=reference)

print(f"BLEU: {results_bleu['bleu']}")
print(f"METEOR: {results_meteor['meteor']}")

BLEU: 0.1724871568079977
METEOR: 0.49634304588887584


<br/><br/><br/>

---

## Question Answering

<br/><br/><br/>

### Exact Match (EM)
- 1 if an LLM's output exactly matches the reference answer, 0 otherwise.
- Normally used in conjunction with F1 score.

In [None]:
em_metric = evaluate.load("exact_match")

predictions = [
  "The cat sat on the mat.",
  "Theaters are great.",
  "Like comparing oranges and apples."
]

references = [
  "The cat sat on the mat?",
  "Theaters are great.",
  "Like comparing apples and oranges."
]

results = em_metric.compute(
    predictions=predictions,
    references=references,
)

print(f"Exact Match: {results['exact_match']}")

Downloading builder script: 100%|██████████| 5.67k/5.67k [00:00<00:00, 15.3MB/s]

Exact Match: 0.3333333333333333





<br/><br/><br/>

---

## Metrics for analyzing LLMs bias

<br/><br/><br/>

### Toxicity
- quantifies text toxicity using pre-trained hate speech classifier
  - aggregation="maximum" returns maximum toxicity score across all sentences
  - aggregation="ratio" returns % of predictions that are toxic above 0.5

In [17]:
toxicity_metric = evaluate.load("toxicity")

texts_1 = ["Everyone in the team adores him", "He is a true genius, pure talent"]
texts_2 = ["Nobody in the team likes him", "He is useless, 'good-for-nothing'"]

toxicity_results_1 = toxicity_metric.compute(predictions=texts_1, aggregation="maximum")
toxicity_results_2 = toxicity_metric.compute(predictions=texts_2, aggregation="maximum")

print(f"Toxicity for texts_1: {toxicity_results_1}")
print(f"Toxicity for texts_2: {toxicity_results_2}")

Device set to use mps:0


Toxicity for texts_1: {'max_toxicity': 0.00013866178051102906}
Toxicity for texts_2: {'max_toxicity': 0.0268279816955328}



<br/><br/><br/>

### Regard
- language polarity and biased pereption towards certain demographics
- evaluate regard scores on LLM outputs associated to two groups separately

In [21]:
regard = evaluate.load("regard")
regard_comp = evaluate.load("regard", "compare")

group1 = [
  "abc are described as loyal employees",
  "abc are ambitious in their career expectations",
]

group2 = [
  "abc are known for causing lots of team conflicts",
  "abc are verbally violent"
]

polarity_results_1 = regard.compute(data=group1)
polarity_results_2 = regard.compute(data=group2)

polarity_results_comp = regard_comp.compute(data=group1, references=group2)
print(f"Polarity for group1: {polarity_results_1}")

Device set to use mps:0
Device set to use mps:0


Polarity for group1: {'regard': [[{'label': 'positive', 'score': 0.9098386764526367}, {'label': 'neutral', 'score': 0.05939692258834839}, {'label': 'other', 'score': 0.026468118652701378}, {'label': 'negative', 'score': 0.004296257160604}], [{'label': 'positive', 'score': 0.7809811234474182}, {'label': 'neutral', 'score': 0.18085995316505432}, {'label': 'other', 'score': 0.030492888763546944}, {'label': 'negative', 'score': 0.007666007615625858}]]}
