### Testing LLM using HF Evaluate

#### Accuracy

In [5]:
import evaluate

accuracy = evaluate.load("accuracy")

references = [1,0,1,1]
predications = [1,0,0,0]

results = accuracy.compute(references=references, predictions=predications)

print(f"Accuracy: {results['accuracy']:.2%}")



Accuracy: 50.00%


#### Precision (Exact Match)

In [7]:
import evaluate

exact_match = evaluate.load("exact_match")

references = ["Execute", "Automation"]
predications = ["Execute", "Automations"]

results = exact_match.compute(references=references, predictions=predications)

print(f"Exact Match: {results['exact_match']:.2%}")


Exact Match: 50.00%


#### F1-Score

In [9]:
import evaluate

f1 = evaluate.load("f1")

#True label/Ground truth/reference
references = [1,0,1,1,0,1,0,1,1]
predications = [1,0,1,1,1,1,1,0,1]

results = f1.compute(references=references, predictions=predications, average="binary")

print(f"F1-Score: {results['f1']:.2%}")


F1-Score: 76.92%


### Using HF Evaluate Method to Evalute a LLM using Pipeline function 🤖⚙️

In [15]:
from transformers import pipeline
import evaluate

sentiment_pipeline = pipeline("sentiment-analysis")

accuracy = evaluate.load("accuracy")

datasets = [
    {"text": "I love learning ML and AI in Testing", "label": 0},
    {"text": "I hate working with a machine which has got no GPU for my AI training", "label": 0},
    {"text": "I like driving fast cars", "label": 1}
]

predications = sentiment_pipeline([data["text"] for data in datasets])




No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Device set to use 0


In [16]:
predications

[{'label': 'POSITIVE', 'score': 0.9988466501235962},
 {'label': 'NEGATIVE', 'score': 0.9995275735855103},
 {'label': 'POSITIVE', 'score': 0.9991025924682617}]

In [17]:
prediction_labels = [1 if pred["label"] == "POSITIVE" else 0 for pred in predications]
true_labels = [dataset['label'] for dataset in datasets]

prediction_labels, true_labels


([1, 0, 1], [0, 0, 1])

In [18]:
results = accuracy.compute(predictions=prediction_labels, references=true_labels)

print(f"Accuracy: {results['accuracy']:.2%}")


Accuracy: 66.67%


#### Precision, Recall, F1

In [19]:
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

accuracy_score = accuracy.compute(predictions=prediction_labels, references=true_labels)
precision_score = precision.compute(predictions=prediction_labels, references=true_labels)
recall_score = recall.compute(predictions=prediction_labels, references=true_labels)
f1_score = f1.compute(predictions=prediction_labels, references=true_labels)


print(f"Accuracy: {accuracy_score['accuracy']:.4f}")
print(f"Precision: {precision_score['precision']:.4f}")
print(f"Recall: {recall_score['recall']:.4f}")
print(f"F1 Score: {f1_score['f1']:.4f}")


Accuracy: 0.6667
Precision: 0.5000
Recall: 1.0000
F1 Score: 0.6667
