### install evaluate

In [1]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

## Evaluate models on the Hub

[Using the evaluator](https://huggingface.co/docs/evaluate/en/base_evaluator)

In [4]:
from evaluate import evaluator
from evaluate import combine
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, pipeline



data = load_dataset("imdb", split="test").shuffle(seed=42).select(range(1000))
task_evaluator = evaluator("text-classification")

# 1. pass model or name path
eval_results = task_evaluator.compute(
    model_or_pipeline="lvwerra/distilbert-imdb",
    data=data,
    metric=combine(["accuracy", "recall", "precision", "f1"]),  #calculate multiple metrices. if not specified By default the "accuracy" metric is computed. or single metric specify : metric="accuracy" etc
    input_column="text",                                                  #input_column="text": with this argument the column with the data for the pipeline can be specified. see imdb dataset
    label_column="label",                                                 #label_column="label": with this argument the column with the labels for the evaluation can be specified. see imdb dataset. this is the postive negative label in 0/1
    label_mapping={"NEGATIVE": 0, "POSITIVE": 1},                         #label_mapping=None: the label mapping aligns the labels in the pipeline output with the labels need for evaluation. E.g. the labels in label_column can be integers (0/1) whereas the pipeline can produce label names such as "positive"/"negative". With that dictionary the pipeline outputs are mapped to the labels.
)

# 2. Pass an instantiated model is also possible
# model = AutoModelForSequenceClassification.from_pretrained("lvwerra/distilbert-imdb")
# eval_results = task_evaluator.compute(
#     model_or_pipeline=model,
#     data=data,
#     label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
# )

# 3. Pass an instantiated pipeline is also possible
# pipe = pipeline("text-classification", model="lvwerra/distilbert-imdb")
# eval_results = task_evaluator.compute(
#     model_or_pipeline=pipe,
#     data=data,
#     label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
# )


print(eval_results)

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

{'accuracy': 0.918, 'recall': 0.9180327868852459, 'precision': 0.9142857142857143, 'f1': 0.9161554192229039, 'total_time_in_seconds': 10.937744419000069, 'samples_per_second': 91.42652833091343, 'latency_in_seconds': 0.010937744419000068}


## Evaluate models using Transformer


https://huggingface.co/docs/evaluate/en/transformers_integrations

The metrics in evaluate can be easily integrated with the Trainer. The Trainer accepts a compute_metrics keyword argument that passes a function to compute metrics. One can specify the evaluation interval with evaluation_strategy in the TrainerArguments, and based on that, the model is evaluated accordingly, and the predictions and labels passed to compute_metrics.

[Customized Evaluation Metrics with Hugging Face Trainer](https://medium.com/@rakeshrajpurohit/customized-evaluation-metrics-with-hugging-face-trainer-3ff00d936f99)

In [11]:
! pip install datasets transformers torch evaluate accelerate sklearn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Prepare and tokenize dataset
dataset = load_dataset("yelp_review_full")
print(dataset)

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))


### evaluate using huggingface evaluate

In [10]:
from datasets import load_dataset
import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import numpy as np

#metric = evaluate.combine(["precision", "recall", "accuracy", "f1"])
metric1 = evaluate.load("precision")
metric2 = evaluate.load("recall")
metric3 = evaluate.load("accuracy")
metric4 = evaluate.load("f1")
# Setup evaluation
def compute_metrics(eval_pred):



    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    precision = metric1.compute(predictions=predictions, references=labels,  average="micro")["precision"]
    recall = metric2.compute(predictions=predictions, references=labels,  average="micro")["recall"]
    accuracy = metric3.compute(predictions=predictions, references=labels,)["accuracy"]
    f1 = metric4.compute(predictions=predictions, references=labels,  average="micro")["f1"]
    return {"precision": precision, "recall": recall, "accuracy": accuracy, "f1": f1}
    #return metric.compute(predictions=predictions, references=labels)

# Load pretrained model and evaluate model after each epoch
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
#output issue: You  can see it poduces first metric (Precision) for all other metrices if we try to compute multiple
# Epoch	Training Loss	Validation Loss	Precision	Recall	Accuracy	F1
# 1	No log	1.119715	0.527000	0.527000	0.527000	0.527000
# 2	No log	1.013838	0.566000	0.566000	0.566000	0.566000
# 3	No log	1.034243	0.592000	0.592000	0.592000	0.592000


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,Accuracy,F1
1,No log,1.119715,0.527,0.527,0.527,0.527
2,No log,1.013838,0.566,0.566,0.566,0.566
3,No log,1.034243,0.592,0.592,0.592,0.592


TrainOutput(global_step=375, training_loss=1.03717578125, metrics={'train_runtime': 397.1044, 'train_samples_per_second': 7.555, 'train_steps_per_second': 0.944, 'total_flos': 789354427392000.0, 'train_loss': 1.03717578125, 'epoch': 3.0})

### evaluate using sklearn

will fix the above issue of computing multiple metric in Trainer using HuggingFac evaluate

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)

   # Calculate precision, recall, and F1-score
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch" #if max_steps=max_steps not provided it defaults to 3 steps/epoch training
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.128622,0.481,0.533712,0.481,0.440829
2,No log,0.979463,0.572,0.582543,0.572,0.571721
3,No log,1.039855,0.596,0.602211,0.596,0.597519


TrainOutput(global_step=375, training_loss=0.9462509765625, metrics={'train_runtime': 392.8908, 'train_samples_per_second': 7.636, 'train_steps_per_second': 0.954, 'total_flos': 789354427392000.0, 'train_loss': 0.9462509765625, 'epoch': 3.0})