# Evaluate with Trainer

In [1]:
import time

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from optimum.bettertransformer import BetterTransformer
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from transformers.utils import is_torch_tf32_available

# This notebook assumes a CUDA device; stop immediately when none is visible.
assert torch.cuda.is_available()

print(torch.cuda.get_device_name(0))
# Query the device once: element 0 is reported as free memory, element 1 as
# the total, both converted from bytes to GiB for display.
free_bytes, total_bytes = torch.cuda.mem_get_info()
print(
    f"Free memory : {round(free_bytes / 1024 ** 3, 1)} / {round(total_bytes / 1024 ** 3, 1)} GB"
)

# Opt in to TF32 matmuls when the helper reports support for them.
if not is_torch_tf32_available():
    print("\nTF32 not available")
else:
    torch.backends.cuda.matmul.allow_tf32 = True
    print("\nUsing TF32")

# Wall-clock start; read again later when the total runtime is printed.
t = time.time()

NVIDIA GeForce RTX 3080
Free memory : 8.9 / 10.0 GB

Using TF32


In [2]:
# Data: dataset identifier passed to `load_dataset`, plus column and split names.
dataset_path = "allocine"
input_column = "review"  # text column fed to the tokenizer
label_column = "label"  # NOTE(review): not referenced in the visible cells
train_split = "train"
eval_split = "validation"  # NOTE(review): not referenced in the visible cells
test_split = "test"  # NOTE(review): not referenced in the visible cells

# Model: checkpoint name used for both the tokenizer and the classifier.
model_checkpoint = "baptiste-pasquier/distilcamembert-allocine"

# Training: the visible cells only run evaluation, so just the evaluation
# batch size is configured.
training_args = TrainingArguments(
    output_dir=".",
    per_device_eval_batch_size=16,
    report_to="none",  # presumably turns off logging integrations — confirm in the TrainingArguments docs
)

# Metrics: names handed to `evaluate.combine`.
metrics = ["accuracy", "f1", "precision", "recall"]

In [3]:
dataset = load_dataset(dataset_path)
# Iterate over the splits that were actually loaded. The loaded object is
# indexed by split name, so its keys are the authoritative list; this avoids
# depending on the optional `info.splits` metadata (which may be absent or
# describe splits that were not loaded) and yields a plain list that is safe
# to reuse as a DataFrame index.
splits = list(dataset.keys())

Reusing dataset allocine (C:\Users\Baptiste\.cache\huggingface\datasets\allocine\allocine\1.0.0\ea86b1dc05eae3a45a07b6281f2d4033b5fe7927b1008d06aa457ca1eae660d0)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Fast tokenizer loaded from the same checkpoint as the model under evaluation.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)


def tokenize_function(examples):
    """Tokenize the `input_column` texts of a batch, with truncation enabled.

    No padding is requested here.
    """
    texts = examples[input_column]
    return tokenizer(texts, truncation=True)


# Tokenize every split in a single batched pass and drop the raw text column
# at the same time, so the dataset is mapped only once.
encoded_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=[input_column]
)

Loading cached processed dataset at C:\Users\Baptiste\.cache\huggingface\datasets\allocine\allocine\1.0.0\ea86b1dc05eae3a45a07b6281f2d4033b5fe7927b1008d06aa457ca1eae660d0\cache-2b4071c7069eb28b.arrow
Loading cached processed dataset at C:\Users\Baptiste\.cache\huggingface\datasets\allocine\allocine\1.0.0\ea86b1dc05eae3a45a07b6281f2d4033b5fe7927b1008d06aa457ca1eae660d0\cache-508123078501f427.arrow
Loading cached processed dataset at C:\Users\Baptiste\.cache\huggingface\datasets\allocine\allocine\1.0.0\ea86b1dc05eae3a45a07b6281f2d4033b5fe7927b1008d06aa457ca1eae660d0\cache-04b447737d030e4f.arrow


  0%|          | 0/160 [00:00<?, ?ba/s]

Loading cached processed dataset at C:\Users\Baptiste\.cache\huggingface\datasets\allocine\allocine\1.0.0\ea86b1dc05eae3a45a07b6281f2d4033b5fe7927b1008d06aa457ca1eae660d0\cache-4de7aa5c909b35c0.arrow
Loading cached processed dataset at C:\Users\Baptiste\.cache\huggingface\datasets\allocine\allocine\1.0.0\ea86b1dc05eae3a45a07b6281f2d4033b5fe7927b1008d06aa457ca1eae660d0\cache-5b3760e821f50a0b.arrow


In [5]:
# Load the fine-tuned classifier and hand it straight to BetterTransformer.
model = BetterTransformer.transform(
    AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
)

In [6]:
# One combined evaluator covering every metric named in `metrics`.
clf_metrics = evaluate.combine(metrics)


def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    arg-max over the last logits axis, and the combined metrics are computed
    against the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return clf_metrics.compute(
        predictions=predicted_classes, references=references
    )

In [7]:
# Evaluation-only Trainer: no datasets are bound here; each split is passed
# explicitly to `trainer.evaluate` later on.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [8]:
# Evaluate every split in order, collecting one metrics record per split.
results = [trainer.evaluate(encoded_dataset[split]) for split in splits]

***** Running Evaluation *****
  Num examples = 160000
  Batch size = 16
You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask)


  0%|          | 0/10000 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 20000
  Batch size = 16


  0%|          | 0/1250 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 20000
  Batch size = 16


  0%|          | 0/1250 [00:00<?, ?it/s]

In [9]:
# One row per evaluated split, one column per reported metric.
pd.DataFrame(data=results, index=list(splits))

Unnamed: 0,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second
train,0.022919,0.993712,0.993779,0.99046,0.997121,207.3856,771.51,48.219
validation,0.102344,0.9707,0.970311,0.963186,0.977542,25.4372,786.25,49.141
test,0.106034,0.9708,0.969738,0.964043,0.9755,26.2675,761.396,47.587


In [10]:
# Elapsed wall-clock time since `t` was recorded, rendered as HH:MM:SS.
elapsed_seconds = time.time() - t
print(f"Total time: {time.strftime('%H:%M:%S', time.gmtime(elapsed_seconds))}")

Total time: 00:04:36
