https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
import gc
import shutil
import time
from pathlib import Path

import evaluate
import numpy as np
import tensorflow as tf
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TFAutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from transformers.utils import is_torch_tf32_available

# Fail fast, with an explicit message, when either framework used below
# (PyTorch for training, TensorFlow for the final conversion) cannot see a GPU.
assert torch.cuda.is_available(), "PyTorch cannot see a CUDA device"
assert (
    len(tf.config.list_physical_devices("GPU")) >= 1
), "TensorFlow cannot see a GPU"

print(torch.cuda.get_device_name(0))
# Query the memory info once so that both figures come from the same snapshot
# (index 0 is printed as the free amount, index 1 as the total).
free_bytes, total_bytes = torch.cuda.mem_get_info()
print(
    f"Free memory : {round(free_bytes / 1024 ** 3,1)} / {round(total_bytes / 1024 ** 3,1)} GB"
)

# Opt in to TF32 matrix multiplications when the installed stack supports them.
if is_torch_tf32_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    print("\nUsing TF32")
else:
    print("\nTF32 not available")


# Start of the wall-clock timer; read again by the last cell of the notebook.
t = time.time()

NVIDIA GeForce RTX 3080
Free memory : 8.9 / 10.0 GB

Using TF32


In [3]:
# Dataset configuration
dataset_path = "allocine"  # identifier handed to load_dataset
input_column, label_column = "review", "label"  # text field / target field
# Label names that replace the dataset's own class names in the model config
new_label2id = {
    "NEGATIVE": 0,
    "POSITIVE": 1,
}
# Names of the splits used for training, periodic evaluation and final testing
train_split, eval_split, test_split = "train", "validation", "test"

# Model configuration
model_checkpoint = "cmarkea/distilcamembert-base"  # base checkpoint to fine-tune
output_model_name = "distilcamembert-allocine"  # name given to the fine-tuned model
# Checkpoints, metrics and the final model are all written under this directory
output_dir = f"models/{output_model_name}"

# When False, the model and its card are only written locally
PUSH_TO_HUB = False

# Training hyper-parameters, grouped by concern
training_args = TrainingArguments(
    # Where checkpoints are written, and whether they are pushed to the Hub
    output_dir=output_dir,
    push_to_hub=PUSH_TO_HUB,
    # Batching: 16 samples per device, accumulated over 4 steps
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    # Evaluate and checkpoint on the same 500-step cadence
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    # Reload the checkpoint with the best accuracy once training ends
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Logging
    logging_first_step=True,
    logging_steps=10,
    report_to="all",
)

# Names of the metrics that are combined into a single evaluator further down
metrics = [
    "accuracy",
    "f1",
    "precision",
    "recall",
]

# Language metadata passed to the model card / Hub push at the end of the notebook
language = [
    "fr",
]

In [4]:
# Load every split of the configured dataset, then display the resulting DatasetDict
dataset = load_dataset(path=dataset_path)
dataset

Found cached dataset allocine (C:/Users/Baptiste/.cache/huggingface/datasets/allocine/allocine/1.0.0/ea86b1dc05eae3a45a07b6281f2d4033b5fe7927b1008d06aa457ca1eae660d0)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['review', 'label'],
        num_rows: 160000
    })
    validation: Dataset({
        features: ['review', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['review', 'label'],
        num_rows: 20000
    })
})

In [5]:
# Read the dataset's own label mapping through the public ClassLabel API, using
# the configured split/column names rather than hard-coded "train"/"label".
label_feature = dataset[train_split].features[label_column]
old_label2id = {name: idx for idx, name in enumerate(label_feature.names)}
print("old_label2id:", old_label2id)

label2id = new_label2id
id2label = {value: key for key, value in label2id.items()}

# Renaming labels is only sound when the new names cover exactly the class ids
# that the dataset uses; otherwise the model config would mislabel predictions.
assert sorted(id2label) == sorted(old_label2id.values()), (
    f"new_label2id ids {sorted(id2label)} do not match "
    f"dataset ids {sorted(old_label2id.values())}"
)

print("label2id:", label2id)
print("id2label:", id2label)

old_label2id: {'neg': 0, 'pos': 1}
label2id: {'NEGATIVE': 0, 'POSITIVE': 1}
id2label: {0: 'NEGATIVE', 1: 'POSITIVE'}


In [6]:
# Tokenizer matching the base checkpoint; the fast implementation is requested
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)


def tokenize_function(examples):
    """Tokenize the text column of a batch of examples, with truncation enabled.

    Args:
        examples: mapping from column name to the batch's values; only the
            column named by ``input_column`` is read.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples[input_column]
    return tokenizer(texts, truncation=True)


# Tokenize all splits in batches and drop the raw text column afterwards
encoded_dataset = dataset.map(
    tokenize_function,
    remove_columns=[input_column],
    batched=True,
)
# Pick out the three configured splits
train_dataset, eval_dataset, test_dataset = (
    encoded_dataset[train_split],
    encoded_dataset[eval_split],
    encoded_dataset[test_split],
)

Loading cached processed dataset at C:\Users\Baptiste\.cache\huggingface\datasets\allocine\allocine\1.0.0\ea86b1dc05eae3a45a07b6281f2d4033b5fe7927b1008d06aa457ca1eae660d0\cache-ea4d286d3404735c.arrow
Loading cached processed dataset at C:\Users\Baptiste\.cache\huggingface\datasets\allocine\allocine\1.0.0\ea86b1dc05eae3a45a07b6281f2d4033b5fe7927b1008d06aa457ca1eae660d0\cache-ef55e725dc732a08.arrow
Loading cached processed dataset at C:\Users\Baptiste\.cache\huggingface\datasets\allocine\allocine\1.0.0\ea86b1dc05eae3a45a07b6281f2d4033b5fe7927b1008d06aa457ca1eae660d0\cache-06b4ebe2acc3478d.arrow


In [7]:
# Sequence-classification model initialised from the base checkpoint, with the
# custom label names stored in its config
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at cmarkea/distilcamembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at cmarkea/distilcamembert-base and are newly initialized: ['classifier.dense.bias', 'classif

In [8]:
# Bundle the metrics named in `metrics` into one object; its `.compute` method is
# called by `compute_metrics` below.
clf_metrics = evaluate.combine(metrics)


def compute_metrics(eval_pred):
    """Score a batch of predictions with the combined classification metrics.

    Args:
        eval_pred: a pair ``(logits, labels)``; the highest-scoring entry along
            the last axis of ``logits`` is taken as the predicted class.

    Returns:
        The result of ``clf_metrics.compute`` for those predictions and labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return clf_metrics.compute(predictions=predicted_ids, references=references)

In [9]:
# Wire the model, hyper-parameters, data, tokenizer and metric function together
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [10]:
# Run the fine-tuning; the returned object's `.metrics` is read in the next cell.
train_results = trainer.train()

***** Running training *****
  Num examples = 160000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 7500
  Number of trainable parameters = 68096258
You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.1504,0.128982,0.95545,0.954178,0.961447,0.947019
1000,0.1334,0.10495,0.96235,0.961898,0.953647,0.970294
1500,0.1158,0.105219,0.963,0.962743,0.949831,0.976011
2000,0.1153,0.094924,0.9661,0.965277,0.968551,0.962025
2500,0.1053,0.093568,0.9666,0.966337,0.95422,0.978767
3000,0.0755,0.098741,0.97,0.96954,0.964351,0.974786
3500,0.0716,0.107814,0.9688,0.968437,0.959795,0.977236
4000,0.0688,0.105061,0.9673,0.967033,0.955188,0.979175
4500,0.0691,0.094024,0.97095,0.970395,0.968766,0.972029
5000,0.0733,0.103808,0.96855,0.968317,0.955752,0.981217


***** Running Evaluation *****
  Num examples = 20000
  Batch size = 16
Saving model checkpoint to models/distilcamembert-allocine\checkpoint-500
Configuration saved in models/distilcamembert-allocine\checkpoint-500\config.json
Model weights saved in models/distilcamembert-allocine\checkpoint-500\pytorch_model.bin
tokenizer config file saved in models/distilcamembert-allocine\checkpoint-500\tokenizer_config.json
Special tokens file saved in models/distilcamembert-allocine\checkpoint-500\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20000
  Batch size = 16
Saving model checkpoint to models/distilcamembert-allocine\checkpoint-1000
Configuration saved in models/distilcamembert-allocine\checkpoint-1000\config.json
Model weights saved in models/distilcamembert-allocine\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in models/distilcamembert-allocine\checkpoint-1000\tokenizer_config.json
Special tokens file saved in models/distilcamembert-allocine\che

In [11]:
# Add the training-set size to the training metrics, then print and persist them
train_metrics = train_results.metrics
train_metrics["train_samples"] = len(train_dataset)
for report in (trainer.log_metrics, trainer.save_metrics):
    report("train", train_metrics)

***** train metrics *****
  epoch                    =        3.0
  total_flos               = 42405087GF
  train_loss               =     0.0894
  train_runtime            = 1:13:46.03
  train_samples            =     160000
  train_samples_per_second =    108.449
  train_steps_per_second   =      1.695


In [12]:
# Evaluate on the validation split (the same dataset the trainer was built with)
eval_metrics = trainer.evaluate(eval_dataset=eval_dataset)

***** Running Evaluation *****
  Num examples = 20000
  Batch size = 16


In [13]:
# Add the validation-set size to the evaluation metrics, then print and persist them
eval_metrics["eval_samples"] = len(eval_dataset)
for report in (trainer.log_metrics, trainer.save_metrics):
    report("eval", eval_metrics)

***** eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =     0.9714
  eval_f1                 =      0.971
  eval_loss               =     0.1066
  eval_precision          =     0.9648
  eval_recall             =     0.9772
  eval_runtime            = 0:00:52.95
  eval_samples            =      20000
  eval_samples_per_second =    377.655
  eval_steps_per_second   =     23.603


In [14]:
# Run prediction on the held-out test split; its `.metrics` is read in the next cell
test_output = trainer.predict(test_dataset=test_dataset)

***** Running Prediction *****
  Num examples = 20000
  Batch size = 16


In [15]:
# Add the test-set size to the test metrics, then print and persist them
test_metrics = test_output.metrics
test_metrics["test_samples"] = len(test_dataset)
for report in (trainer.log_metrics, trainer.save_metrics):
    report("test", test_metrics)

***** test metrics *****
  test_accuracy           =     0.9704
  test_f1                 =     0.9692
  test_loss               =     0.1095
  test_precision          =      0.966
  test_recall             =     0.9724
  test_runtime            = 0:00:54.56
  test_samples            =      20000
  test_samples_per_second =    366.503
  test_steps_per_second   =     22.906


In [16]:
trainer.save_state()

# Gather the metrics/state JSON files under <output_dir>/results.
results_dir = Path(output_dir, "results")
results_dir.mkdir(parents=True, exist_ok=True)
for file in [
    "all_results.json",
    "train_results.json",
    "eval_results.json",
    "test_results.json",
    "trainer_state.json",
]:
    source = Path(output_dir, file)
    if not source.exists():
        # Re-running this cell must not crash on files that a previous run
        # already moved (or that were never produced).
        print(f"Skipping {file}: not found in {output_dir}")
        continue
    shutil.move(str(source), str(results_dir / file))

In [17]:
if not PUSH_TO_HUB:
    # Local-only run: write the final model, then generate its model card
    trainer.save_model()
    trainer.create_model_card(language=language)
else:
    trainer.push_to_hub(language=language)

Saving model checkpoint to models/distilcamembert-allocine
Configuration saved in models/distilcamembert-allocine\config.json
Model weights saved in models/distilcamembert-allocine\pytorch_model.bin
tokenizer config file saved in models/distilcamembert-allocine\tokenizer_config.json
Special tokens file saved in models/distilcamembert-allocine\special_tokens_map.json


In [18]:
# Drop the PyTorch objects before the TensorFlow conversion below.
del trainer
del model
# Collect reference cycles first: tensors that are still reachable through
# uncollected cycles count as in use, and empty_cache() cannot release them.
gc.collect()
torch.cuda.empty_cache()

In [19]:
# Load the fine-tuned PyTorch weights saved in output_dir as a TensorFlow model
tf_model = TFAutoModelForSequenceClassification.from_pretrained(
    output_dir, from_pt=True
)
# Record the base checkpoint as the config's name/path, going through the public
# `name_or_path` property instead of writing into `config.__dict__` directly.
tf_model.config.name_or_path = model_checkpoint
# tf_model.push_to_hub(output_model_name) # modify README
tf_model.save_pretrained(output_dir)

loading configuration file models/distilcamembert-allocine\config.json
Model config CamembertConfig {
  "_name_or_path": "models/distilcamembert-allocine",
  "architectures": [
    "CamembertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_siz

In [20]:
# Format the elapsed wall-clock time by hand: going through time.gmtime() would
# wrap the hour field back to 00 once a run lasts 24 hours or more.
elapsed_seconds = int(time.time() - t)
hours, remainder = divmod(elapsed_seconds, 3600)
minutes, seconds = divmod(remainder, 60)
print(f"Total time: {hours:02d}:{minutes:02d}:{seconds:02d}")

Total time: 01:15:47
