<a href="https://colab.research.google.com/github/claudelepere/ML_GitHub/blob/main/BERT_for_multi_label_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## IMPORTANT

Process any ipynb file only with the editor in which it was created, don't open a JupyterLab ipynb with Colab and vice versa, don't open any ipynb with an editor that can render the display of a notebook, like VS Code or VSCodium.

Why? Because any ipynb contains metadata specific to each notebook editor and another editor may modify them.

---

Do not use conda in Colab notebook, do not try to install with conda, use pip.

---

Do not launch wandb (cannot connect to the W&B server): 3 things: #!pip install, #import, os.environ disable

In [1]:
!pip install -q accelerate
!pip install -q huggingface_hub
!pip install -q scikit-learn
!pip install -q transformers datasets
!pip install -q wandb

import json
import numpy as np
import os
import sys
import time
import torch
import wandb

from datasets              import DatasetDict
from google.colab          import auth, drive, files, userdata
from huggingface_hub       import create_repo, login, upload_file
from huggingface_hub.utils import RepositoryNotFoundError
from sklearn.metrics       import accuracy_score, average_precision_score, classification_report, f1_score, precision_score, recall_score, roc_auc_score
from torch.utils.data      import DataLoader
from tqdm.auto             import tqdm
from transformers          import AutoModelForSequenceClassification, AutoTokenizer, EvalPrediction, Trainer, TrainingArguments

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[0m

In [2]:
"""
# Check the Python version
print(sys.version)
print()

# Get the installed packages (you can see that conda is not installed (do not install it))
!pip list
print()

# Check system information
!cat /etc/os-release
!uname -m
print()

# Check the GPU details (only if the runtime type is T4 GPU)
#!nvidia-smi
#print()

# Check RAM
!free -h
print()

# Check disk space
!df -h
print()

# Get environment variables
for key, value in os.environ.items():
    print(f"{key}: {value}")
"""

print(f"currentdir: {os.getcwd()}")
print()

# Check the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

currentdir: /content

device: cuda


In [3]:
# Google Colab authenticate

# Required for accessing Colab Secrets: not necessary for Hugging Face Hub, for wandb, you still have to enter manually
#auth.authenticate_user()

In [4]:
# Hugging Face authenticate

os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")    # Store the key in os.environ
hf_token               = os.environ.get('HF_TOKEN')
login(token=hf_token)

# Verify
!huggingface-cli whoami

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


claudelepere


In [5]:
# Create the skill_classification repo on the Hugging Face Hub

HF_name         = "claudelepere/skill_classification"
repo_id_model   = HF_name
repo_id_dataset = HF_name

repo_model_url = create_repo(
    repo_id   = repo_id_model,
    repo_type = "model",
    private   = True,
    exist_ok  = True
    )
print(f"Repo model url: {repo_model_url} created successfully as a private repo.")

repo_dataset_url = create_repo(
    repo_id   = repo_id_dataset,
    repo_type = "dataset",
    private   = True,
    exist_ok  = True
    )
print(f"Repo datasets url: {repo_dataset_url} created successfully as a private repo.")

repo_id_dataset = f"datasets/{HF_name}"

print(f"repo_id_model: {repo_id_model}")
print(f"repo_id_dataset: {repo_id_dataset}")

Repo model url: https://huggingface.co/claudelepere/skill_classification created successfully as a private repo.
Repo datasets url: https://huggingface.co/datasets/claudelepere/skill_classification created successfully as a private repo.
repo_id_model: claudelepere/skill_classification
repo_id_dataset: datasets/claudelepere/skill_classification


In [6]:
# Weights & Biases (W&B, wandb) authenticate

#os.environ["WANDB_DISABLE_CODE"] = "true"
os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")        # Store the key in os.environ
wandb_api_key               = os.environ.get('WANDB_API_KEY')

wandb.login(relogin=True)                                          # Force relogin

try:
  wandb.init(
      project="skill_classification",
      #entity="claudelepere",
      entity = "claudelepere-c-cile-cy"
      )
except wandb.errors.CommError as e:
  print(f"CommError: {e}")
except Exception as e:
  print(f"Exception: {e}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mclaudelepere[0m ([33mclaudelepere-c-cile-cy[0m). Use [1m`wandb login --relogin`[0m to force relogin


# My fine-tuning BERT (and friends) for multi-label text classification

In this notebook, we are going to fine-tune BERT to predict one or more labels for a given piece of text. Note that this notebook illustrates how to fine-tune a bert-base-uncased model, but you can also fine-tune a RoBERTa, DeBERTa, DistilBERT, CANINE, ... checkpoint in the same way.

All of those work in the same way: they add a **linear layer on top of the base model, which is used to produce a tensor of shape (batch_size, num_labels)**, indicating the unnormalized scores for a number of labels for every example in the batch.



## Set-up environment

First, we install the libraries which we'll use: HuggingFace Transformers and Datasets.

## Load dataset

Next, let's download a multi-label text classification dataset from the [hub](https://huggingface.co/).

At the time of writing, I picked a random one as follows:   

* first, go to the "datasets" tab on huggingface.co
* next, select the "multi-label-classification" tag on the left as well as the the "1k<10k" tag (fo find a relatively small dataset).

Note that you can also easily load your local data (i.e. csv files, txt files, Parquet files, JSON, ...) as explained [here](https://huggingface.co/docs/datasets/loading.html#local-and-remote-files).



In [7]:
# Upload and unzip the dataset zip file

uploaded_files     = files.upload()
uploaded_file_name = list(uploaded_files.keys())[0]
print(f"uploaded_file_name: {uploaded_file_name} {len(uploaded_files)}")

!unzip {uploaded_file_name}

unzipped_file_dir_name = os.path.splitext(uploaded_file_name)[0]
print(f"unzipped_file_dir_name: {unzipped_file_dir_name}")

Saving dataset_11_30000.zip to dataset_11_30000.zip
uploaded_file_name: dataset_11_30000.zip 1
Archive:  dataset_11_30000.zip
  inflating: dataset_11_30000/dataset_dict.json  
  inflating: dataset_11_30000/test/data-00000-of-00001.arrow  
  inflating: dataset_11_30000/test/dataset_info.json  
  inflating: dataset_11_30000/test/state.json  
  inflating: dataset_11_30000/train/data-00000-of-00001.arrow  
  inflating: dataset_11_30000/train/dataset_info.json  
  inflating: dataset_11_30000/train/state.json  
  inflating: dataset_11_30000/validation/data-00000-of-00001.arrow  
  inflating: dataset_11_30000/validation/dataset_info.json  
  inflating: dataset_11_30000/validation/state.json  
unzipped_file_dir_name: dataset_11_30000


In [8]:
# Create the dataset: 3 Hugging Face Dataset in a Hugging Face DatasetDict

dataset = DatasetDict.load_from_disk(unzipped_file_dir_name)

print(f"dataset: {type(dataset)} {dataset.shape}\n{dataset}")

dataset: <class 'datasets.dataset_dict.DatasetDict'> {'train': (24000, 8), 'validation': (1500, 8), 'test': (4500, 8)}
DatasetDict({
    train: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 24000
    })
    validation: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 4500
    })
})


As we can see, the dataset contains 3 splits: one for training, one for validation and one for testing.

Let's test the first example of the training split:

In [9]:
example = dataset['train'][0]
print(f"example: {type(example)} {example.keys()}\n{example}")

example: <class 'dict'> dict_keys(['id', 'text', '390', '135', '136', '137', '138', '139'])
{'id': 96386, 'text': "Deloitte - Consultant Business Continuity Management (BCM) & Crisis Management (CM) Business Continuity Management Deloitte Company Deloitte is a world leading professional services firm, providing accounting and auditing services, management consulting and legal and tax advice. In Belgium we are the largest professional service provider. Our offices offer services to multi-national and large organisations, public institutions and innumerable small, fast-growing companies. Thanks to a strong regional presence and our multi-disciplinary approach, we are ideally placed to meet the requirements of a wide range of public institutions and small and large companies. Our Risk Advisory practice is a global leader in helping clients manage risk and uncertainty from the boardroom to the network. We provide a broad array of services that allow our clients around the world to better m

In [10]:
# Create the label list and the id2label and label2id mappings.

"""
labels
    if dataset 7_1000_125_125  , 48 labels
    if dataset 7_128_18_54     , 42 labels
    if dataset 8910_1087_68_204, 206 labels
"""

labels = [label for label in dataset['train'].features.keys() if label not in ['id', 'text']]
labels.sort()
print(f"labels: {type(labels)} {len(labels)}\n{labels}")

id2label = {idx:label for idx, label in enumerate(labels)}
print(f"id2label: {type(id2label)} {len(id2label)}\n{id2label}")

label2id = {label:idx for idx, label in enumerate(labels)}
print(f"label2id: {type(label2id)} {len(label2id)}\n{label2id}")

labels: <class 'list'> 6
['135', '136', '137', '138', '139', '390']
id2label: <class 'dict'> 6
{0: '135', 1: '136', 2: '137', 3: '138', 4: '139', 5: '390'}
label2id: <class 'dict'> 6
{'135': 0, '136': 1, '137': 2, '138': 3, '139': 4, '390': 5}


The dataset consists of texts, labeled with one or more skills.

Let's create a list that contains the labels, as well as 2 dictionaries that map labels to integers and back.

In [11]:
# Upload the label list as a JSON file to the HF repo_id_dataset

labels_path = "labels.json"
with open(labels_path, 'w') as f:
    json.dump(labels, f)
print(f"labels saved to {labels_path}")

upload_file(
    path_or_fileobj = labels_path,
    path_in_repo    = labels_path,
    repo_id         = HF_name,
    repo_type       = "dataset"
    )
print(f"labels uploaded to https://huggingface.co/datasets/{HF_name}/tree/main/{labels_path}")

labels saved to labels.json


No files have been modified since last commit. Skipping to prevent empty commit.


labels uploaded to https://huggingface.co/datasets/claudelepere/skill_classification/tree/main/labels.json


## Preprocess data

As models like BERT don't expect text as direct input, but rather **`input_ids`**, etc., we tokenize the text using the tokenizer. Here I'm using the `AutoTokenizer` API, which will automatically load the appropriate tokenizer based on the checkpoint on the hub.

What's a bit tricky is that we also need to provide labels to the model. For multi-label text classification, this is a **matrix of shape (batch_size, num_labels)**. Also important: this should be a tensor of floats rather than integers, otherwise PyTorch' **BCEWithLogitsLoss** (which the model will use) will complain, as explained [here](https://discuss.pytorch.org/t/multi-label-binary-classification-result-type-float-cant-be-cast-to-the-desired-output-type-long/117915/3).

### Preprocess (examples, not example, because batched=True => examples is a batch)

In [12]:
# Tokenize 'text' in the 3 datasets, train, validation and test

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
def preprocess_data(examples, indices):
  text = examples['text']    # Batch of texts

  encoding = tokenizer(
      text,                             # Tokenize text
      truncation     = True,
      padding        = 'max_length',
      max_length     = 512,
      return_tensors = 'pt'             # Return PyTorch tensors
      )

  # Create an empty label matrix
  labels_matrix = torch.zeros((len(text), len(labels)), dtype=torch.float32)
  #print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")

  # Populate label matrix
  for idx, label in enumerate(labels):
    #print(f"idx:{idx} label:{label}")
    if label in examples:
      labels_matrix[:, idx] = torch.tensor(
          [1.0 if val else 0.0 for val in examples[label]],
          dtype=torch.float32
          )
  #print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")

  # Add labels to the encoding
  encoding['labels'] = labels_matrix
  #print(f"encoding['labels']: {encoding['labels']}")

  return encoding

In [14]:
# Create the 3 encoded datasets, train, validation and test

encoded_dataset = dataset.map(
    preprocess_data,
    batched        = True,
    remove_columns = dataset['train'].column_names,
    with_indices   = True
    )
train_dataset      = encoded_dataset['train']
validation_dataset = encoded_dataset['validation']
test_dataset       = encoded_dataset['test']
print(f"encoded_dataset: {type(encoded_dataset)} {encoded_dataset.shape}\n{encoded_dataset}")
print(f"train_dataset: {type(train_dataset)} {train_dataset.shape}\n{train_dataset}")
print(f"validation_dataset: {type(validation_dataset)} {validation_dataset.shape}\n{validation_dataset}")
print(f"test_dataset: {type(test_dataset)} {test_dataset.shape}\n{test_dataset}")

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

encoded_dataset: <class 'datasets.dataset_dict.DatasetDict'> {'train': (24000, 4), 'validation': (1500, 4), 'test': (4500, 4)}
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 24000
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4500
    })
})
train_dataset: <class 'datasets.arrow_dataset.Dataset'> (24000, 4)
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 24000
})
validation_dataset: <class 'datasets.arrow_dataset.Dataset'> (1500, 4)
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1500
})
test_dataset: <class 'datasets.arrow_dataset.Dataset'> (4500, 4)
Dataset({
    features: ['input_ids', 'toke

In [15]:
example = encoded_dataset['train'][0]
print(f"example: {type(example)} {example.keys()}\n{example}")

print()

print(f"example['input_ids']: {type(example['input_ids'])} {len(example['input_ids'])}\n{example['input_ids']}")
print(f"example['token_type_ids']: {type(example['token_type_ids'])} {len(example['token_type_ids'])}\n{example['token_type_ids']}")
print(f"example['attention_mask']: {type(example['attention_mask'])} {len(example['attention_mask'])}\n{example['attention_mask']}")
print(f"example['labels']:  {type(example['labels'])} {len(example['labels'])}\n{example['labels']}")

example: <class 'dict'> dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
{'input_ids': [101, 3972, 28100, 2618, 1011, 8930, 2449, 13717, 2968, 1006, 4647, 2213, 1007, 1004, 5325, 2968, 1006, 4642, 1007, 2449, 13717, 2968, 3972, 28100, 2618, 2194, 3972, 28100, 2618, 2003, 1037, 2088, 2877, 2658, 2578, 3813, 1010, 4346, 9529, 1998, 15727, 2075, 2578, 1010, 2968, 10552, 1998, 3423, 1998, 4171, 6040, 1012, 1999, 5706, 2057, 2024, 1996, 2922, 2658, 2326, 10802, 1012, 2256, 4822, 3749, 2578, 2000, 4800, 1011, 2120, 1998, 2312, 8593, 1010, 2270, 4896, 1998, 7601, 17897, 16670, 2235, 1010, 3435, 1011, 3652, 3316, 1012, 4283, 2000, 1037, 2844, 3164, 3739, 1998, 2256, 4800, 1011, 17972, 3921, 1010, 2057, 2024, 28946, 2872, 2000, 3113, 1996, 5918, 1997, 1037, 2898, 2846, 1997, 2270, 4896, 1998, 2235, 1998, 2312, 3316, 1012, 2256, 3891, 7319, 3218, 2003, 1037, 3795, 3003, 1999, 5094, 7846, 6133, 3891, 1998, 12503, 2013, 1996, 2604, 9954, 2000, 1996, 2897, 1012, 2057, 3073, 10

In [16]:
tokenizer.decode(example['input_ids'])

"[CLS] deloitte - consultant business continuity management ( bcm ) & crisis management ( cm ) business continuity management deloitte company deloitte is a world leading professional services firm, providing accounting and auditing services, management consulting and legal and tax advice. in belgium we are the largest professional service provider. our offices offer services to multi - national and large organisations, public institutions and innumerable small, fast - growing companies. thanks to a strong regional presence and our multi - disciplinary approach, we are ideally placed to meet the requirements of a wide range of public institutions and small and large companies. our risk advisory practice is a global leader in helping clients manage risk and uncertainty from the boardroom to the network. we provide a broad array of services that allow our clients around the world to better measure, manage and control risk to enhance the reliability of systems and processes throughout the

In [17]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['137', '138', '139']

Finally, we set the format of our data to PyTorch tensors. This will turn the training, validation and test sets into standard PyTorch [datasets](https://pytorch.org/docs/stable/data.html).

In [18]:
# Set PyTorch format to ensures correctness and compatibility with PyTorch pipelines

# The 3 Hugging Face Dataset are formatted as PyTorch Dataset
encoded_dataset.set_format('torch')

## Define model

Here we define a **model that includes a pre-trained base (i.e. the weights from bert-base-uncased) are loaded, with a random initialized classification head (linear layer) on top**. One should fine-tune this head, together with the pre-trained base on a labeled dataset.

This is also printed by the warning.

We set the `problem_type` to be "multi_label_classification", as this will make sure the appropriate loss function is used (namely [**BCEWithLogitsLoss**](https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html)). We also make sure the output layer has `len(labels)` output neurons, and we set the id2label and label2id mappings.

In [19]:
# Define the model

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
    )

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train the model!

We are going to train the model using HuggingFace's Trainer API. This requires us to define 2 things:

* `TrainingArguments`, which specify training hyperparameters. All options can be found in the [docs](https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments). Below, we for example specify that we want to evaluate after every epoch of training, we would like to save the model every epoch, we set the learning rate, the batch size to use for training/evaluation, how many epochs to train for, and so on.
* a `Trainer` object (docs can be found [here](https://huggingface.co/transformers/main_classes/trainer.html#id1)).

In [20]:
batch_size  = 8     # 8, 16, 32, 64, 128
metric_name = "f1"

### TrainingArguments

In [21]:
output_dir = "training_results"  # where model predictions and checkpoints will be written during training
args = TrainingArguments(
    output_dir                  = output_dir,
    overwrite_output_dir        = True,
    logging_dir                 = "logs",
    logging_steps               = 50,
    save_steps                  = 500,
    save_total_limit            = 2,
    eval_strategy               = "epoch",
    save_strategy               = "epoch",
    learning_rate               = 2e-5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = batch_size,
    num_train_epochs            = 5,
    weight_decay                = 0.01,
    load_best_model_at_end      = True,
    metric_for_best_model       = metric_name,
    #push_to_hub                 = True,             # push the model to the HF Hub
    run_name                   = "BERT-multilabel-lr2e5-epochs5-dataset_11_50000",
    report_to                  = "none" #"wandb"
    )

We are also going to compute metrics while training. For this, we need to define a `compute_metrics` function, that returns a dictionary with the desired metric values.

In [22]:
# Metrics
#   source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/

def multi_label_metrics(predictions, labels, threshold=0.2):
    _average = 'micro'    # 'micro' or 'weighted'

    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs   = sigmoid(torch.Tensor(predictions))

    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1

    # finally, compute metrics
    y_true               = labels
    f1                   = f1_score               (y_true=y_true, y_pred=y_pred, average=_average)    #, zero_division=1)
    precision            = precision_score        (y_true=y_true, y_pred=y_pred, average=_average)    #, zero_division=1)
    recall               = recall_score           (y_true=y_true, y_pred=y_pred, average=_average)    #, zero_division=1)
    roc_auc              = roc_auc_score          (y_true=y_true, y_score=probs, average=_average)
    precision_recall_auc = average_precision_score(y_true=y_true, y_score=probs, average=_average)
    accuracy             = accuracy_score         (y_true=y_true, y_pred=y_pred)

    # return as dictionary
    metrics = {
        'f1'                  : f1,
        'precision'           : precision,
        'recall'              : recall,
        'roc_auc'             : roc_auc,
        'precision_recall_auc': precision_recall_auc,
        'accuracy'            : accuracy
        }

    return metrics

In [23]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(
        predictions = preds,
        labels      = p.label_ids
        )
    return result

Let's verify a batch as well as a forward pass:

In [24]:
print(f"inputids:        {type(encoded_dataset['train']['input_ids'][0])}\t{encoded_dataset['train']['input_ids'][0].shape}")
print(f"token_type_ids': {type(encoded_dataset['train']['token_type_ids'][0])}\t{encoded_dataset['train']['token_type_ids'][0].shape}")
print(f"attention_mask:  {type(encoded_dataset['train']['attention_mask'][0])}\t{encoded_dataset['train']['attention_mask'][0].shape}")
print(f"labels:          {type(encoded_dataset['train'][0]['labels'])}\t{encoded_dataset['train'][0]['labels'].shape}")

inputids:        <class 'torch.Tensor'>	torch.Size([512])
token_type_ids': <class 'torch.Tensor'>	torch.Size([512])
attention_mask:  <class 'torch.Tensor'>	torch.Size([512])
labels:          <class 'torch.Tensor'>	torch.Size([6])


In [25]:
# Execute the forward pass

outputs = model(
    input_ids      = encoded_dataset['train']['input_ids'][0].unsqueeze(0),
    attention_mask = encoded_dataset['train']['attention_mask'][0].unsqueeze(0),
    labels         = encoded_dataset['train'][0]['labels'].unsqueeze(0)
    )

print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

outputs: <class 'transformers.modeling_outputs.SequenceClassifierOutput'> odict_keys(['loss', 'logits'])
SequenceClassifierOutput(loss=tensor(0.7118, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.0311,  0.1629,  0.1024, -0.0024,  0.0621,  0.2301]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


Let's start training!

In [26]:
## Create the trainer

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )


  trainer = Trainer(


In [None]:
# Train, save the results as a JSON file

train_output  = trainer.train()

train_results = {
    'global_step':   train_output.global_step,    # total steps completed during training
    'training_loss': train_output.training_loss,  # average loss during training
    'metrics':       train_output.metrics         # dictionary of metrics
}

with open("train_results.json", "w") as f:
  json.dump(train_results, f, indent=4)

Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Roc Auc,Precision Recall Auc,Accuracy
1,0.2948,0.299382,0.856199,0.77994,0.948987,0.946236,0.931845,0.403333
2,0.3016,0.273552,0.861951,0.784541,0.956309,0.954314,0.942269,0.398667


In [None]:
print("Training successfully completed.")

## Evaluate

After training, we evaluate our model on the validation set.

In [None]:
# Evaluate, save the output as a JSON file

eval_output = trainer.evaluate()

with open("eval_results.json", "w") as f:
  json.dump(eval_output, f, indent=4)

In [None]:
print("Evaluation successfully completed.")

In [None]:
# Save the model

model_path = "model"
trainer.save_model(model_path)

In [None]:
# Upload the model and the tokenizer to the HF repo_id_model

model     = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

model.push_to_hub(repo_id_model)
tokenizer.push_to_hub(repo_id_model)

In [None]:
# Upload train_results.json and eval_results.json to the HF repo_id_dataset
#   BETTER to upload to wanddb repo?

upload_file(
    path_or_fileobj = "train_results.json",
    path_in_repo    = "train_results.json",
    repo_id         = HF_name,
    repo_type       = "dataset"
    )

upload_file(
    path_or_fileobj = "eval_results.json",
    path_in_repo    = "eval_results.json",
    repo_id         = HF_name,
    repo_type       = "dataset"
    )

## Predict results on the test dataset
Faire la même chose pour l'évaluation?

In [None]:
# Set the model to evaluation mode to disable dropout and other training-specific behaviors

model.eval()

In [None]:
# Predict results on the test dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

all_preds       = []
all_probs       = []
all_true_labels = []
threshold       = 0.5

for batch in tqdm(test_loader):
  batch = {k: v.to(device) for k, v in batch.items()}
  with torch.no_grad():
    outputs = model(**batch)
  logits = outputs.logits

  # Convert logits to probabilities and probabilities to predictions
  sigmoid = torch.nn.Sigmoid()
  probs   = sigmoid(logits).cpu().numpy()    # Convert to Numpy
  preds   = (probs > threshold).astype(int)  # Convert to binary Numpy array

  # Accumulate probabilities, predictions and labels
  all_probs.append(probs)
  all_preds.append(preds)
  all_true_labels.append(batch['labels'].cpu().numpy())

# Concatenate results from all batches
all_probs       = np.concatenate(all_probs, axis=0)        # shape: [num_samples, num_labels]
all_preds       = np.concatenate(all_preds, axis=0)        # shape: [num_samples, num_labels]
all_true_labels = np.concatenate(all_true_labels, axis=0)  # shape: [num_samples, num_labels]

#np.set_printoptions(threshold=np.inf)
print(f"all_probs:       {type(all_probs)} {all_probs.shape} {all_probs}")
print(f"all_preds:       {type(all_preds)} {all_preds.shape} {all_preds}")
print(f"all_true_labels: {type(all_true_labels)} {all_true_labels.shape} {all_true_labels}")

# Classification report for precision, recall, F1 score
print(classification_report(
    y_true=all_true_labels,
    y_pred=all_preds,
    target_names=labels,
    zero_division=0
    ))

# ROC AUC for multi-label classification
roc_auc = roc_auc_score(
    y_true=all_true_labels,
    y_score=all_probs,
    average='micro'
    )
print(f"ROC AUC: {roc_auc}")

In [None]:
print("Predictions 1 successfully completed.")

### Or otherwise

In [None]:
prediction = trainer.predict(test_dataset)
print(predictions.predictions)  # Model logits
print(predictions.label_ids)    # Ground truth labels
print(prediction.metrics)       # Metrics

In [None]:
print("Predictions 2 successfully completed.")