<a href="https://colab.research.google.com/github/claudelepere/ML_GitHub/blob/main/NEW_11_48000_micro_micro_ep4_2e_5_fl05_40_thr05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q accelerate
!pip install -q huggingface_hub
!pip install -q scikit-learn
!pip install -q transformers datasets  # 2 Hugging Face libraries
!pip install -q wandb

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import datetime
import json
import logging
import numpy as np
import os
import pandas as pd
import pickle
import shutil
import sys
import time
import torch
import torch.nn.functional as F
import wandb

from contextlib             import suppress
from datasets               import DatasetDict
from google.colab           import auth, drive, files, userdata
from huggingface_hub        import create_branch, create_repo, HfApi, login, upload_file, hf_hub_download, whoami
from huggingface_hub.errors import RepositoryNotFoundError
from sklearn.metrics        import f1_score, precision_score, recall_score, roc_auc_score, average_precision_score, accuracy_score, hamming_loss, classification_report,  precision_recall_fscore_support
from torch.optim            import AdamW
from torch.utils.data       import DataLoader
from tqdm.auto              import tqdm
from transformers           import EvalPrediction, LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainerCallback, TrainingArguments
#from transformers           import logging as transformers_logging
from torch.nn               import BCEWithLogitsLoss, Module

In [3]:
# Is /content the current directory?
print(f"Current directory: {os.getcwd()}")

# Ensure the logs directory exists
os.makedirs('/content/logs', exist_ok=True)

# Create handlers for both file and console output
file_handler = logging.FileHandler('/content/logs/training.log')
file_handler.setLevel(logging.INFO)                               # Log level for file

#console_handler = logging.StreamHandler()
#console_handler.setLevel(logging.INFO)                            # Log level for console

# Create a formatter and attach it to both handlers
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
#console_handler.setFormatter(formatter)

# Add the handlers to the root logger
logger = logging.getLogger()
#logger.setLevel(logging.INFO)
logger.addHandler(file_handler)
#logger.addHandler(console_handler)

#transformers_logging.set_verbosity_error()  # Only show errors, suppress info

# Remove all existing handlers of type StreamHandler
#for handler in logger.handlers[:]:
#    if isinstance(handler, logging.StreamHandler):
#        logger.removeHandler(handler)

# Verify by printing current handlers
#print("handlers", logger.handlers)  # Should only show FileHandler

# Test logging to ensure both file and console work
logging.info("Testing log before training starts.")


Current directory: /content


## Google Cloud Authentication

In [4]:
#auth.authenticate_user()  # user = c.lepere@ictjob.be

## Get skills and jobs

In [5]:
skills         = 11
all_rows_low   = 0
all_rows_high  = 48000 # 120 1200 12000 24000 48000
num_datapoints = all_rows_high - all_rows_low

datasetDict_zip_file_name = f"dataset_EN_{skills}_{all_rows_low}_{all_rows_high}.zip"
datasetDict_dir_name      = os.path.splitext(datasetDict_zip_file_name)[0]

print(f"datasetDict_zip_file_name: {datasetDict_zip_file_name}")
print(f"datasetDict_dir_name     : {datasetDict_dir_name}")
print()


datasetDict_zip_file_name: dataset_EN_11_0_48000.zip
datasetDict_dir_name     : dataset_EN_11_0_48000



## Averages
<pre>
- per sample: metrics are computed for each sample (= for each instance, = for each row of y_true and y_pred), and then averaged across all samples.
- per label : metrics are computed for each label separately, and then averaged across all labels.
- per batch : metrics are computed for each batch, and then averaged across all batches.

- 'micro': Calculate metrics globally by counting the total true positives, false negatives and false positives.
           Gives more weight to frequent labels → best for imbalanced datasets where frequent labels are more important.
- 'macro': Calculate metrics for each label separately, and find their unweighted mean.
           This does not take label imbalance into account. This is fine for balanced datasets but not for imbalanced datasets since rare labels are given equal weights.
           Averages the metric for each label without considering their imbalance, without considering label frequency.
           When to use: when wanting equal importance for all labels, including rare ones.
           Treats all labels equally → best when you care about rare labels as much as frequent ones.
- 'weighted': Calculate metrics for each label separately, and find their average weighted by support (= the number of true instances for each label). This alters ‘macro’ to account for label imbalance;
              it can result in an F-score that is not between precision and recall.
              Averages per label weighted by their support, without considering label frequency.
              When to use: when wanting to reflect label imbalance (common labels contribute more).
              Like macro but considers label frequency → best if you want a compromise between macro and micro.
- 'samples': Calculate metrics per sample instead of per label, and find their average (only meaningful for multilabel-classification where this differs from accuracy_score).
             Computes the metric per sample and then averages across all samples.
             When to use: when each sample has multiple correct labels.

- 'macro' or 'weighted' AUC is often best because AUC isn't as affected by class imbalance as F1/Precision/Recall
- 'macro'      AUC: usually the best because it treats all labels equally, avoiding the dominance of frequent labels
- 'weighted'   AUC: similar to macro but considers label frequency
- 'macro'   PR AUC: best for imbalanced datasets because it treats rare labels fairly
- 'weighted PR AUC: also good, but slightly biased toward frequent labels

PR AUC is better than ROC AUC when you care about positive examples in imbalanced data.
</pre>

In [6]:
training_average   = 'micro'             # 'weighted' (best) or 'samples
evaluation_average = 'micro'             # 'macro'    (best) or 'weighted'
test_average       = evaluation_average  #
prediction_average = 'micro'             # 'micro'    (best) or 'samples' (prediction of unseen datapoints)

## Tune thresholds?

In [7]:
threshold_tuning = True

## Upload to HF Hub?

In [8]:
upload_to_HF = True
repo_id      = ''
timestamp    = ''


## Hugging Face Hub (HF Hub) authenticate

In [9]:
if upload_to_HF is True:

  os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")    # Store the key in os.environ
  hf_token               = os.environ.get('HF_TOKEN')

  login(token=hf_token)

  # Check
  user = whoami(token=hf_token)
  assert user['name'] == 'claudelepere', f"{user['name']} is not claudelepere"
  print(f"user: {user}")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


user: {'type': 'user', 'id': '66ec3d5f61228b02f8780beb', 'name': 'claudelepere', 'fullname': 'Claude Lepère', 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/66ec3d5f61228b02f8780beb/gvnf9pvm2KvE90ETMUQo3.jpeg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'jobs_token', 'role': 'fineGrained', 'createdAt': '2025-01-04T17:44:35.493Z', 'fineGrained': {'canReadGatedRepos': False, 'global': [], 'scoped': [{'entity': {'_id': '66ec3d5f61228b02f8780beb', 'type': 'user', 'name': 'claudelepere'}, 'permissions': ['repo.content.read', 'repo.write']}]}}}}


## repo_id, branch, model and dataset repos on HF Hub
**1 repo = 1 model and 1 tokenizer**

**branch = revision**

In [10]:
if upload_to_HF is True:

  if threshold_tuning:
    repo_id   = 'claudelepere/jobs_EN_11_48000_032308'
  else:
    repo_id   = 'claudelepere/jobs_EN_11_48000_032308_no_threshold_tuning'

  timestamp = f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"

  model_repoUrl   = create_repo(repo_id=repo_id, repo_type="model",   private=True, exist_ok=True)
  dataset_repoUrl = create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)

  #create_branch(repo_id=repo_id, repo_type="model",   branch=branch, exist_ok=True)
  #create_branch(repo_id=repo_id, repo_type="dataset", branch=branch, exist_ok=True)

  print(f"Model Repo Url: {model_repoUrl} created successfully as a private repo")
  print(f"Dataset Repo Url: {dataset_repoUrl} created successfully as a private repo")

Model Repo Url: https://huggingface.co/claudelepere/jobs_EN_11_48000_032308 created successfully as a private repo
Dataset Repo Url: https://huggingface.co/datasets/claudelepere/jobs_EN_11_48000_032308 created successfully as a private repo


## HF model card
Model card here => README.md on the HF Hub.

In [11]:
model_card = """
---
tags:
- "48000"
---
# Model
Model fine-tuned on higly imbalanced multilabel classification.

## Model details
- Language: English
- Task: Multilabel classification
- Architecture: Longformer
- Pretrained model: [allenai/longformer-base-4096](https://huggingface.co/allenai/longformer-base-4096)
- Framework: Pytorch
- Version 1.0.0

## Training Data
- skills: 11
- 48000 job datapoints

## Fine-tuning parameters
- batch size: 8
- gradient accumulation: 4
- fp16 precision
- input tokens max length: 1024
- epochs: 4
- learning rate: 2e-5
- attention window size: 1024
- training average: micro
- evaluation average: micro
- test average: micro
- prediction average: micro
- threshold tuning: True
- threshold: 0.5
- focal loss: alpha=0.5, gamma=4.0
"""

## Save locally and upload model card to HF Hub

In [12]:
if upload_to_HF is True:
    name            = "model_card"
    model_card_path = f"{name}.md"

    with open(model_card_path, "w") as f:
        f.write(model_card)

    upload_file(
        path_or_fileobj = model_card_path,
        path_in_repo    = 'README.md',
        repo_id         = repo_id,
        commit_message  = f"{name}_{timestamp}"
    )

    print(f"{name} results successfully uploaded to HF Hub as {model_card_path}")

model_card results successfully uploaded to HF Hub as model_card.md


In [13]:
"""
# Check the Python version
print(sys.version)
print()

# Get the installed packages (you can see that conda is not installed (do not install it))
!pip list
print()

# Check system information
!cat /etc/os-release
!uname -m
print()

# Check the GPU details (only if the runtime type is T4 GPU)
#!nvidia-smi
#print()

# Check RAM
!free -h
print()

# Check disk space
!df -h
print()

# Get environment variables
for key, value in os.environ.items():
    print(f"{key}: {value}")
"""
!python -V

Python 3.11.11


In [14]:
print(f"currentdir: {os.getcwd()}")

currentdir: /content


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

device: cuda


## Out Of Memory (OOM)

### OOM: reduce batch size
      small sizes (1 to 32):            PROs: better generalization in some cases
                                        CONs: may produce noisier gradients
      large sizes (128, 256, or higer): PROs: gradients are smoother, leading to more stable training
                                        CONs: poorer generalization (overfitting) in some cases
      intermediate sizes (32, 64):      combines the benefits of small and large sizes

In [16]:
batch_size = 8

### OOM: enable gradient accumulation

* compensate for smaller batch sizes by accumulating gradients over several steps
* **effective batch size** = per-device batch size x gradient acumulation steps
* in each iteration, the model computes the gradients, these gradients are immediately used to update the model parameters

WARNING: gradient_accumulation_steps may not be None => comment it in TrainingArguments

In [17]:
gradient_accumulation_steps = 4

### OOM: use PYTORCH_CUDA_ALLOC_CONF to handle memory fragmentation

In [18]:
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

### OOM: check for and kill zombie processes

In [19]:
!ps aux | grep python
!kill -9 <PID>
if torch.cuda.is_available():
  !nvidia-smi
  print(torch.cuda.memory_summary())

root          85  4.0  0.0      0     0 ?        Z    07:14   0:14 [python3] <defunct>
root          86  0.1  0.0  77348 57416 ?        S    07:14   0:00 python3 /usr/local/bin/colab-file
root         135  1.2  0.1 793104 130668 ?       Sl   07:14   0:04 /usr/bin/python3 /usr/local/bin/j
root        1118 17.9  1.5 12386652 1347224 ?    Ssl  07:18   0:20 /usr/bin/python3 -m colab_kernel_
root        1156  0.6  0.0 544720 20292 ?        Sl   07:18   0:00 /usr/bin/python3 /usr/local/lib/p
root        1813  0.0  0.0   7376  3500 ?        S    07:20   0:00 /bin/bash -c ps aux | grep python
root        1815  0.0  0.0   6484  2324 ?        S    07:20   0:00 grep python
/bin/bash: -c: line 1: syntax error near unexpected token `newline'
/bin/bash: -c: line 1: `kill -9 <PID>'
Sun Mar 23 07:20:20 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----

### OOM: use fp16 (half precision) mixed precision training
reduces memory requirements by up to 50%

In [20]:
fp = 'fp16'

### OOM: limit the number of GPU workers:
* 0 (default) or 1
* in Colab dataloader_num_workers = 1

### OOM: reduce model size or input tokens
* LongformerTokenizer.from_pretrained('allenai/longformer-base/large-4096'): large/base: 435M/149M parameters
* max_length: 4096 max for Longformer
* a single word can be equal to several tokens; stop words are **NOT discarded**!
* word_text_length_counts_sorted:
      jobs count                 : 50000
      jobs count under  512 words: 44794  89.59%
      jobs count under  640 words: 47894  95.79%
      jobs count under  768 words: 49123  98.25%
      jobs count under  896 words: 49691  99.38%
      jobs count under 1024 words: 49917  99.83%
      jobs count under 2048 words: 50000 100.00%
      jobs count under 4096 words: 50000 100.00%

In [21]:
#max_length =  768    #      37 min    #
max_length = 1024    #      38 min    # GPU RAM: 12.2 / 40 GB
#max_length = 2048    # 1 hr 10 min    # GPU RAM: 21.4 / 40 GB
#max_length = 4096    # 2 hr 10 min    # GPU RAM: 39.5 / 40 GB => OutOfMemoryError

### OOM: free up GPU memory

In [22]:
torch.cuda.empty_cache()

### OOM: reduce the number of transformers layers

In [23]:
# hidden_layers = 6  # default:12

## epoch
* 1 epoch is a complete pass through the entire training dataset
* with n datapoints and batch size = b, n/b iterations to complete 1 epoch
* 1 iteration is a single update of the model's parameters

In [24]:
epochs = 4

## learning rate
* A common rule is to scale the learning rate proportionaly with the effective batch size
* **note: get_linear_schedule_with_warmup**

In [25]:
learning_rate = 2e-5 #1e-5  # 1e-5 x 32/8

## threshold
default: 0.5

In [26]:
threshold = 0.5

## attention window size

In [27]:
attention_window = 1024 #512

## Upload and unzip job dataset

In [28]:
def upload_unzip_dataset(filename):
    """Upload and unzip the dataset to /content, ensuring correct placement."""

    # Get the expected directory name (same as the zip filename without extension)
    expected_dir = os.path.splitext(filename)[0]

    # Check if the file and the directory exist in /content and delete them
    with suppress(FileNotFoundError):
        if os.path.isdir(expected_dir):
            shutil.rmtree(expected_dir)  # Remove directory if it exists
        if os.path.isfile(filename):
            os.remove(filename)          # Remove file if it exists

    print(f"Removed '{expected_dir}' and '{filename}' if they were present in /content.")

    # Upload the zip file
    uploaded_files = files.upload()  # Prompt file upload dialog

    if filename not in uploaded_files:
        raise FileNotFoundError(f"'{filename}' was not uploaded.")

    print(f"'{filename}' successfully uploaded to /content.")

    # Unzip the file to /content
    shutil.unpack_archive(filename, "/content")

    print(f"Unzipped to '/content/{expected_dir}'.")

# Usage
upload_unzip_dataset(datasetDict_zip_file_name)

Removed 'dataset_EN_11_0_48000' and 'dataset_EN_11_0_48000.zip' if they were present in /content.


Saving dataset_EN_11_0_48000.zip to dataset_EN_11_0_48000.zip
'dataset_EN_11_0_48000.zip' successfully uploaded to /content.
Unzipped to '/content/dataset_EN_11_0_48000'.


## W&B initialization (not used now)

In [29]:
run_name = f"EN_{skills}_{all_rows_low}_{all_rows_high}_ml{max_length}_ep{epochs}_lr{learning_rate}_th{threshold}_at{attention_window}_{fp}"

if 'gradient_accumulation_steps' not in globals():
  run_name = f"{run_name}_ba{batch_size}"
else:
  run_name = f"{run_name}_ba{batch_size}x{gradient_accumulation_steps}"

print(f"run_name: {run_name}")

run_name: EN_11_0_48000_ml1024_ep4_lr2e-05_th0.5_at1024_fp16_ba8x4


In [30]:
os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")        # Store the key in os.environ
wandb_api_key               = os.environ.get('WANDB_API_KEY')
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mclaudelepere[0m ([33mclaudelepere-c-cile-cy[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [31]:
try:
  wandb.init(
      project = "skill_classification",
      name    = run_name,
      entity  = "claudelepere-c-cile-cy",
      config  = {
          "learning_rate": learning_rate,
          "epochs"       : epochs,
          "batch_size"   : batch_size
      }
  )
except wandb.CommError as err:
  print(f"CommError: {err}")
except Exception as exc:
  print(f"Exception: {exc}")

## Create datasetDict (HF DatasetDict) = 3 HF Dataset, train, validation and test

In [32]:
datasetDict = DatasetDict.load_from_disk(datasetDict_dir_name)

In [33]:
print(f"datasetDict: {type(datasetDict)} {datasetDict.shape}\n{datasetDict}")
print(f"datasetDict.keys(): {datasetDict.keys()}")
print(f"datasetDict['train']:      {type(datasetDict['train'])}      {datasetDict['train'].shape}")
print(f"datasetDict['validation']: {type(datasetDict['validation'])} {datasetDict['validation'].shape}")
print(f"datasetDict['test']:       {type(datasetDict['test'])}       {datasetDict['test'].shape}")

datasetDict: <class 'datasets.dataset_dict.DatasetDict'> {'train': (38400, 8), 'validation': (4800, 8), 'test': (4800, 8)}
DatasetDict({
    train: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 38400
    })
    validation: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 4800
    })
    test: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 4800
    })
})
datasetDict.keys(): dict_keys(['train', 'validation', 'test'])
datasetDict['train']:      <class 'datasets.arrow_dataset.Dataset'>      (38400, 8)
datasetDict['validation']: <class 'datasets.arrow_dataset.Dataset'> (4800, 8)
datasetDict['test']:       <class 'datasets.arrow_dataset.Dataset'>       (4800, 8)


In [34]:
example = datasetDict['train'][0]
print(f"datasetDict['train'][0]: {type(example)} {example.keys()}\n{example}")

datasetDict['train'][0]: <class 'dict'> dict_keys(['id', 'text', '390', '135', '136', '137', '138', '139'])
{'id': 58861, 'text': 'Gentis - Project Coordinator iOS/ Android   Project Coordinator iOS/ Android   Gentis   For one of my client located in Brussels, I am looking for an experienced business analyst / project coordinator to migrate an existing application to mobile platforms (smartphones). Function You will be in charge of coordinating activities with 2 developers. The project starts on January 2015 for an initial length of 3 months. Profile Minimum 2 successful projects implementations as a business analyst / project coordinator Experience on mobile applications migration is requested French or Dutch + English Feel free apply if you are interested in this role.', '390': False, '135': False, '136': False, '137': True, '138': True, '139': True}


## Create labels (list), id2label (dict) and label2id (dict).
* dataset 7_1000_125_125  ,  48 labels
* dataset 7_128_18_54     ,  42 labels
* dataset 8910_1087_68_204, 206 labels
* dataset 11_1000         ,   6 labels

In [35]:
labels = [label for label in datasetDict['train'].features.keys() if label not in ['id', 'text']]
print(f"labels: {type(labels)} {len(labels)}\n{labels}")

num_labels = len(labels)

id2label = {idx: label for idx, label in enumerate(labels)}
print(f"id2label: {type(id2label)} {len(id2label)}\n{id2label}")

label2id = {label: idx for idx, label in enumerate(labels)}
print(f"label2id: {type(label2id)} {len(label2id)}\n{label2id}")

labels: <class 'list'> 6
['390', '135', '136', '137', '138', '139']
id2label: <class 'dict'> 6
{0: '390', 1: '135', 2: '136', 3: '137', 4: '138', 5: '139'}
label2id: <class 'dict'> 6
{'390': 0, '135': 1, '136': 2, '137': 3, '138': 4, '139': 5}


## Load the pretrained tokenizer and the model

In [36]:
model_name = "allenai/longformer-base-4096"

In [37]:
tokenizer = LongformerTokenizerFast.from_pretrained(model_name)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [38]:
model = LongformerForSequenceClassification.from_pretrained(
    model_name,
    num_labels   = num_labels,
    id2label     = id2label,
    label2id     = label2id,
    problem_type = 'multi_label_classification'
)

# Configure attention window size
model.config.attention_window = attention_window

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

## Tokenize ('input_ids' and 'attention_mask'), add 'global_attention_mask' (for Longformer), add 'labels'

In [40]:
def preprocess_data(examples, indices):
  # Step 1: Extract text and tokenize
  text = examples['text']             # Batch of texts
  encoding = tokenizer(
      text,                           # Tokenize text
      truncation     = True,
      padding        = 'max_length',
      max_length     = max_length,
      return_tensors = 'pt'           # Return PyTorch tensors
  )

  # Step 2: Create and add the global attention mask
  global_attention_mask             = torch.zeros_like(encoding['input_ids'])  # Initialize global attention mask with zeros (same shape as input_ids)
  global_attention_mask[:, 0]       = 1                                        # Set global attention on the first token ([CLS], token ID=0) in each sequence
  encoding['global_attention_mask'] = global_attention_mask                    # Add the global_attention_mask to the batch

  # Step 3: Create and populate the label matrix
  labels_matrix = torch.zeros((len(text), len(labels)), dtype=torch.float32)   # Create an empty label matrix
  #print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")
  #---------Populate label matrix
  for idx, label in enumerate(labels):
    #print(f"idx:{idx} label:{label}")
    if label in examples:
      labels_matrix[:, idx] = torch.tensor(
          [1.0 if val else 0.0 for val in examples[label]],
          dtype=torch.float32
          )
  print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")

  encoding['labels'] = labels_matrix                                           # Add labels to the encoding
  print(f"encoding['labels']: {type(encoding['labels'])} {encoding['labels'].shape}")

  # encoding: <class 'transformers.tokenization_utils_base.BatchEncoding'> dict_keys(['input_ids', 'attention_mask', 'global_attention_mask', 'labels'])
  #   'input_ids': tensor([[
  #   'attention_mask': tensor([[
  #   'global_attention_mask': tensor([[
  #   'labels': tensor([[
  #print(f"1 preprocess_data call: encoding: {type(encoding)} {encoding.keys()}")

  return encoding

## Create encoded_dataset (datasets.dataset_dict.DatasetDict) = 3 encoded datasets.arrow_dataset.Dataset, train, validation and test

In [41]:
encoded_dataset = datasetDict.map(
    preprocess_data,
    batched        = True,
    remove_columns = datasetDict['train'].column_names,
    with_indices   = True
)

print(f"encoded_dataset: {type(encoded_dataset)} shape={encoded_dataset.shape}")

Map:   0%|          | 0/38400 [00:00<?, ? examples/s]

labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])


model.safetensors:   0%|          | 0.00/597M [00:00<?, ?B/s]

labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])


Map:   0%|          | 0/4800 [00:00<?, ? examples/s]

labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([800, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([800, 6])


Map:   0%|          | 0/4800 [00:00<?, ? examples/s]

labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([800, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([800, 6])
encoded_dataset: <class 'datasets.dataset_dict.DatasetDict'> shape={'train': (38400, 4), 'validation': (4800, 4), 'test': (4800, 4)}


In [42]:
encoded_dataset.set_format('torch')
train_dataset      = encoded_dataset['train']
validation_dataset = encoded_dataset['validation']
test_dataset       = encoded_dataset['test']

print(f"train_dataset_tensor:                          {type(train_dataset)}                              {train_dataset.shape} {train_dataset.features}\n{train_dataset}")
print(f"train_dataset_tensor['input_ids']:             {type(train_dataset['input_ids'])}             len={len(train_dataset['input_ids'])}             shape={train_dataset['input_ids'].shape}            ") #\n{train_dataset['input_ids']}")
print(f"train_dataset_tensor['attention_mask']:        {type(train_dataset['attention_mask'])}        len={len(train_dataset['attention_mask'])}        shape={train_dataset['attention_mask'].shape}       ") #\n{train_dataset['attention_mask']}")
print(f"train_dataset_tensor['global_attention_mask']: {type(train_dataset['global_attention_mask'])} len={len(train_dataset['global_attention_mask'])} shape={train_dataset['global_attention_mask'].shape}") #\n{train_dataset['global_attention_mask']}")
print(f"train_dataset_tensor['labels']:                {type(train_dataset['labels'])}                len={len(train_dataset['labels'])}                shape={train_dataset['labels'].shape}               ") #\n{train_dataset['labels']}")

train_dataset_tensor:                          <class 'datasets.arrow_dataset.Dataset'>                              (38400, 4) {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'global_attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)}
Dataset({
    features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
    num_rows: 38400
})
train_dataset_tensor['input_ids']:             <class 'torch.Tensor'>             len=38400             shape=torch.Size([38400, 1024])            
train_dataset_tensor['attention_mask']:        <class 'torch.Tensor'>        len=38400        shape=torch.Size([38400, 1024])       
train_dataset_tensor['global_attention_mask']: <class 'torch.Tensor'> len=38400 shape=torch.Size([38400, 1024])
train_dataset_tensor

## Truncated part

In [43]:
def get_truncated_part(text):
  tokens = tokenizer(
      text,
      truncation                = True,
      padding                   = 'max_length',
      max_length                = max_length,
      return_overflowing_tokens = True,
      return_tensors            = None
  )
  print(f"tokens.keys(): {tokens.keys()}")

  # Get the truncated tokens
  truncated_ids = tokens["input_ids"][0]
  print(f"truncated_ids: {type(truncated_ids)} {truncated_ids}")
  #overflow_ids  = tokens["overflow_to_sample_mapping"][0]
  #print(f"overflow_ids: {type(overflow_ids)} {overflow_ids}")

  # Decode the tokens back to text
  truncated_text = tokenizer.decode(truncated_ids, skip_special_tokens=True)
  #overflow_text  = tokenizer.decode(overflow_ids, skip_special_tokens=True)

  print(f"original_text :\n{text}")
  print(f"truncated_text:\n{truncated_text}")
  #print(f"overflow_text:\n{overflow_text}")

  original_tokens  = tokenizer.tokenize(text)
  truncated_tokens = tokenizer.tokenize(truncated_text)
  #overflow_tokens  = tokenizer.tokenize(overflow_text)

  print(f"original_tokens count : {len(original_tokens)}")
  print(f"truncated_tokens count: {len(truncated_tokens)}")
  #print(f"overflow_tokens count: {len(overflow_tokens)}")

In [44]:
example_text = datasetDict['train'][0]['text']
#get_truncated_part(example_text)

In [45]:
inputs = tokenizer(
    example_text,
    truncation     = True,
    padding        = 'max_length',
    max_length     = max_length,
    return_tensors = 'pt'
)

## Forward pass for multi-label classification

In [46]:
outputs = model(
    input_ids      = inputs.input_ids,
    attention_mask = inputs.attention_mask
)

Initializing global attention on CLS token...


In [47]:
print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

outputs: <class 'transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput'> odict_keys(['logits'])
LongformerSequenceClassifierOutput(loss=None, logits=tensor([[-0.1534,  0.0319,  0.0607, -0.0543, -0.0856, -0.0277]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, global_attentions=None)


In [48]:
# Logits (= raw model outputs)
logits = outputs.logits
print(f"logits: {type(logits)} {logits.shape}\n{logits}")

logits: <class 'torch.Tensor'> torch.Size([1, 6])
tensor([[-0.1534,  0.0319,  0.0607, -0.0543, -0.0856, -0.0277]],
       grad_fn=<AddmmBackward0>)


In [49]:
# Convert logits to probabilities
sigmoid = torch.nn.Sigmoid()
probs   = sigmoid(logits)
print(f"probs: {type(probs)} {probs.shape}\n{probs}")

probs: <class 'torch.Tensor'> torch.Size([1, 6])
tensor([[0.4617, 0.5080, 0.5152, 0.4864, 0.4786, 0.4931]],
       grad_fn=<SigmoidBackward0>)


In [50]:
example = encoded_dataset['train'][0]

In [51]:
print(f"example: {type(example)} {example.keys()}\n{example}")
print()
#print(f"example['input_ids']: {type(example['input_ids'])} {len(example['input_ids'])}\n{example['input_ids']}")
#print(f"example['attention_mask']: {type(example['attention_mask'])} {len(example['attention_mask'])}\n{example['attention_mask']}")
#print(f"example['labels']:  {type(example['labels'])} {len(example['labels'])}\n{example['labels']}")

example: <class 'dict'> dict_keys(['input_ids', 'attention_mask', 'global_attention_mask', 'labels'])
{'input_ids': tensor([   0,  534, 1342,  ...,    1,    1,    1]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]), 'global_attention_mask': tensor([1, 0, 0,  ..., 0, 0, 0]), 'labels': tensor([0., 0., 0., 1., 1., 1.])}



In [52]:
tokenizer.decode(example['input_ids'])

'<s>Gentis - Project Coordinator iOS/ Android   Project Coordinator iOS/ Android   Gentis   For one of my client located in Brussels, I am looking for an experienced business analyst / project coordinator to migrate an existing application to mobile platforms (smartphones). Function You will be in charge of coordinating activities with 2 developers. The project starts on January 2015 for an initial length of 3 months. Profile Minimum 2 successful projects implementations as a business analyst / project coordinator Experience on mobile applications migration is requested French or Dutch + English Feel free apply if you are interested in this role.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p

In [53]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['137', '138', '139']

## Set PyTorch format to ensures correctness and compatibility with PyTorch pipelines
The 3 Hugging Face Dataset are formatted as PyTorch Dataset.

In [54]:
encoded_dataset.set_format('torch')

## Workflow

- 3 steps: training, evaluation, test
- 3 datasets: train, validation, test
- 3 Trainer functions: train, evaluate, predict
---
* training uses train_dataset
* evaluation uses validation_dataset
* test uses test_dataset

## Training step


In [55]:
batch_size  = batch_size
metric_name = "f1"

In [56]:
print(f"input_ids:              {type(encoded_dataset['train']['input_ids'][0])}\t{encoded_dataset['train']['input_ids'][0].shape}")
print(f"attention_mask:         {type(encoded_dataset['train']['attention_mask'][0])}\t{encoded_dataset['train']['attention_mask'][0].shape}")
print(f"global_attention_mask:  {type(encoded_dataset['train']['global_attention_mask'][0])}\t{encoded_dataset['train']['global_attention_mask'][0].shape}")
print(f"labels:                 {type(encoded_dataset['train'][0]['labels'])}\t{encoded_dataset['train'][0]['labels'].shape}")

input_ids:              <class 'torch.Tensor'>	torch.Size([1024])
attention_mask:         <class 'torch.Tensor'>	torch.Size([1024])
global_attention_mask:  <class 'torch.Tensor'>	torch.Size([1024])
labels:                 <class 'torch.Tensor'>	torch.Size([6])


### Execute a forward pass for debugging or verification purposes (cf. BERT_3_1 in Notion BERT database)

In [57]:
outputs = model(
    input_ids      = encoded_dataset['train']['input_ids'][0].unsqueeze(0),
    attention_mask = encoded_dataset['train']['attention_mask'][0].unsqueeze(0),
    labels         = encoded_dataset['train'][0]['labels'].unsqueeze(0)
)

In [58]:
print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

outputs: <class 'transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput'> odict_keys(['loss', 'logits'])
LongformerSequenceClassifierOutput(loss=tensor(0.7029, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.1534,  0.0319,  0.0607, -0.0543, -0.0856, -0.0277]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, global_attentions=None)


### Weighted loss function
**weight and pos_weight**:
- torch.nn.BCEWithLogitsLoss function is a commonly used loss function for binary classification problems, where model output is a probability value between 0 and 1. It combines a sigmoid activation function with a binary cross-entropy loss.
- For imbalanced datasets, where number of a class is significantly smaller than other, BCEWithLogitsLoss can be modified by adding a weight parameter to loss function.<br/>
BCEWithLogitsLoss also has a pos_weight parameter, which is a simpler way to specify weight for positive class (equivalent to weight parameter = [ 1, pos_weight], where weight for negative class = 1.<br>
Negative samples (0s) are not weighted explicitly because the loss function already balances them implicitly.
- pos_weights stands for positive weights in the BCEWithLogitsLoss function.<br/>
In multi-label classification, each label is a separate binary classification problem.<br/>
Each label has positive sample (1s) and negative samples (0s) in the dataset.
    - If a label is rare (fewer 1s), its weight will be higher -> encourages the model to predict it more often
    - If a label is common (many 1s), its weight will be lower -> prevents the model to overpredicting it

**Normalization**:
Without normalization, pos_weights might have huge variations across labels, that could destabilize training.
- Min-Max Scaling:
    - Rescales values between 0 and 1
    - Reduces large variations but keeps relative ranking
- Z-Score Normalization:
    - Centers values around 0 with a standard deviation of 1
    - Handles outliers better than min-max
- Sum-to-One Scaling:
    - Makes weights sum to 1, preventing extremely large values
- Recommended approach: try sum-to-one normalization first. If performance is unstable, test z-score.

### Weighted BCEWithLogitsLoss
Assigns higher weights to rare labels using class weights.

In [59]:
def class_weights(labels):
    print(f"labels: {type(labels)} len={len(labels)} shape={labels.shape}\n{labels}")

    num_samples, num_labels = labels.shape
    print(f"num_samples: {type(num_samples)} {num_samples}")
    print(f"num_labels:  {type(num_labels)}  {num_labels}")

    # class_counts = how many times each label appears (i.e. number of 1s per label)
    # (dim=0 means summing across all samples; equivalent to axis = 0 for Pandas DataFrame)
    class_counts = labels.sum(dim=0)
    print(f"class_counts: {type(class_counts)} len={len(class_counts)}\n{class_counts}")

    # pos_weights = negative samples (0s) per label / positive samples (1s) per label
    pos_weights = (num_samples-class_counts) / (class_counts + 1e-6)  # Avoid division by zero
    print(f"pos_weights: {type(pos_weights)} len={len(pos_weights)}\n{pos_weights}")

    # Normalization
    normalized_pos_weights_minmax = (pos_weights-pos_weights.min()) / (pos_weights.max()-pos_weights.min())
    print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

    normalized_pos_weights_zscore = (pos_weights-pos_weights.mean()) / pos_weights.std()
    print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

    normalized_pos_weights_sum1 = pos_weights / pos_weights.sum()
    print(f"normalized_pos_weights_sum1: {type(normalized_pos_weights_sum1)} {len(normalized_pos_weights_sum1)} {normalized_pos_weights_sum1}")

    #return pos_weights
    #return normalized_pos_weights_minmax
    #return normalized_pos_weights_zscore
    return normalized_pos_weights_sum1

In [60]:
pos_weights = class_weights(encoded_dataset['train']['labels'])
bce_loss_fn = BCEWithLogitsLoss(pos_weight=pos_weights.to(device))

labels: <class 'torch.Tensor'> len=38400 shape=torch.Size([38400, 6])
tensor([[0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        ...,
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.]])
num_samples: <class 'int'> 38400
num_labels:  <class 'int'>  6
class_counts: <class 'torch.Tensor'> len=6
tensor([ 3131.,  2150.,  8401., 27341., 34391., 29145.])
pos_weights: <class 'torch.Tensor'> len=6
tensor([11.2645, 16.8605,  3.5709,  0.4045,  0.1166,  0.3176])
normalized_pos_weights_minmax: <class 'torch.Tensor'> 6 tensor([0.6658, 1.0000, 0.2063, 0.0172, 0.0000, 0.0120])
normalized_pos_weights_zscore: <class 'torch.Tensor'> 6 tensor([ 0.8299,  1.6248, -0.2630, -0.7128, -0.7537, -0.7252])
normalized_pos_weights_sum1: <class 'torch.Tensor'> 6 tensor([0.3462, 0.5182, 0.1098, 0.0124, 0.0036, 0.0098])


### Focal Loss
Reduces the impact of easy examples (majority class) and focuses on difficult cases.

- α (alpha): Adjusts class weighting (0.5 means equal weight). Higher α gives more weight to minority classes.
- γ (gamma): Controls how much hard-to-classify samples are emphasized. Higher γ reduces the influence of easy samples.

In [61]:
class FocalLoss(Module):
    """
    Focal Loss implementation for handling class imbalance.
    """
    def __init__(self, alpha=1.0, gamma=2.0, logits=True, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha     = alpha
        self.gamma     = gamma
        self.logits    = logits     # True if inputs are logits, False if probabilies
        self.reduction = reduction  # 'mean' or 'none'

    # inputs  = model's predictions: PyTorch tensor, shape=(batch_size, num_classes)
    # targets = ground truth labels: PyTorch tensor, shape=same as inputs shape
    def forward(self, inputs, targets):
        targets = targets.to(inputs.device)  # Ensure labels are on the same device

        #print(f"inputs: {type(inputs)} {inputs.shape}\ntargets: {type(targets)} {targets.shape}"
        # Here, we check if input is probability or logits
        if self.logits:
            BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        else:
            BCE_loss = F.binary_cross_entropy(inputs, targets, reduction='none')

        pt         = torch.exp(-BCE_loss)  # Probability of the correct class
        focal_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

        return focal_loss.mean() if self.reduction == 'mean' else focal_loss

    def __repr__(self):
        return f"FocalLoss(alpha={self.alpha}, gamma={self.gamma}, logits={self.logits}, reduction={self.reduction})"

    def __str__(self):
        return self.__repr__()


In [62]:
#focal_loss_fn = FocalLoss(alpha=0.5, gamma=3.0, logits=True, reduction='mean')
#focal_loss_fn = FocalLoss(alpha=0.25, gamma=4.0, logits=True, reduction='mean')
focal_loss_fn = FocalLoss(alpha=0.5, gamma=4.0, logits=True, reduction='mean')
#focal_loss_fn = FocalLoss(alpha=0.625, gamma=4.0, logits=True, reduction='mean')
print(f"focal_loss_fn: {type(focal_loss_fn)} {focal_loss_fn}")

focal_loss_fn: <class '__main__.FocalLoss'> FocalLoss(alpha=0.5, gamma=4.0, logits=True, reduction=mean)


In [63]:
class LossLoggerCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        # Retrieve loss from logs
        logs       = kwargs.get("logs", {})
        epoch_loss = logs.get("logs", None)  # Loss from the Trainer logs

        if epoch_loss is not None:
            logging.info(f"Epoch {state.epoch:.0f} - Average Loss: {epoch_loss:.6f}")
            #print(f"Epoch {state.epoch:.0f} - Average Loss: {epoch_loss:.6f}", flush=True)
        else:
            logging.warning(f"Epoch {state.epoch:.0f} - No loss logged!")
            #print(f"Epoch {state.epoch:.0f} - No loss logged!", flush=True)

#

In [64]:
class MetricsLoggerCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics is None:
            metrics = kwargs.get("metrics")  # Ensure we get the metrics if passed in kwargs
        if metrics:                          # Check if metrics exist
            logging.info(f"Epoch {state.epoch:.0f} - "
                         f"Precision: {metrics.get('precision', float('nan')):.4f} - "
                         f"Recall: {metrics.get('recall', float('nan')):.4f} - "
                         f"F1: {metrics.get('f1', float('nan')):.4f}")

In [65]:
class ProgressLoggerCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        logs       = kwargs.get("logs", {})  # Extract from logs
        epoch_loss = logs.get("loss", None)  # Get loss value

        if epoch_loss is not None:
            logging.info(f"Epoch {state.epoch:.0f} - Average Loss: {epoch_loss:.6f}")
            #print(f"Epoch {state.epoch:.0f} - Average Loss: {epoch_loss:.6f}", flush=True)
        else:
            logging.warning(f"Epoch {state.epoch:.0f} - No loss logged!")
            #rint(f"Epoch {state.epoch:.0f} - No loss logged!", flush=True)

    def on_train_end(self, args, state, control, **kwargs):
        logging.info("Training Completed!")
        #print("=== Training Completed! ===", flush=True)

### Training Metrics
  source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/

#### UndefinedMetricWarning
Only one class is present in y_true. ROC AUC score is not defined in that case.

This warning typically arises when you're trying to calculate the ROC AUC score for a label where either all true values are 0 or all are 1 in a particular batch.

The ROC AUC score is calculated by comparing the true positive rate (TPR) against the false positive rate (FPR) at various thresholds. If a label is only ever predicted as 0 or 1, you cannot generate a meaningful ROC curve and thus the AUC is undefined.

AUC scores rely on the presence of both positive and negative samples for each label.

Solution:

- Check Label Distribution: Add a check at the start to see if either true_labels or preds for a particular label contain only one unique value (0 or 1). If so, for that label, either skip the ROC AUC calculation or set the ROC AUC to a default value (like 0 or NaN).
- Ignore the warning (not recommended):
- **Stratified sampling: While you did split into train/validation/test, the warning may indicate you didn't maintain the label balance during the splitting process. Stratified sampling would do this.**

#### TPR (True Positive Rate) and FPR (False Positive Rate)
TPR = Sensitivity = Recall = TP / (TP + FN)
- TPR close to 1: the model identifies most positives
- TPR close to 0: the model is missing many positives

FPR = FP / (FP + TN)
- FPR close to 1: the model produces many false alarms
- FPR close to 0: the model makes few false alarms

The ROC curve plots TPR (y-axis) vs. FPR (x-axis) at different thresholds.
A perfect model has:
- TPR = 1 (detect all positives)
- FPR = 0 (no false alarms)

The ideal ROC curve is a steep rise towards the top-left corner.

#### zero_division=0

- only for f1, precision and recall because they involve division where the denominator can be zero: some labels might never       be predicted (y_pred = 0 for all samples), or they might not appear in the true_labels (y_true = 0 for all samples)
- ROC AUC: works with probabilities and does not involve division by zero
- Precision-Recall AUC: also based on ranking, so no zero division issue
- Accuracy: just compares exact matches, so no zero division issue

In [66]:
def multi_label_metrics(logits, true_labels, threshold):
    """
    Compute multi-label classification metrics.

    Parameters:
    - logits     : raw, unnormalized scores from the model  (numpy ndarray of shape (batch_size, num_labels))
    - true_labels: actual labels                            (numpy ndarray of shape (batch_size, num_labels))
    - threshold  : decision threshold for converting probabilities to binary predictions

    Returns:
    - metrics: dictionary of scores
    """
    #print(">>>>>>>>>>multi_label_metrics called!<<<<<<<<<<", flush=True)
    print(f"threshold: {type(threshold)} {threshold}")
    #print(f"ZZZlogits: {type(logits)} {logits.shape}\n{logits}")                      # <class 'numpy.ndarray'> (12, 6)
    #print(f"ZZZtrue_labels: {type(true_labels)} {true_labels.shape}\n{true_labels}")  # <class 'numpy.ndarray'> (12, 6)

    # Ensure logits is a PyTorch tensor before applying sigmoid
    if isinstance(logits, np.ndarray):
        logits = torch.as_tensor(logits)
    #print(f"ZZZlogits: {type(logits)} {logits.shape}\n{logits}")                      # <class 'torch.Tensor'> torch.Size([12, 6])

    # Convert logits to probabilities
    probs = torch.sigmoid(logits).detach().cpu().numpy()  # Convert to NumPy safely:
                                                          # - detach() remove the tensor from the computation graph,
                                                          #   making it a regular tensor without gradients
                                                          # - cpu() moves the tensor from the GPU to the CPU before converting to NumPy
    #print(f"ZZZprobs: {type(probs)} {probs.shape}\n{probs}")  # <class 'numpy.ndarray'> (12, 6)

    # Apply threshold to get binary predictions
    preds = (probs > threshold).astype(int)
    #print(f"ZZZpreds: {type(preds)} {preds.shape}\n{preds}")  # <class 'numpy.ndarray'> (12, 6)

    # Compute metrics
    f1                   = f1_score               (y_true=true_labels, y_pred=preds, average=training_average, zero_division=0)
    precision            = precision_score        (y_true=true_labels, y_pred=preds, average=training_average, zero_division=0)
    recall               = recall_score           (y_true=true_labels, y_pred=preds, average=training_average, zero_division=0)

    # Identify valid labels (those with both 0s and 1s in 'y_true')
    valid_labels = np.where((true_labels.sum(axis=0) > 0) & (true_labels.sum(axis=0) < true_labels.shape[0]))[0]

    if len(valid_labels) > 0:
        roc_auc              = np.mean([roc_auc_score          (y_true=true_labels[:, i], y_score=probs[:, i]) for i in valid_labels])
        precision_recall_auc = np.mean([average_precision_score(y_true=true_labels[:, i], y_score=probs[:, i]) for i in valid_labels])
    else:
        roc_auc              = np.nan  # Set to NaN if no valid labels exist
        precision_recall_auc = np.nan  # Set to NaN if no valid labels exist

    subset_acc = accuracy_score(true_labels, preds)  # Subset accuracy (requires exact match per sample)
    hamming    = hamming_loss(true_labels, preds)    # Better for imbalanced multi-label tasks

    metrics = {
        'f1'                  : f1,
        'precision'           : precision,
        'recall'              : recall,
        'roc_auc'             : roc_auc,               # Avoid warning by checking valid labels
        'precision_recall_auc': precision_recall_auc,  # Avoid warning by checking valid labels
        'subset_accuracy'     : subset_acc,
        'hamming_loss'        : hamming
    }

    return metrics

In [67]:
# Evaluation batch per batch
def compute_metrics(p: EvalPrediction):
    #print(f"p.predictions: {type(p.predictions)} {p.predictions.shape}\n{p.predictions[:5]}")
    #print(f"p.label_ids: {type(p.label_ids)} {p.label_ids.shape}\n{p.label_ids[:5]}")
    print(f"threshold: {type(threshold)} {threshold}")

    preds  = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(preds,p.label_ids, threshold)

    return result

### HF transformer Trainer and CustomTrainer
Abstracts the training loop.

In [68]:
training_args = TrainingArguments(
    output_dir                  = './training_results',  # where model predictions and checkpoints will be written during training
    overwrite_output_dir        = True,
    save_steps                  = 500,
    save_total_limit            = 2,
    eval_strategy               = 'epoch',               # Evaluate at the end of each epoch
    save_strategy               = 'epoch',               # Save checkpoints every epoch
    learning_rate               = learning_rate,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    num_train_epochs            = epochs,
    weight_decay                = 0.01,
    load_best_model_at_end      = True,
    metric_for_best_model       = metric_name,
    fp16                        = fp,
    run_name                    = run_name,
    report_to                   = 'none'                 # Disable wandb if not needed
)

In [69]:
class CustomTrainer(Trainer):

    def __init__(self, model, *args, loss_fn=None, **kwargs):
        super().__init__(model, *args, **kwargs)
        self.loss_fn = loss_fn
        #print(f">>>>>>>>>>CustomTrainer initialized with loss_fn: {loss_fn}<<<<<<<<<<")

    """
    # No print in compute_loss because out of memory because prints are batch per batch
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):

    #print(f"inputs passed to compute_loss: {inputs.keys()}")
    #input_ids             = inputs['input_ids']                        # shape: batch_size, sequence_length
    #attention_mask        = inputs['attention_mask']                   # shape: batch_size, sequence_length
    #global_attention_mask = inputs.get('global_attention_mask', None)  # shape: batch_size, sequence_length; optional as LongFormer specific
    labels                = inputs.pop('labels', None)                 # shape: batch_size, num_labels; needed for loss computation, not required by the model

    #outputs = model(**inputs, global_attention_mask=global_attention_mask)  # Forward pass
    # Forward pass
    #outputs = model(
    #    input_ids             = input_ids,
    #    attention_mask        = attention_mask,
    #    global_attention_mask = global_attention_mask,
    #    labels                = labels
    #)
    outputs = model(**inputs, labels=labels)
    #print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")
    logits = outputs.logits  # shape: (batch_size, num_labels)

    # If labels are provided, compute loss
    if labels is not None:
      # Use the custom loss function if provided
      if self.loss_fn is not None:
        loss = self.loss_fn(logits, labels)  # Compute weighted loss
      else:
        # Default loss: BCEWithLogitsLoss
        loss_fn = BCEWithLogitsLoss()
        loss    = loss_fn(logits, labels)    # Compute loss
      return (loss, outputs) if return_outputs else loss

    # If no labels, return outputs only, for evaluation or prediction
    return outputs
    """
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        #print(f">>>>>>>>>>compute_loss called!<<<<<<<<<<", flush=True)
        labels  = inputs.get('labels')
        outputs = model(**inputs)
        logits  = outputs.logits  # (batch_size, num_labels)

        if labels is not None:
            labels = labels.to(logits.device).float()  # Ensure same device

            if self.loss_fn is not None:
                loss = self.loss_fn(logits, labels)
                logging.info(f"Step Loss ({self.loss_fn.__class__.__name__}): {loss.item():.6f}")  # Log loss value
                #print(f"Epoch {self.state.epoch:.0f}, Step {self.state.global_step}: Loss ({self.loss_fn.__class__.__name__}): {loss.item():.6f}", flush=True)
            else:
                loss_fn = BCEWithLogitsLoss()
                loss    = loss_fn(logits, labels)
                logging.info(f"Step Loss (BCEWithLogitsLoss): {loss.item():.6f}")                  # Log loss value
                #print(f"Epoch {self.state.epoch:.0f}, Step {self.state.global_step}: Loss (BCEWithLogitsLoss): {loss.item():.6f}", flush=True)

            return (loss, outputs) if return_outputs else loss

        return outputs

In [70]:
trainer = CustomTrainer(
    model           = model,
    args            = training_args,
    train_dataset   = encoded_dataset["train"],
    eval_dataset    = encoded_dataset["validation"],
    compute_metrics = compute_metrics,
    loss_fn         = focal_loss_fn,  # bce_loss_fn or focal_loss_fn
    #callbacks       = [LossLoggerCallback(), MetricsLoggerCallback(), ProgressLoggerCallback()]  # Attach logging callbacks
)

#trainer = Trainer(
#    model           = model,
#    args            = training_args,
#    train_dataset   = encoded_dataset["train"],
#    eval_dataset    = encoded_dataset["validation"],
#    compute_metrics = compute_metrics,
#)

print(f"trainer: {type(trainer)} {trainer}")

trainer: <class '__main__.CustomTrainer'> <__main__.CustomTrainer object at 0x792680423450>


### trainer.train

In [71]:
trainer_train = trainer.train()

print(f"trainer_train: {type(trainer_train)} len={len(trainer_train)}\n{trainer_train}")
print()
print(f"trainer_train.metrics: {type(trainer_train.metrics)} len={len(trainer_train.metrics)}\n{json.dumps(trainer_train.metrics, indent=4)}")

Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Roc Auc,Precision Recall Auc,Subset Accuracy,Hamming Loss,Runtime,Samples Per Second,Steps Per Second
1,0.0101,0.009721,0.875416,0.866502,0.884515,0.889925,0.816758,0.54125,0.114375,72.1734,66.507,8.313
2,0.0092,0.009322,0.875796,0.879101,0.872516,0.898993,0.828439,0.535208,0.112431,71.5361,67.099,8.387
3,0.0085,0.009365,0.875798,0.875563,0.876032,0.903292,0.832176,0.54,0.112882,71.389,67.237,8.405
4,0.0079,0.009301,0.879782,0.871243,0.88849,0.904553,0.832311,0.545625,0.110312,71.5403,67.095,8.387


threshold: <class 'float'> 0.5
threshold: <class 'float'> 0.5
threshold: <class 'float'> 0.5
threshold: <class 'float'> 0.5
threshold: <class 'float'> 0.5
threshold: <class 'float'> 0.5
threshold: <class 'float'> 0.5
threshold: <class 'float'> 0.5
trainer_train: <class 'transformers.trainer_utils.TrainOutput'> len=3
TrainOutput(global_step=4800, training_loss=0.009176255514224371, metrics={'train_runtime': 7610.0941, 'train_samples_per_second': 20.184, 'train_steps_per_second': 0.631, 'total_flos': 1.008953104269312e+17, 'train_loss': 0.009176255514224371, 'epoch': 4.0})

trainer_train.metrics: <class 'dict'> len=6
{
    "train_runtime": 7610.0941,
    "train_samples_per_second": 20.184,
    "train_steps_per_second": 0.631,
    "total_flos": 1.008953104269312e+17,
    "train_loss": 0.009176255514224371,
    "epoch": 4.0
}


In [72]:
print("trainer.train successfully completed")

trainer.train successfully completed


### trainer.train: save locally and upload to HF Hub

In [73]:
if upload_to_HF is True:

    name               = "trainer_train"
    trainer_train_path = f"{name}.json"

    with open(trainer_train_path, "w") as f:
        json.dump(trainer_train, f)

    print(f"{name} results successfully saved locally to {trainer_train_path}")

    upload_file(
        path_or_fileobj = trainer_train_path,
        path_in_repo    = trainer_train_path,
        repo_id         = repo_id,
        repo_type       = 'dataset',
        commit_message  = f"{name}_{timestamp}"
    )

    print(f"{name} results successfully uploaded to HF Hub as {trainer_train_path}")

trainer_train results successfully saved locally to trainer_train.json
trainer_train results successfully uploaded to HF Hub as trainer_train.json


### trainer.train: check that the uploaded file can be downloaded
File locally downloaded to:
/root/.cache/huggingface/hub/datasets-claudelepere-skill_classification/snapshots/full_commit_hash/trainer_train_results.json

In [74]:
if upload_to_HF is True:
  file_path = hf_hub_download(repo_type="dataset", repo_id=repo_id, filename=trainer_train_path)

  print(f"file_path: {file_path}")

trainer_train.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

file_path: /root/.cache/huggingface/hub/datasets--claudelepere--jobs_EN_11_48000_032308/snapshots/f4a22e9da0a3bda0613c3a2d68a01b3883d93382/trainer_train.json


##Upload tokenizer and model to HF Hub and check

In [75]:
if upload_to_HF is True:

    # Upload
    commit_message = f"tokenizer_{timestamp}"
    tokenizer.push_to_hub(repo_id, commit_message=commit_message)  # commit_message as named parameter

    commit_message = f"model_{timestamp}"
    model.push_to_hub(    repo_id, commit_message=commit_message)  # commit_message as named parameter

    print(f"tokenizer and model successfully uploaded to HF Hub at {repo_id}")

    # Check
    def check_upload(repo_id):
        print()
        print("Tokenizer")
        tokenizer = LongformerTokenizerFast.from_pretrained(repo_id)
        print()
        print("Model")
        model = LongformerForSequenceClassification.from_pretrained(repo_id)
        print()

        inputs = tokenizer("Hello, my dog is cute", return_tensors="pt", truncation=True, padding=True)
        outputs = model(**inputs)

        print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

    # To check if the upload was successful, download the tokenizer and the model
    check_upload(repo_id)

README.md:   0%|          | 0.00/755 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/754 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/595M [00:00<?, ?B/s]

tokenizer and model successfully uploaded to HF Hub at claudelepere/jobs_EN_11_48000_032308

Tokenizer


tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]


Model


config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/595M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Input ids are automatically padded to be a multiple of `config.attention_window`: 1024



outputs: <class 'transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput'> odict_keys(['logits'])
LongformerSequenceClassifierOutput(loss=None, logits=tensor([[-0.2495, -0.4575,  0.0759,  0.4247,  0.2985,  0.0846]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, global_attentions=None)


## Evaluation step

### Evaluation 1: trainer.evaluate
trainer.evaluate uses a fixed threshold of 0.5 to convert logits into binary labels, which is often suboptimal for imbalanced data.

In [76]:
evaluation_trainer_evaluate_metrics = trainer.evaluate(
    #eval_dataset = encoded_dataset["validation"],  # by default, trainer.evaluate() evaluates the dataset passed as eval_dataset during training
    metric_key_prefix="eval"                       # prefix for the evaluation metrics
)

print(f"evaluation_trainer_evaluate_metrics: {type(evaluation_trainer_evaluate_metrics)} len={len(evaluation_trainer_evaluate_metrics)}\n{json.dumps(evaluation_trainer_evaluate_metrics, indent=4)}")


threshold: <class 'float'> 0.5
threshold: <class 'float'> 0.5
evaluation_trainer_evaluate_metrics: <class 'dict'> len=12
{
    "eval_loss": 0.00930106546729803,
    "eval_f1": 0.8797820410943353,
    "eval_precision": 0.8712433485722851,
    "eval_recall": 0.8884897584836442,
    "eval_roc_auc": 0.9045531587320514,
    "eval_precision_recall_auc": 0.832310919307515,
    "eval_subset_accuracy": 0.545625,
    "eval_hamming_loss": 0.1103125,
    "eval_runtime": 71.4259,
    "eval_samples_per_second": 67.203,
    "eval_steps_per_second": 8.4,
    "epoch": 4.0
}


In [77]:
print("evaluation 1: trainer.evaluate: successfully completed")

evaluation 1: trainer.evaluate: successfully completed


### Evaluation 1: trainer.evaluate: save locally and upload to HF Hub

In [78]:
if upload_to_HF is True:

    name                             = "evaluation_trainer_evaluate"
    evaluation_trainer_evaluate_path = f"{name}.json"

    with open(evaluation_trainer_evaluate_path, "w") as f:
        json.dump(evaluation_trainer_evaluate_metrics, f)

    print(f"{name} successfully saved locally to {evaluation_trainer_evaluate_path}")

    upload_file(
        path_or_fileobj = evaluation_trainer_evaluate_path,
        path_in_repo    = evaluation_trainer_evaluate_path,
        repo_id         = repo_id,
        repo_type       = 'dataset',
        commit_message  = f"{name}_{timestamp}"
    )

    print(f"{name} successfully uploaded to HF Hub as {evaluation_trainer_evaluate_path}")

evaluation_trainer_evaluate successfully saved locally to evaluation_trainer_evaluate.json
evaluation_trainer_evaluate successfully uploaded to HF Hub as evaluation_trainer_evaluate.json


### Evaluation 2: trainer.predict

In [79]:
def predict_with_optimized_thresholds(trainer, dataset, id2label, threshold_tuning=False, thresholds=None):
    """
    Predicts using trainer.predict(), with optional threshold tuning, using NumPy arrays, and not PyTorch tensors

    Parameters:
    - trainer                         : Hugging Face Trainer or CustomTrainer instance
    - dataset                         : Dataset to predict on
    - id2label                        : Dictionary mapping label indices (int) to label names (string)
    - threshold_tuning                : Boolean to enable threshold tuning per class (aka per label)
    - thresholds       (numpy.ndarray): Custom thresholds for classification

    Returns:
    if threshold_tuning:
        - best_thresholds      (numpy.ndarray): optimized threshold per class (aka per label)
        - best_thresholds_dict (dict)         : optimized threshold per class (aka per label)
        - best_metrics         (dict)         : best F1, best precision, best recall per class (aka per label)
        - best_preds           (numpy.ndarray): best predictions per class (aka per label)
    else:
        - thresholds      (numpy.ndarray): fixed threshold per class (aka per label)
        - thresholds_dict (dict)         : fixed threshold per class (aka per label)
        - metrics         (dict)         : computed with provided thresholds or default to 0.5
        - preds           (numpy.ndarray): predictions with provided thresholds or default to 0.5
    """
    # Predict
    predictions_output = trainer.predict(dataset)
    logits             = predictions_output.predictions
    true_labels        = predictions_output.label_ids

    # Convert logits to probabilities (with np, not with torch)
    probs = 1 / (1 + np.exp(-logits))  # Sigmoid function

    num_labels           = len(id2label)
    best_thresholds      = None
    best_thresholds_dict = None
    best_metrics         = None
    best_preds           = None

    if threshold_tuning:
        threshold_candidates = np.linspace(0.05, 0.95, 19)
        best_thresholds      = np.zeros(num_labels)
        best_metrics         = {label: {'f1': 0.0, 'precision': 0.0, 'recall': 0.0} for label in id2label.values()}

        # Iterate over each label to find the best threshold
        for label_idx, label in id2label.items():
            # Predictions for the current label across all threshold candidates
            preds = probs[:, label_idx][:, None] > threshold_candidates  # Create a matrix of shape (num_samples, num_thresholds)

            # Compute precision, recall, F1 for all thresholds at once for the current label
            precision, recall, f1, _ = precision_recall_fscore_support(
                np.tile(true_labels[:, label_idx], (len(threshold_candidates), 1)).T, preds, average=None, zero_division=0
            )

            # Find the best threshold based on F1 for the current label
            best_idx                   = np.argmax(f1)
            best_thresholds[label_idx] = threshold_candidates[best_idx]
            best_metrics[label]        = {'f1': f1[best_idx], 'precision': precision[best_idx], 'recall': recall[best_idx]}

        best_thresholds_dict = {id2label[i]: best_thresholds[i].item() for i in range(len(best_thresholds))}

        # Generate predictions using the optimized threshold for each label
        best_preds = np.zeros_like(true_labels, dtype=int)
        for label_idx, label in id2label.items():
            best_preds[:, label_idx] = (probs[:, label_idx] > best_thresholds[label_idx]).astype(int)

        #print("==== best_thresholds, best_threshold_dict and best_metrics ====")
        #print(f"best_thresholds:      {type(best_thresholds)} shape={best_thresholds.shape}\n{best_thresholds}")                # <class 'numpy.ndarray'> shape=(6,)
        #print(f"best_thresholds_dict: {type(best_thresholds_dict)} len={len(best_thresholds_dict)}\n{best_thresholds_dict}")    # <class 'dict'> len=6
        #print(f"best_metrics:         {type(best_metrics)} len={len(best_metrics)}\n{json.dumps(best_metrics, indent=4)}")      # <class 'dict'> len=6
        #print("===============================================================")
        #print()

    # ==== If not threshold_tuning ====

    # Apply provided thresholds or default to 0.5
    thresholds_fixed = thresholds if thresholds is not None else np.full(num_labels, 0.5)

    # Compute predictions with fixed thresholds
    preds_fixed = (probs > thresholds_fixed).astype(int)

    # Compute metrics in one step (no loop)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, preds_fixed, average=None, zero_division=0)

    # Convert to dict format
    metrics_fixed         = {id2label[i]: {'f1': f1[i], 'precision': precision[i], 'recall': recall[i]} for i in range(num_labels)}
    thresholds_fixed_dict = {id2label[i]: thresholds_fixed[i].item() for i in range(num_labels)}

    #print("==== provided thresholds and metrics ====")
    #print(f"thresholds_fixed     : {type(thresholds_fixed)} shape={thresholds_fixed.shape}\n{thresholds_fixed}")              # <class 'numpy.ndarray'> shape=(6,)
    #print(f"thresholds_fixed_dict: {type(thresholds_fixed_dict)} len={len(thresholds_fixed_dict)}\n{thresholds_fixed_dict}")  # <class 'dict'> len=6
    #print(f"metrics_fixed        : {type(metrics_fixed)} len={len(metrics_fixed)}\n{json.dumps(metrics_fixed, indent=4)}")    # <class 'dict'> len=6
    #print("===============================================================")
    #print()

    thresholds      = best_thresholds      if threshold_tuning else thresholds_fixed
    thresholds_dict = best_thresholds_dict if threshold_tuning else thresholds_fixed_dict
    metrics         = best_metrics         if threshold_tuning else metrics_fixed
    preds           = best_preds           if threshold_tuning else preds_fixed

    # Compute micro average
    #   compute metrics globally by summing all TP, FP, FN across all labels
    #   good for overall performance assessment
    #   dominated by frequent labels: if most samples belong to a few labels, if favors them
    micro_precision, micro_recall, micro_f1, _ = precision_recall_fscore_support(
        true_labels, preds, average='micro', zero_division=0
    )

    # Compute macro average
    #   each label is treated equally, regardless of how often it appears
    #   good for evaluating rare labels
    #   sensitive to rare labels: if rare labels perform poorly, macro F1 will drop
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        true_labels, preds, average='macro', zero_division=0
    )

    # Compute weighted average
    #   like macro, but weights each label's F1 based on its frequency
    #   balances between micro and macro by considering both label importance and prevalence
    #   useful if class imbalance exists but you still want per-label influence
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        true_labels, preds, average='weighted', zero_division=0
    )

    global_metrics = {
        'micro':    {'f1': micro_f1,    'precision': micro_precision,    'recall': micro_recall},
        'macro':    {'f1': macro_f1,    'precision': macro_precision,    'recall': macro_recall},
        'weighted': {'f1': weighted_f1, 'precision': weighted_precision, 'recall': weighted_recall}
    }

    return thresholds, thresholds_dict, metrics, global_metrics

### Evaluation 2: calculate metrics and optimized thresholds

- First, to **calculate** the optimized thresholds, threshold_tuning = True and thresholds = None.
- After, to **use** these optimized thresholds, threshold_tuning = False and thresholds = the optimized thresholds.

In [80]:
# with best_thresholds (threshold_tuning = True and thresholds = None)
optimized_thresholds, optimized_thresholds_dict, evaluation_trainer_predict_metrics, evaluation_trainer_predict_global_metrics = predict_with_optimized_thresholds(
    trainer, validation_dataset, id2label, threshold_tuning=True, thresholds=None)

print("==== with best thresholds ====")
print(f"optimized_thresholds                     : {type(optimized_thresholds)} shape={optimized_thresholds.shape} {optimized_thresholds}")
print(f"optimized_thresholds_dict                : {type(optimized_thresholds_dict)} len={len(optimized_thresholds_dict)}\n{optimized_thresholds_dict}")
print(f"evaluation_trainer_predict_metrics       : {type(evaluation_trainer_predict_metrics)} len={len(evaluation_trainer_predict_metrics)}\n{json.dumps(evaluation_trainer_predict_metrics, indent=4)}")
print(f"evaluation_trainer_predict_global_metrics: {type(evaluation_trainer_predict_global_metrics)} len={len(evaluation_trainer_predict_global_metrics)}\n{json.dumps(evaluation_trainer_predict_global_metrics, indent=4)}")

print()

# with thresholds_fixed=0.5 (threshold_tuning = False and thresholds = None)
thresholds, thresholds_dict, evaluation_trainer_predict_metrics_thr05, evaluation_trainer_predict_global_metrics_thr05 = predict_with_optimized_thresholds(
    trainer, validation_dataset, id2label, threshold_tuning=False, thresholds=None)

print("==== with default fixed thresholds = 0.5 ====")
print(f"thresholds                                     : {type(thresholds)} shape={thresholds.shape} {thresholds}")
print(f"thresholds_dict                                : {type(thresholds_dict)} len={len(thresholds_dict)}\n{thresholds_dict}")
print(f"evaluation_trainer_predict_metrics_thr05       : {type(evaluation_trainer_predict_metrics_thr05)} len={len(evaluation_trainer_predict_metrics_thr05)}\n{json.dumps(evaluation_trainer_predict_metrics_thr05, indent=4)}")
print(f"evaluation_trainer_predict_global_metrics_thr05: {type(evaluation_trainer_predict_global_metrics_thr05)} len={len(evaluation_trainer_predict_global_metrics_thr05)}\n{json.dumps(evaluation_trainer_predict_global_metrics_thr05, indent=4)}")

threshold: <class 'float'> 0.5
threshold: <class 'float'> 0.5
==== with best thresholds ====
optimized_thresholds                     : <class 'numpy.ndarray'> shape=(6,) [0.45 0.5  0.45 0.45 0.45 0.45]
optimized_thresholds_dict                : <class 'dict'> len=6
{'390': 0.44999999999999996, '135': 0.49999999999999994, '136': 0.44999999999999996, '137': 0.44999999999999996, '138': 0.44999999999999996, '139': 0.44999999999999996}
evaluation_trainer_predict_metrics       : <class 'dict'> len=6
{
    "390": {
        "f1": 0.6263736263736264,
        "precision": 0.5632411067193676,
        "recall": 0.7054455445544554
    },
    "135": {
        "f1": 0.605009633911368,
        "precision": 0.628,
        "recall": 0.5836431226765799
    },
    "136": {
        "f1": 0.6967795901296528,
        "precision": 0.629154078549849,
        "recall": 0.7806935332708529
    },
    "137": {
        "f1": 0.8821530515155634,
        "precision": 0.825069921179761,
        "recall": 0.9477219626

threshold: <class 'float'> 0.5
threshold: <class 'float'> 0.5
==== with default fixed thresholds = 0.5 ====
thresholds                                     : <class 'numpy.ndarray'> shape=(6,) [0.5 0.5 0.5 0.5 0.5 0.5]
thresholds_dict                                : <class 'dict'> len=6
{'390': 0.5, '135': 0.5, '136': 0.5, '137': 0.5, '138': 0.5, '139': 0.5}
evaluation_trainer_predict_metrics_thr05       : <class 'dict'> len=6
{
    "390": {
        "f1": 0.6248399487836107,
        "precision": 0.6472148541114059,
        "recall": 0.6039603960396039
    },
    "135": {
        "f1": 0.605009633911368,
        "precision": 0.628,
        "recall": 0.5836431226765799
    },
    "136": {
        "f1": 0.6831983805668016,
        "precision": 0.7425742574257426,
        "recall": 0.6326148078725399
    },
    "137": {
        "f1": 0.8796090626388272,
        "precision": 0.8921598077500751,
        "recall": 0.8674065420560748
    },
    "138": {
        "f1": 0.9573100757401882,
      

In [81]:
print("evaluation 2: trainer.predict: successfully completed")

evaluation 2: trainer.predict: successfully completed


### Evaluation 2: trainer.predict: save locally and upload to HF Hub


In [82]:
if upload_to_HF is True:
    name                            = "evaluation_trainer_predict"
    evaluation_trainer_predict_path = f"{name}.json"

    with open(evaluation_trainer_predict_path, "w") as f:
        json.dump(evaluation_trainer_predict_metrics, f)

    print(f"{name} successfully saved locally to {evaluation_trainer_predict_path}")

    upload_file(
        path_or_fileobj = evaluation_trainer_predict_path,
        path_in_repo    = evaluation_trainer_predict_path,
        repo_id         = repo_id,
        repo_type       = 'dataset',
        commit_message  = f"{name}_{timestamp}"
    )

    print(f"{name} successfully uploaded to HF Hub as {evaluation_trainer_predict_path}")

    name_thr05                            = "evaluation_trainer_predict_thr05"
    evaluation_trainer_predict_path_thr05 = f"{name_thr05}.json"

    with open(evaluation_trainer_predict_path_thr05, "w") as f:
        json.dump(evaluation_trainer_predict_metrics_thr05, f)

    print(f"{name_thr05} successfully saved locally to {evaluation_trainer_predict_path_thr05}")

    upload_file(
        path_or_fileobj = evaluation_trainer_predict_path_thr05,
        path_in_repo    = evaluation_trainer_predict_path_thr05,
        repo_id         = repo_id,
        repo_type       = 'dataset',
        commit_message  = f"{name_thr05}_{timestamp}"
    )

    print(f"{name_thr05} successfully uploaded to HF Hub as {evaluation_trainer_predict_path_thr05}")

evaluation_trainer_predict successfully saved locally to evaluation_trainer_predict.json
evaluation_trainer_predict successfully uploaded to HF Hub as evaluation_trainer_predict.json
evaluation_trainer_predict_thr05 successfully saved locally to evaluation_trainer_predict_thr05.json
evaluation_trainer_predict_thr05 successfully uploaded to HF Hub as evaluation_trainer_predict_thr05.json


### Evaluation 2: optimized thresholds: save locally (as a dict) and upload to HF Hub (as a JSON file in repo 'model')
optimized_thresholds: <class 'numpy.ndarray'> shape=(6,) but JSON doesn't support NumPy types

In [83]:
if threshold_tuning is True and upload_to_HF is True:
    name                      = "optimized_thresholds"
    optimized_thresholds_path = f"{name}.json"

    with open(optimized_thresholds_path, "w") as f:
        json.dump(optimized_thresholds_dict, f, indent=4)

    print(f"{optimized_thresholds_dict} successfully saved locally to {optimized_thresholds_path}")

    upload_file(
        path_or_fileobj = optimized_thresholds_path,
        path_in_repo    = optimized_thresholds_path,
        repo_id         = repo_id,
        repo_type       = 'model',
        commit_message  = f"{name}_{timestamp}"
        )

    print(f"{name} successfully uploaded to HF Hub as {optimized_thresholds_path}")

{'390': 0.44999999999999996, '135': 0.49999999999999994, '136': 0.44999999999999996, '137': 0.44999999999999996, '138': 0.44999999999999996, '139': 0.44999999999999996} successfully saved locally to optimized_thresholds.json
optimized_thresholds successfully uploaded to HF Hub as optimized_thresholds.json


### Evaluation 3: model.eval

In [84]:
def compute_metrics_with_threshold(model, dataset, optimized_thresholds, id2label, batch_size=8):
    """
    Compute metrics during evaluation or test, by applying tuned thresholds

    Parameters:
    - model                                               : Hugging Face model
    - dataset                                             : Dataset to predict on
    - optimized_thresholds (list or NumPy array of floats): Optimized thresholds for each label
    - id2label                                            : Dictionary mapping label indices (int) to label names (string)
    - batch_size                                          : Batch size for prediction. Defaults to 8
    Returns:
    - metrics (dict)

    Compute metrics during evaluation or test, by applying optimized thresholds
    """
    dataloader = DataLoader(dataset, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)    # Move model to GPU/CPU
    model.eval()        # Set model to evaluation mode

    all_logits, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            labels  = batch.pop('labels')                           # Keep labels on CPU
            inputs  = {k: v.to(device) for k, v in batch.items()}   # Move inputs to device
            outputs = model(**inputs)

            all_logits.append(outputs.logits.cpu())     # Keep logits as tensors, move to CPU
            all_labels.append(labels)                   # Labels remain on CPU

    # Stack tensors
    logits = torch.cat(all_logits, dim=0).to(device)   # shape = (num_samples, num_labels), move to device
    labels = torch.cat(all_labels, dim=0).to(device)   # shape = (num_samples, num_labels), move to device

    # Convert logits to probabilities
    probs = torch.sigmoid(logits)           # shape = (num_samples, num_labels)

    # Apply per-class tuned thresholds (element-wise comparison)
    thresholds = torch.tensor(optimized_thresholds, dtype=torch.float32, device=device)  # Convert tuned_thresholds to tensor
    preds      = (probs > thresholds).int()                                              # Convert to binary predictions (1 or 0)

    # Compute TP, FP, FN, TN
    TP = ((preds == 1) & (labels == 1)).sum(dim=0).float()
    TN = ((preds == 0) & (labels == 0)).sum(dim=0).float()
    FP = ((preds == 1) & (labels == 0)).sum(dim=0).float()
    FN = ((preds == 0) & (labels == 1)).sum(dim=0).float()

    # Compute per-class metrics
    precision_per_class = TP / (TP + FP + 1e-8)
    recall_per_class    = TP / (TP + FN + 1e-8)
    f1_per_class        = 2 * (precision_per_class * recall_per_class) / (precision_per_class + recall_per_class + 1e-8)

    # Compute averaged metrics
    precision = precision_per_class.mean()
    recall    = recall_per_class.mean()
    f1        = f1_per_class.mean()

    # Compute accuracy
    accuracy = (preds == labels).float().mean()

    # Convert to NumPy for ROC-AUC and PR-AUC
    labels_np = labels.cpu().numpy()  # Move to CPU before converting
    probs_np  = probs.cpu().numpy()   # Move to CPU before converting

    # Compute ROC-AUC and PR-AUC
    roc_auc = torch.tensor(roc_auc_score(labels_np, probs_np, average=evaluation_average, multi_class='ovr'))
    pr_auc  = torch.tensor(average_precision_score(labels_np, probs_np, average=evaluation_average))

    # Convert predictions to Numpy for classification_report
    preds_np = preds.cpu().numpy()  # Move to CPU before converting

    # Generate classification report
    class_names  = [id2label[i] for i in range(len(id2label))]
    class_report = classification_report(labels_np, preds_np, target_names=class_names, zero_division=0)

    #print(f"\nClassification Report:\n{class_report}")

    # Store metrics
    metrics = {
        'accurary'             : accuracy.item(),
        'precision'            : precision.item(),
        'recall'               : recall.item(),
        'f1'                   : f1.item(),
        'roc_auc'              : roc_auc.item(),
        'pr_auc'               : pr_auc.item(),
        'per_class_precision'  : {id2label[i]: precision_per_class[i].item() for i in range(len(id2label))},
        'per_class_recall'     : {id2label[i]: recall_per_class[i].item() for i in range(len(id2label))},
        'per_class_f1'         : {id2label[i]: f1_per_class[i].item() for i in range(len(id2label))},
        'classification_report': class_report,
    }

    return metrics

In [85]:
evaluation_model_eval_metrics = compute_metrics_with_threshold(model, validation_dataset, optimized_thresholds, id2label, batch_size=16)

except_report = {k: v for k, v in evaluation_model_eval_metrics.items() if k!='classification_report'}
report        = evaluation_model_eval_metrics['classification_report']
print(f"evaluation_model_eval_metrics: {type(except_report)} len={len(except_report)}\n{json.dumps(except_report, indent=4)}")
print(f"evaluation_model_eval_metrics['classification_report']: {type(report)} len={len(report)}\n{report}")

evaluation_model_eval_metrics: <class 'dict'> len=9
{
    "accurary": 0.8833333253860474,
    "precision": 0.7326376438140869,
    "recall": 0.8305651545524597,
    "f1": 0.776735782623291,
    "roc_auc": 0.9585698103660364,
    "pr_auc": 0.9466109728546714,
    "per_class_precision": {
        "390": 0.5632411241531372,
        "135": 0.628000020980835,
        "136": 0.6288737654685974,
        "137": 0.8250699043273926,
        "138": 0.9356468319892883,
        "139": 0.814994215965271
    },
    "per_class_recall": {
        "390": 0.7054455280303955,
        "135": 0.5836431384086609,
        "136": 0.7797563076019287,
        "137": 0.9477219581604004,
        "138": 0.9897769689559937,
        "139": 0.9770464897155762
    },
    "per_class_f1": {
        "390": 0.6263736486434937,
        "135": 0.6050096154212952,
        "136": 0.6962342858314514,
        "137": 0.8821530342102051,
        "138": 0.961950957775116,
        "139": 0.8886932730674744
    }
}
evaluation_model_e

In [86]:
print("evaluation 3: model.eval: successfully completed")

evaluation 3: model.eval: successfully completed


### Evaluation 3: model.eval: save locally and upload to HF Hub

In [87]:
if upload_to_HF is True:

    name                       = "evaluation_model_eval"
    evaluation_model_eval_path = f"{name}.json"

    with open(evaluation_model_eval_path, "w") as f:
        json.dump(evaluation_model_eval_metrics, f)

    print(f"{name} successfully saved locally to {evaluation_model_eval_path}")

    upload_file(
        path_or_fileobj = evaluation_model_eval_path,
        path_in_repo    = evaluation_model_eval_path,
        repo_id         = repo_id,
        repo_type       = 'dataset',
        commit_message  = f"{name}_{timestamp}"
    )

    print(f"{name} successfully uploaded to HF Hub as {evaluation_model_eval_path}")

evaluation_model_eval successfully saved locally to evaluation_model_eval.json
evaluation_model_eval successfully uploaded to HF Hub as evaluation_model_eval.json


## Test step

### Test 1: trainer.evaluate

In [88]:
test_trainer_evaluate_metrics = trainer.evaluate(
    eval_dataset = encoded_dataset['test'],
    metric_key_prefix='test'
)

print(f"test_trainer_evaluate_metrics: {type(test_trainer_evaluate_metrics)} len={len(test_trainer_evaluate_metrics)}\n{json.dumps(test_trainer_evaluate_metrics, indent=4)}")


threshold: <class 'float'> 0.5
threshold: <class 'float'> 0.5
test_trainer_evaluate_metrics: <class 'dict'> len=12
{
    "test_loss": 0.009198885411024094,
    "test_f1": 0.8827622721428031,
    "test_precision": 0.8736432367692193,
    "test_recall": 0.8920736834059466,
    "test_roc_auc": 0.9059201542243271,
    "test_precision_recall_auc": 0.8195740875631762,
    "test_subset_accuracy": 0.5552083333333333,
    "test_hamming_loss": 0.1076388888888889,
    "test_runtime": 71.308,
    "test_samples_per_second": 67.314,
    "test_steps_per_second": 8.414,
    "epoch": 4.0
}


In [89]:
print("test_trainer.evaluate successfully completed")

test_trainer.evaluate successfully completed


### Test 1: trainer.evaluate: save locally and upload to HF Hub

In [90]:
if upload_to_HF is True:

    name                       = "test_trainer_evaluate"
    test_trainer_evaluate_path = f"{name}.json"

    with open(test_trainer_evaluate_path, "w") as f:
        json.dump(test_trainer_evaluate_metrics, f)

    print(f"{name} results successfully saved locally to {test_trainer_evaluate_path}")

    upload_file(
        path_or_fileobj = test_trainer_evaluate_path,
        path_in_repo    = test_trainer_evaluate_path,
        repo_id         = repo_id,
        repo_type       = 'dataset',
        commit_message  = f"{name}_{timestamp}"
    )

    print(f"{name} results successfully uploaded to HF Hub as {test_trainer_evaluate_path}")

test_trainer_evaluate results successfully saved locally to test_trainer_evaluate.json
test_trainer_evaluate results successfully uploaded to HF Hub as test_trainer_evaluate.json


### Test 2: trainer.predict

In [91]:
optimized_thresholds, optimized_thresholds_dict, test_trainer_predict_metrics, test_trainer_predict_global_metrics = predict_with_optimized_thresholds(
    trainer, test_dataset, id2label, threshold_tuning=False, thresholds=optimized_thresholds)

print(f"optimized_thresholds: {type(optimized_thresholds)} shape={optimized_thresholds.shape} {optimized_thresholds}")
print(f"optimized_thresholds_dict: {type(optimized_thresholds_dict)} len={len(optimized_thresholds_dict)}\n{optimized_thresholds_dict}")
print(f"test_trainer_predict_metrics: {type(test_trainer_predict_metrics)} len={len(test_trainer_predict_metrics)}\n{json.dumps(test_trainer_predict_metrics, indent=4)}")

threshold: <class 'float'> 0.5
threshold: <class 'float'> 0.5
optimized_thresholds: <class 'numpy.ndarray'> shape=(6,) [0.45 0.5  0.45 0.45 0.45 0.45]
optimized_thresholds_dict: <class 'dict'> len=6
{'390': 0.44999999999999996, '135': 0.49999999999999994, '136': 0.44999999999999996, '137': 0.44999999999999996, '138': 0.44999999999999996, '139': 0.44999999999999996}
test_trainer_predict_metrics: <class 'dict'> len=6
{
    "390": {
        "f1": 0.6055045871559633,
        "precision": 0.5443298969072164,
        "recall": 0.6821705426356589
    },
    "135": {
        "f1": 0.5968379446640316,
        "precision": 0.6291666666666667,
        "recall": 0.5676691729323309
    },
    "136": {
        "f1": 0.6876864081806562,
        "precision": 0.6222050886661527,
        "recall": 0.7685714285714286
    },
    "137": {
        "f1": 0.8776581054957194,
        "precision": 0.821188630490956,
        "recall": 0.9424673784104389
    },
    "138": {
        "f1": 0.9628879892037787,
     

In [92]:
#test_trainer_predict_results = predict_with_optimized_thresholds(
#    trainer, test_dataset, id2label, threshold_tuning=False, thresholds=optimized_thresholds)

#except_report = {k: v for k, v in test_trainer_predict_results.items() if k!='classification_report'}
#report        = test_trainer_predict_results['classification_report']
#print(f"test_trainer_predict_results: {type(except_report)} len={len(except_report)}\n{json.dumps(except_report, indent=4)}")
#print(f"test_trainer_predict_results['classification_report']: {type(report)} len={len(report)}\n{report}")


In [93]:
print("test_trainer.predict successfully completed")

test_trainer.predict successfully completed


### Test 2: trainer.predict: save locally and upload to HF Hub

In [94]:
if upload_to_HF is True:

    name                      = "test_trainer_predict"
    test_trainer_predict_path = f"{name}.json"

    with open(test_trainer_predict_path, "w") as f:
        json.dump(test_trainer_predict_metrics, f)

    print(f"{name} results successfully saved locally to {test_trainer_predict_path}")

    upload_file(
        path_or_fileobj = test_trainer_predict_path,
        path_in_repo    = test_trainer_predict_path,
        repo_id         = repo_id,
        repo_type       = 'dataset',
        commit_message  = f"{name}_{timestamp}"
    )

    print(f"{name} results successfully uploaded to HF Hub as {test_trainer_predict_path}")

test_trainer_predict results successfully saved locally to test_trainer_predict.json
test_trainer_predict results successfully uploaded to HF Hub as test_trainer_predict.json


### Test 3: model.eval

In [95]:
test_model_eval_metrics = compute_metrics_with_threshold(model, test_dataset, optimized_thresholds, id2label, batch_size=16)

except_report = {k: v for k, v in test_model_eval_metrics.items() if k!='classification_report'}
report        = test_model_eval_metrics['classification_report']
print(f"test_model_eval_metrics: {type(except_report)} len={len(except_report)}\n{json.dumps(except_report, indent=4)}")
print(f"test_model_eval_metrics['classification_report']: {type(report)} len={len(report)}\n{report}")

test_model_eval_metrics: <class 'dict'> len=9
{
    "accurary": 0.8847222328186035,
    "precision": 0.7302092909812927,
    "recall": 0.8226027488708496,
    "f1": 0.7716321349143982,
    "roc_auc": 0.9604809419780602,
    "pr_auc": 0.9497050795159258,
    "per_class_precision": {
        "390": 0.5452674627304077,
        "135": 0.6291666626930237,
        "136": 0.6217257380485535,
        "137": 0.8214008808135986,
        "138": 0.9357377290725708,
        "139": 0.8279569745063782
    },
    "per_class_recall": {
        "390": 0.6847545504570007,
        "135": 0.567669153213501,
        "136": 0.7685714364051819,
        "137": 0.9424673914909363,
        "138": 0.9916608929634094,
        "139": 0.9804930686950684
    },
    "per_class_f1": {
        "390": 0.6071019768714905,
        "135": 0.5968379974365234,
        "136": 0.6873935461044312,
        "137": 0.8777792453765869,
        "138": 0.9628880023956299,
        "139": 0.8977921605110168
    }
}
test_model_eval_metri

### Test 3: model.eval: save locally and upload to HF Hub

In [96]:
if upload_to_HF is True:

    name                 = "test_model_eval"
    test_model_eval_path = f"{name}.json"

    with open(test_model_eval_path, "w") as f:
        json.dump(test_model_eval_metrics, f)

    print(f"{name} successfully saved locally to {test_model_eval_path}")

    upload_file(
        path_or_fileobj = test_model_eval_path,
        path_in_repo    = test_model_eval_path,
        repo_id         = repo_id,
        repo_type       = 'dataset',
        commit_message  = f"{name}_{timestamp}"
    )

    print(f"{name} successfully uploaded to HF Hub as {test_model_eval_path}")

test_model_eval successfully saved locally to test_model_eval.json
test_model_eval successfully uploaded to HF Hub as test_model_eval.json


In [97]:
raise Exception("It's the end, I stop here")

Exception: It's the end, I stop here

==========================================================================================================

In [None]:
# Define the weighted loss function

class_weights = torch.tensor([7.68, 2.15, 0.61, 0.47, 0.68, 6.26], dtype=torch.float32).to(device)
loss_fn       = BCEWithLogitsLoss(pos_weight=class_weights)  # For multi-label classification (binary classification per label)

## Class supports, class weigths, weighted loss function

#Reminder:
#*   df_jobs      : <class 'pandas.core.frame.DataFrame'>
#*   df_jobs['id']: <class 'pandas.core.series.Series'>

#dataset = Dataset.from_pandas(df_jobs)
#*   dataset      : <class 'datasets.arrow_dataset.Dataset'>
#*   dataset['id']: <class 'list'>

#*   dataset_dict_jobs : <class 'datasets.dataset_dict.DatasetDict'>
#*   train_dataset     : <class 'datasets.arrow_dataset.Dataset'>
#*   validation_dataset: <class 'datasets.arrow_dataset.Dataset'>
#*   test_dataset      : <class 'datasets.arrow_dataset.Dataset'>


#We calculate the class supports for the train, validation and test datasets; the class weights and the weighted loss function are used for training only; the class supports of validation_dataset and test_dataset are calculated for information only.
# function B
def get_train_class_weights(datasetDict, labels):
  print(f"datasetDict: {type(datasetDict)} shape={datasetDict.shape}\n{datasetDict}")
  print(f"labels: {type(labels)} len={len(labels)}\n{labels}")

  dataset_train      = datasetDict['train']
  dataset_validation = datasetDict['validation']
  dataset_test       = datasetDict['test']

  def calculate_class_supports(dataset, labels):
    class_supports = dataset.map(
        lambda example: {col: example[col] for col in labels},
        batched=True
    ).to_pandas()[labels].sum(axis=0)
    return class_supports

  class_supports = {}

  for split_name, split_dataset in datasetDict.items():
    class_supports[split_name] = calculate_class_supports(split_dataset, labels)

  for split_name, split_class_supports in class_supports.items():
    print(f"{split_name}: {type(split_class_supports)} len={len(split_class_supports)}\n{split_class_supports}")

  train_class_supports_list = class_supports['train'].tolist()
  print(f"train_class_supports_list: {type(train_class_supports_list)} len={len(train_class_supports_list)} {train_class_supports_list}")

  train_class_supports_tensor = torch.tensor(train_class_supports_list, dtype=torch.float32)
  print(f"train_class_supports_tensor: {type(train_class_supports_tensor)} len={len(train_class_supports_tensor)} {train_class_supports_tensor}")

  train_total_samples = dataset_train.num_rows
  print(f"train_total_samples: {train_total_samples}")

  number_of_classes = len(labels)
  print(f"number_of_classes: {number_of_classes}")

  train_class_weights = train_total_samples / (number_of_classes * train_class_supports_tensor)
  print(f"train_class_weights: {type(train_class_weights)} len={len(train_class_weights)} {train_class_weights}")

  train_class_weights_sum = train_class_weights.sum()
  print(f"train_class_weights_sum: {train_class_weights_sum}")

  normalized_train_class_weights = (train_class_weights / train_class_weights_sum) * number_of_classes
  print(f"normalized_train_class_weights: {type(normalized_train_class_weights)} len={len(normalized_train_class_weights)} {normalized_train_class_weights}")

  # Positives samples per label
  supports = train_class_supports_tensor
  print(f"supports: {type(supports)} {len(supports)} {supports}")

  # Negatives samples per label
  negatives = train_total_samples - supports
  print(f"negatives: {type(negatives)} {len(negatives)} {negatives}")

  # pos_weights = negative to positive ratios
  pos_weights = negatives/supports
  print(f"pos_weights: {type(pos_weights)} {len(pos_weights)} {pos_weights}")

  # Normalize using min-max scaling
  normalized_pos_weights_minmax = (pos_weights - pos_weights.min()) / (pos_weights.max() - pos_weights.min())
  print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

  # Normalize using z-score standardization
  normalized_pos_weights_zscore = (pos_weights - pos_weights.mean()) / pos_weights.std()
  print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

  # Normalize using min-max scaling
  normalized_pos_weights_minmax = (pos_weights - pos_weights.min()) / (pos_weights.max() - pos_weights.min())
  print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

  # Normalize using z-score standardization
  normalized_pos_weights_zscore = (pos_weights - pos_weights.mean()) / pos_weights.std()
  print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

  # Normalize using sum-to-one
  normalized_pos_weights_sum1 = pos_weights / pos_weights.sum()
  print(f"normalized_pos_weights_sum1: {type(normalized_pos_weights_sum1)} {len(normalized_pos_weights_sum1)} {normalized_pos_weights_sum1}")

  return normalized_pos_weights_minmax
  #return normalized_pos_weights_zscore
  #return normalized_pos_weights_sum1

pos_weights = get_train_class_weights(datasetDict, labels)

loss_fn = BCEWithLogitsLoss(pos_weight=pos_weights.to(device))  # For multi-label classification (binary classification per label)
print(f"loss_fn: {type(loss_fn)} {loss_fn}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data from your training results
epochs = np.arange(1, 9)
training_loss = [0.3122, 0.2897, 0.2665, 0.2466, 0.2244, 0.2223, 0.2202, 0.2077]
validation_loss = [0.303785, 0.293599, 0.278830, 0.275663, 0.280968, 0.280640, 0.279608, 0.282026]
f1_score = [0.865113, 0.871222, 0.875554, 0.880279, 0.879128, 0.878554, 0.879872, 0.877893]

# Plot Loss
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(epochs, training_loss, label='Training Loss', marker='o')
plt.plot(epochs, validation_loss, label='Validation Loss', marker='s')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training & Validation Loss')
plt.legend()
plt.grid()

# Plot F1 Score
plt.subplot(1, 2, 2)
plt.plot(epochs, f1_score, label='F1 Score', marker='o', color='green')
plt.xlabel('Epochs')
plt.ylabel('F1 Score')
plt.title('F1 Score over Epochs')
plt.legend()
plt.grid()

plt.tight_layout()
plt.show()


In [None]:
example = datasetDict['test'][0]
print(f"datasetDict['test'][0]: {type(example)} {example.keys()}\n{example}")

In [None]:
inputs = tokenizer(
    example['text'],
    truncation     = True,
    padding        = 'max_length',
    max_length     = max_length,
    return_tensors = 'pt'
)

In [None]:
with torch.no_grad():  # Disable gradient calculation during prediction
    outputs = model(
        input_ids=inputs.input_ids.to(device),
        attention_mask=inputs.attention_mask.to(device)
    )


In [None]:
probs = torch.sigmoid(outputs.logits)


In [None]:
best_thresholds = [0.4, 0.3, 0.3, 0.3, 0.4, 0.5]
preds = np.zeros_like(probs)  # Initialize predictions array
for label_idx in range(num_labels):
  preds[:, label_idx] = (probs[:, label_idx] > best_thresholds[label_idx])  #.astype(int)

In [None]:
print(f"probs: {type(probs)} shape={probs.shape}\n{probs}")
print(f"preds: {type(preds)} shape={preds.shape}\n{preds}")

In [None]:
print(f"labels: {type(labels)} {len(labels)}\n{labels}")
# '390': False, '135': False, '136': True, '137': True, '138': True, '139': False}

In [None]:
def tune_thresholds(true_labels, probs, id2label):
  """
  Tune thresholds for each label to maximize F1 alone, as F1 balances precision and recall into a single metric.

  Args:
    true_labels: actual labels for the data                                      (numpy array of shape (num_samples, num_labels))
    probs      : predicted probabilities                                         (numpy array of shape (num_samples, num_labels))
    id2label   : dictionary mapping label indices (int) to label names (string)

  Returns:
    best_thresholds: best threshold for each label                                                      (numpy array of shape (num_labels,))
    best_metrics   : dictionary of best F1, precision_for_best_f1 and recall_for_best_f1 for each label (dictionary of numpy arrays)
  """
  thresholds      = np.linspace(0.1, 0.9, 9)
  best_thresholds = np.zeros(len(id2label))
  best_metrics    = {label: {'f1': 0.0, 'precision': 0.0, 'recall': 0.0} for label in id2label.values()}

  for label_idx, label in id2label.items():
    for threshold in thresholds:
      pred                     = (probs[:, label_idx] > threshold).astype(int)
      precision, recall, f1, _ = precision_recall_fscore_support(true_labels[:, label_idx], pred, average='binary', zero_division=0)
      if f1 > best_metrics[label]['f1']:
        best_thresholds[label_idx]       = threshold
        best_metrics[label]['f1']        = f1
        best_metrics[label]['precision'] = precision
        best_metrics[label]['recall']    = recall

  print("==== tune_thresholds ====")
  print(f"best_thresholds: {type(best_thresholds)} shape={best_thresholds.shape}\n{best_thresholds}")
  print(f"best_metrics   : {type(best_metrics)}    len={len(best_metrics)}      \n{json.dumps(best_metrics, indent=4)}")
  print("=========================")
  print()

  return best_thresholds, best_metrics

In [None]:
def compute_metrics_with_threshold_OLD(probs, label_ids, thresholds, id2label):
    """
    Compute metrics during evaluation or test, by applying tuned thresholds

    average:
    - 'micro'   : gives more weight to frequent labels     → best for imbalanced datasets where frequent labels are more important
    - 'macro'   : treats all labels equally                → best when you care about rare labels as much as frequent ones
    - 'weighted': like macro but considers label frequency → best if you want a compromise between macro and micro

    - 'macro' or 'weighted' AUC is often best because AUC isn't as affected by class imbalance as F1/Precision/Recall
    - 'macro'      AUC: usually the best because it treats all labels equally, avoiding the dominance of frequent labels
    - 'weighted'   AUC: similar to macro but considers label frequency
    - 'macro'   PR AUC: best for imbalanced datasets because it treats rare labels fairly
    - 'weighted PR AUC: also good, but slightly biased toward frequent labels

    PR AUC is better than ROC AUC when you care about positive examples in imbalanced data
    """
    average = 'macro'
    preds   = np.zeros_like(probs)

    # Apply per-label tuned threshold
    for label_idx in id2label.keys():
        preds[:, label_idx] = (probs[:, label_idx] > thresholds[label_idx]).astype(int)

    # Compute metrics
    f1        = f1_score       (label_ids, preds, average=average)
    precision = precision_score(label_ids, preds, average=average)
    recall    = recall_score   (label_ids, preds, average=average)
    accuracy  = accuracy_score (label_ids, preds)

    # Compute AUC scores with error handling
    try:
        roc_auc              = roc_auc_score          (label_ids, probs, average=average)
    except ValueError:
        roc_auc              = 0.0

    try:
        precision_recall_auc = average_precision_score(label_ids, probs, average=average)
    except ValueError:
        precision_recall_auc = 0.0

    # Compute per-class metrics (average = None)
    per_class_f1        = f1_score       (label_ids, preds, average=None)
    per_class_precision = precision_score(label_ids, preds, average=None)
    per_class_recall    = recall_score   (label_ids, preds, average=None)

    # Generate classification report
    report = classification_report(label_ids, preds, target_names=id2label.values(), zero_division=0)

    # Store metrics
    metrics = {
        'f1'                   : f1,
        'precision'            : precision,
        'recall'               : recall,
        'accuracy'             : accuracy,
        'roc_auc'              : roc_auc,
        'precision_recall_auc' : precision_recall_auc,
        'thresholds'           : thresholds.tolist(),
        'classification_report': report,
        'per_class_f1'         : per_class_f1,
        'per_class_precision'  : per_class_precision,
        'per_class_recall'     : per_class_recall
    }

    return metrics

In [None]:
def predict_with_optimized_thresholds_cuda(trainer, dataset, threshold_tuning=False, thresholds=None, threshold=threshold):
    """
    Predicts using trainer.predict(), with optional threshold tuning on GPU

    Parameters:
    - trainer         : Hugging Face Trainer or CustomTrainer instance
    - dataset         : dataset to predict on
    - id2label        : dictionary mapping label indices (int) to label names (string)
    - threshold_tuning: boolean to enable thresholds tuning per class (if evaluation, True, if prediction, False)
    - thresholds      : if evaluation, custom thresholds, if prediction, tuned thresholds (from evaluation)

    Returns:
    - best_thresholds (if threshold_tuning=True): optimized threshold for each label
    - best_metrics                              : computed with tuned thresholds whether for evaluation or prediction

    - metrics (if threshold_tuning=False): computed with fixed thresholds
    - metrics (if threshold_tuning=True): computed with tuned thresholds
    - predictions: final binary predictions
    - label_ids  : ground true labels from the dataset
    - best_thresholds (if threshold_tuning=True): optimized threshold per class
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Auto-detect GPU

    # Predict
    predictions_output = trainer.predict(dataset)        # <class 'transformers.trainer_utils.PredictionOutput'> len   = 3
    predictions_np     = predictions_output.predictions  # <class 'numpy.ndarray'>                               shape = (2400, 6)
    label_ids_np       = predictions_output.label_ids    # <class 'numpy.ndarray'>                               shape = (2400, 6)
    metrics_dict       = predictions_output.metrics      # <class 'dict'> (= trainer.evaluate results)           len   = 10

    print("==== predictions_output ====")
    print(f"predictions_output.predictions: {type(predictions_np)} shape={predictions_np.shape} \n{predictions_np}")
    print(f"predictions_output.label_ids  : {type(label_ids_np)}   shape={label_ids_np.shape}   \n{label_ids_np}")
    print(f"predictions_output.metrics    : {type(metrics_dict)}   len={len(metrics_dict)}      \n{json.dumps(metrics_dict, indent=4)}")

    # Convert NumPy arrays to PyTorch tensors (torch.from_numpy() keeps the NumPy array's memory layout, while torch.tensor() creates a new copy)
    logits      = torch.tensor(predictions_np, device=device)  # Move to GPU
    true_labels = torch.tensor(label_ids_np, device=device)    # Move to GPU

    print(f"logits     : {type(logits)}      shape={logits.shape}     \n{logits}")
    print(f"true_labels: {type(true_labels)} shape={true_labels.shape}\n{true_labels}")

    # Convert logits to probabilities using PyTorch (on GPU)
    probs = torch.sigmoid(logits)  # <class 'torch.Tensor'>  shape = (1200, 6)

    print(f"probs: {type(probs)} shape={probs.shape}\n{probs}")
    print("============================")
    print()

    num_labels = probs.shape[1]

    if threshold_tuning:
        best_thresholds = torch.full((num_labels,), threshold, device=device, dtype=torch.float32)  # Default to threshold

        # Define candidate thresholds (on GPU)
        threshold_candidates = torch.linspace(0.05, 0.95, 19, device=device)

        for label_idx in range(num_labels):
            best_f1 = 0
            for threshold in threshold_candidates:
                preds = (probs[:, label_idx] > threshold).int()

                # Convert to CPU for sklearn
                precision, recall, f1, _ = precision_recall_fscore_support(
                    true_labels[:, label_idx].cpu().numpy(),
                    preds.cpu().numpy(),
                    average='binary',
                    zero_division=0
                )

                if f1 > best_f1:
                    best_f1                    = f1
                    best_thresholds[label_idx] = threshold  # Store best threshold

        return probs, true_labels, best_thresholds

    # Apply provided thresholds (or default to 0.5)
    if thresholds is None:
        thresholds = torch.full((num_labels,), 0.5, device=device)  # Default to 0.5
    else:
        thresholds = torch.tensor(thresholds, device=device)  # Move thresholds to GPU

    preds = (probs > thresholds).int()

    return preds, true_labels, None

In [None]:
def compute_metrics_with_threshold_SAV(probs, label_ids, thresholds, id2label):

  #Compute metrics during evaluation or test, by applying tuned thresholds

  #logits  = eval_preds.predictions
  #labels  = eval_preds.label_ids
  #sigmoid = torch.nn.Sigmoid  # Sigmoid or numpy?
  #probs   = sigmoid(logits).cpu().numpy()
  preds   = np.zeros_like(probs)

  if threshold_tuning:
    # Apply per-label tuned threshold
    for label_idx in id2label.keys():
        preds[:, label_idx] = (probs[:, label_idx] > thresholds[label_idx]).astype(int)
  else:
    # threhsolds = None, apply a fixed threshold to all labels
    for label_idx in id2label.keys():
        preds[:, label_idx] = (probs[:, label_idx] > threshold).astype(int)

  # Compute metrics
  f1                    = f1_score               (label_ids, preds, average='micro')
  precision             = precision_score        (label_ids, preds, average='micro')
  recall                = recall_score           (label_ids, preds, average='micro')
  accuracy              = accuracy_score         (label_ids, preds)
  roc_auc               = roc_auc_score          (label_ids, probs, average='micro')  # multi_class="ovr" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  precision_recall_auc  = average_precision_score(label_ids, probs, average='micro')

  # Use id2label for target_names
  report = classification_report(label_ids, preds, target_names=id2label.values(), zero_division=0)

  if threshold_tuning:
    _thresholds = thresholds.tolist()
  else:
    _thresholds = threshold

  metrics = {
      'f1'                   : f1,
      'precision'            : precision,
      'recall'               : recall,
      'accuracy'             : accuracy,
      'roc_auc'              : roc_auc,
      'precision_recall_auc' : precision_recall_auc,
      'thresholds'           : _thresholds,
      'classification_report': report
  }

  return metrics