<a href="https://colab.research.google.com/github/claudelepere/ML_GitHub/blob/main/Trainer_02_2e_5_1024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q accelerate
!pip install -q huggingface_hub
!pip install -q scikit-learn
!pip install -q transformers datasets  # transformers and datasets are Hugging Face libraries
!pip install -q wandb

import json
import numpy as np
import os
import pandas as pd
import pickle
import sys
import time
import torch
import wandb

from datasets              import DatasetDict
from google.colab          import auth, drive, files, userdata
from huggingface_hub       import create_repo, login, upload_file, hf_hub_download
from huggingface_hub.utils import RepositoryNotFoundError
from sklearn.metrics       import accuracy_score, average_precision_score, classification_report, f1_score, precision_score, precision_recall_fscore_support, recall_score, roc_auc_score
from torch.utils.data      import DataLoader
from tqdm.auto             import tqdm
from transformers          import AdamW, EvalPrediction, LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments
from torch.nn              import BCEWithLogitsLoss


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
from google.colab    import userdata
from huggingface_hub import login, hf_hub_download

# Hugging Face Authenticate
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")    # Store the key in os.environ
hf_token               = os.environ.get('HF_TOKEN')
login(token=hf_token)

# Verify
!huggingface-cli whoami

file_path = hf_hub_download(
    repo_id   ="claudelepere/skill_classification",
    repo_type = "dataset",
    filename  = "test_model_eval_results.csv"
)
print(f"file_path: {file_path}")  # /root/.cache/huggingface/hub/datasets--claudelepere--skill_classification/snapshots/51ead81f69b1689fc19694b3f034585cde9f56e1/test_model_eval_results.csv

# Next, open in Colab or download to local

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


claudelepere


test_model_eval_results.csv:   0%|          | 0.00/156k [00:00<?, ?B/s]

file_path: /root/.cache/huggingface/hub/datasets--claudelepere--skill_classification/snapshots/2550f18a29dd7e109d879cf85385d0057a0f5b59/test_model_eval_results.csv


In [3]:
"""
# Check the Python version
print(sys.version)
print()

# Get the installed packages (you can see that conda is not installed (do not install it))
!pip list
print()

# Check system information
!cat /etc/os-release
!uname -m
print()

# Check the GPU details (only if the runtime type is T4 GPU)
#!nvidia-smi
#print()

# Check RAM
!free -h
print()

# Check disk space
!df -h
print()

# Get environment variables
for key, value in os.environ.items():
    print(f"{key}: {value}")
"""
!python -V

print(f"currentdir: {os.getcwd()}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

datasetDict_zip_file_name = "dataset_11_12000.zip"
datasetDict_dir_name      = os.path.splitext(datasetDict_zip_file_name)[0]
print(f"datasetDict_zip_file_name: {datasetDict_zip_file_name}")
print(f"datasetDict_dir_name     : {datasetDict_dir_name}")
print()

# OOM: reduce batch size
#      small sizes (1 to 32):            PROs: better generalization in some cases
#                                        CONs: may produce noisier gradients
#      large sizes (128, 256, or higer): PROs: gradients are smoother, leading to more stable training
#                                        CONs: poorer generalization (overfitting) in some cases
#      intermediate sizes (32, 64):      combines the benefits of small and large sizes
batch_size = 8

# OOM: enable gradient accumulation to compensate for smaller batch sizes by accumulating gradients over several steps
#      effective batch size = per-device batch size x gradient accumulation steps;
#      in each iteration, the model computes the gradients, these gradients are immediately used to update the model parameters
gradient_accumulation_steps = 4  #<<<<<<<<<<<<<<<<<<< gradient_accumulation_steps may not be None => comment it in TrainingArguments

# OOM: use PYTORCH_CUDA_ALLOC_CONF to handle memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# OOM: check for zombie processes
if torch.cuda.is_available():
  !nvidia-smi
  torch.cuda.memory_summary()
!ps aux | grep python
#!kill -9 <PID>
#!nvidia-smi     # Checked if killed

# OOM: use fp16 (half precision) mixed precision training
#      reduces memory requirements by up to 50%
fp16 = True

# OOM: limit the number of GPU workers: 0 (default) or 1 in Colab
#dataloader_num_workers = 1

# OOM: reduce model size or input tokens
#      1) LongformerTokenizer.from_pretrained('allenai/longformer-base/large-4096'): large/base: 435M/149M parameters
#      2) max_length: 4096 max for Longformer; 1 word can give several tokens, stop words are NOT discarded!
#         word_text_length_counts_sorted: jobs count                 : 50000
#                                         jobs count under  512 words: 44794  89.59%
#                                         jobs count under  640 words: 47894  95.79%
#                                         jobs count under  768 words: 49123  98.25%
#                                         jobs count under  896 words: 49691  99.38%
#                                         jobs count under 1024 words: 49917  99.83%
#                                         jobs count under 2048 words: 50000 100.00%
#                                         jobs count under 4096 words: 50000 100.00%
#max_length =  768    #37 min
max_length = 1024    #38 min         # GPU RAM: 12.2 / 40 GB
#max_length = 2048    #1 hr 10 min    # GPU RAM: 21.4 / 40 GB
#max_length = 4096    #2 hr 10 min    # GPU RAM: 39.5 / 40 GB => OutOfMemoryError

# OOM: free up GPU memory
torch.cuda.empty_cache()

# OOM: monitor GPU memory usage
#!nividia-smi

# 1 epoch is a complete pass through the entire training dataset;
# with n datapoints and batch size = b, n/b iterations to complete 1 epoch;
# 1 iteration is a single update of the model's parameters
epochs = 5

# A common rule is to scale the learning rate proportionaly with the effective batch size
# note: get_linear_schedule_with_warmup <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
learning_rate = 2e-5  # 1e-5 x 32/8

# Reduce the number of transformers layers
#hidden_layers = 12    # 12 (default) or 6

# Threshold: 0.5 (default)
threshold = 0.2

if fp16:
  _fp = "fp16"
else:
  _fp = "fp32"

if 'gradient_accumulation_steps' not in globals():
  run_name = f"Longformer-multilabel-{datasetDict_dir_name}-length{max_length}-batch{batch_size}-epochs{epochs}-lr{learning_rate}-{_fp}-threshold{threshold}"
else:
  run_name = f"Longformer-multilabel-{datasetDict_dir_name}-length{max_length}-batch{batch_size}x{gradient_accumulation_steps}-epochs{epochs}-lr{learning_rate}-{_fp}-threshold{threshold}"
print(f"run_name                 : {run_name}")

Python 3.10.12
currentdir: /content
device: cuda
datasetDict_zip_file_name: dataset_11_12000.zip
datasetDict_dir_name     : dataset_11_12000

Sun Jan 12 10:20:32 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              43W / 400W |      5MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+---------------

In [4]:
def upload_unzip_dataset(file_name=datasetDict_zip_file_name):
  # Check if the file exists
  if not os.path.exists(file_name):
    print(f"'{file_name}' not found in /content. Uploading...")
    uploaded_files = files.upload()                              # Prompt file upload dialog
    if file_name not in uploaded_files:
      raise FileNotFoundError(f"'{file_name}' was not uploaded. Please try again.")
    print(f"'{file_name}' successfully uploaded to /content")
    uploaded_file_name = list(uploaded_files.keys())[0]          # Get the name of the uploaded file

    !unzip {uploaded_file_name}

    unzipped_dir_name = os.path.splitext(uploaded_file_name)[0]
    assert unzipped_dir_name==datasetDict_dir_name, "unzipped_dir_name != datasetDict_dir_name"
  else:
    print(f"'{datasetDict_dir_name}' already exists in /content.")


In [5]:
upload_unzip_dataset(datasetDict_zip_file_name)

'dataset_11_12000.zip' not found in /content. Uploading...


Saving dataset_11_12000.zip to dataset_11_12000.zip
'dataset_11_12000.zip' successfully uploaded to /content
Archive:  dataset_11_12000.zip
  inflating: dataset_11_12000/dataset_dict.json  
  inflating: dataset_11_12000/test/data-00000-of-00001.arrow  
  inflating: dataset_11_12000/test/dataset_info.json  
  inflating: dataset_11_12000/test/state.json  
  inflating: dataset_11_12000/train/data-00000-of-00001.arrow  
  inflating: dataset_11_12000/train/dataset_info.json  
  inflating: dataset_11_12000/train/state.json  
  inflating: dataset_11_12000/validation/data-00000-of-00001.arrow  
  inflating: dataset_11_12000/validation/dataset_info.json  
  inflating: dataset_11_12000/validation/state.json  


In [6]:
# Hugging Face Authenticate

os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")    # Store the key in os.environ
hf_token               = os.environ.get('HF_TOKEN')
login(token=hf_token)

# Verify
!huggingface-cli whoami

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


claudelepere


In [7]:
# Create the skill_classification repo on the Hugging Face Hub

HF_name         = "claudelepere/skill_classification"
repo_id_model   = HF_name
repo_id_dataset = HF_name

repo_model_url = create_repo(
    repo_id   = repo_id_model,
    repo_type = "model",
    private   = True,
    exist_ok  = True
)
print(f"Repo model url: {repo_model_url} created successfully as a private repo.")

repo_dataset_url = create_repo(
    repo_id   = repo_id_dataset,
    repo_type = "dataset",
    private   = True,
    exist_ok  = True
)
print(f"Repo datasets url: {repo_dataset_url} created successfully as a private repo.")

repo_id_dataset = f"datasets/{HF_name}"

print(f"repo_id_model: {repo_id_model}")
print(f"repo_id_dataset: {repo_id_dataset}")

Repo model url: https://huggingface.co/claudelepere/skill_classification created successfully as a private repo.
Repo datasets url: https://huggingface.co/datasets/claudelepere/skill_classification created successfully as a private repo.
repo_id_model: claudelepere/skill_classification
repo_id_dataset: datasets/claudelepere/skill_classification


In [8]:
# W&B initialization

os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")        # Store the key in os.environ
wandb_api_key               = os.environ.get('WANDB_API_KEY')
wandb.login(key=wandb_api_key)

try:
  wandb.init(
      project = "skill_classification",
      name    = run_name,
      entity  = "claudelepere-c-cile-cy",
      config  = {
          "learning_rate": learning_rate,
          "epochs"       : 5,
          "batch_size"   : batch_size
      }
  )
except wandb.errors.CommError as err:
  print(f"CommError: {err}")
except Exception as exc:
  print(f"Exception: {exc}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mclaudelepere[0m ([33mclaudelepere-c-cile-cy[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [9]:
# Create the dataset: 3 Hugging Face Dataset in a Hugging Face DatasetDict

datasetDict = DatasetDict.load_from_disk(datasetDict_dir_name)

print(f"datasetDict: {type(datasetDict)} {datasetDict.shape}\n{datasetDict}")
print(f"datasetDict.keys(): {datasetDict.keys()}")
print(f"datasetDict['train']: {type(datasetDict['train'])} {datasetDict['train'].shape}")
print(f"datasetDict['validation']: {type(datasetDict['validation'])} {datasetDict['validation'].shape}")
print(f"datasetDict['test']: {type(datasetDict['test'])} {datasetDict['test'].shape}")

datasetDict: <class 'datasets.dataset_dict.DatasetDict'> {'train': (9600, 8), 'validation': (1200, 8), 'test': (1200, 8)}
DatasetDict({
    train: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 1200
    })
})
datasetDict.keys(): dict_keys(['train', 'validation', 'test'])
datasetDict['train']: <class 'datasets.arrow_dataset.Dataset'> (9600, 8)
datasetDict['validation']: <class 'datasets.arrow_dataset.Dataset'> (1200, 8)
datasetDict['test']: <class 'datasets.arrow_dataset.Dataset'> (1200, 8)


In [10]:
example = datasetDict['train'][0]
print(f"datasetDict['train'][0]: {type(example)} {example.keys()}\n{example}")

datasetDict['train'][0]: <class 'dict'> dict_keys(['id', 'text', '390', '135', '136', '137', '138', '139'])
{'id': 153918, 'text': "Proximus SpearIT - Senior Web Security Engineer F5, Security, Firewall, LTM, ASM Proximus SpearIT Job description Install, deploy, manage and operate security solutions based on Web Application Firewall systems Mandatory knowledge of F5 Technologies (LTM and ASM) Good Knowledge of Firewall, Proxy, Networking (Routing & Switching) systems and technologies Testing of new configuration, equipment and releases Reporting on network/security usage and performance Participate to project and deployment of new security architecture Deliver technical solutions and improve the level of protection of the system Develop the technical documentation regarding the operational procedures Be part in the security incident response process and participate to On-Duty staffing outside business hours. Job requirements Advanced skills in F5 (LTM and ASM) architectures, solutions 

In [11]:
# Create the label list and the id2label and label2id mappings.

"""
dataset 7_1000_125_125  ,  48 labels
dataset 7_128_18_54     ,  42 labels
dataset 8910_1087_68_204, 206 labels
dataset 11_1000         ,   6 labels
"""

labels = [label for label in datasetDict['train'].features.keys() if label not in ['id', 'text']]
labels.sort()
print(f"labels: {type(labels)} {len(labels)}\n{labels}")

id2label = {idx:label for idx, label in enumerate(labels)}
print(f"id2label: {type(id2label)} {len(id2label)}\n{id2label}")

label2id = {label:idx for idx, label in enumerate(labels)}
print(f"label2id: {type(label2id)} {len(label2id)}\n{label2id}")

labels: <class 'list'> 6
['135', '136', '137', '138', '139', '390']
id2label: <class 'dict'> 6
{0: '135', 1: '136', 2: '137', 3: '138', 4: '139', 5: '390'}
label2id: <class 'dict'> 6
{'135': 0, '136': 1, '137': 2, '138': 3, '139': 4, '390': 5}


In [12]:
# Load tokenizer and model

model_name = "allenai/longformer-base-4096"

tokenizer = LongformerTokenizerFast.from_pretrained(model_name)

model = LongformerForSequenceClassification.from_pretrained(
    model_name,
    num_labels        = len(labels),
#    num_hidden_layers = hidden_layers,
    problem_type      = 'multi_label_classification')

# Configure attention window size
model.config.attention_window = 512

optimizer = AdamW(model.parameters(), lr=learning_rate)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Tokenize ('input_ids' and 'attention_mask'), add 'global_attention_mask' (for Longformer), add 'labels'

def preprocess_data(examples, indices):
  # Step 1: Extract text and tokenize
  text = examples['text']             # Batch of texts
  encoding = tokenizer(
      text,                           # Tokenize text
      truncation     = True,
      padding        = 'max_length',
      max_length     = max_length,
      return_tensors = 'pt'           # Return PyTorch tensors
  )

  # Step 2: Create and add the global attention mask
  global_attention_mask             = torch.zeros_like(encoding['input_ids'])  # Initialize global attention mask with zeros (same shape as input_ids)
  global_attention_mask[:, 0]       = 1                                        # Set global attention on the first token ([CLS], token ID=0) in each sequence
  encoding['global_attention_mask'] = global_attention_mask                    # Add the global_attention_mask to the batch

  # Step 3: Create and populate the label matrix
  labels_matrix = torch.zeros((len(text), len(labels)), dtype=torch.float32)   # Create an empty label matrix
  #print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")
  #---------Populate label matrix
  for idx, label in enumerate(labels):
    #print(f"idx:{idx} label:{label}")
    if label in examples:
      labels_matrix[:, idx] = torch.tensor(
          [1.0 if val else 0.0 for val in examples[label]],
          dtype=torch.float32
          )
  print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")

  encoding['labels'] = labels_matrix                                           # Add labels to the encoding
  print(f"encoding['labels']: {type(encoding['labels'])} {encoding['labels'].shape}")

  # encoding: <class 'transformers.tokenization_utils_base.BatchEncoding'> dict_keys(['input_ids', 'attention_mask', 'global_attention_mask', 'labels'])
  #   'input_ids': tensor([[
  #   'attention_mask': tensor([[
  #   'global_attention_mask': tensor([[
  #   'labels': tensor([[
  #print(f"1 preprocess_data call: encoding: {type(encoding)} {encoding.keys()}")

  return encoding

In [14]:
# Create the 3 encoded datasets, train, validation and test

encoded_dataset = datasetDict.map(
    preprocess_data,
    batched        = True,
    remove_columns = datasetDict['train'].column_names,
    with_indices   = True
)
#train_dataset      = encoded_dataset['train']
#validation_dataset = encoded_dataset['validation']
#test_dataset       = encoded_dataset['test']

#train_labels_list_of_lists = train_dataset['labels'].tolist()
#print("=============================================")
#print(f"encoded_dataset: {type(encoded_dataset)} {encoded_dataset.shape}\n{encoded_dataset}")
#print(f"train_dataset: {type(train_dataset)} {train_dataset.shape} {train_dataset.features}\n{train_dataset}")
#print(f"validation_dataset: {type(validation_dataset)} {validation_dataset.shape} {validation_dataset.features}")
#print(f"test_dataset: {type(test_dataset)} {test_dataset.shape} {test_dataset.features}")
#print("---")
#print(f"train_dataset['labels']: {type(train_dataset['labels'])} len={len(train_dataset['labels'])}\n{train_dataset['labels']}")
#print("---")
#print(f"train_dataset[0]['input_ids']: {type(train_dataset[0]['input_ids'])} {len(train_dataset[0]['input_ids'])}\n{train_dataset['input_ids'][0]}")
#print(f"train_dataset[0]['attention_mask']: {type(train_dataset[0]['attention_mask'])} {len(train_dataset[0]['attention_mask'])}\n{train_dataset['attention_mask'][0]}")
#print(f"train_dataset[0]['global_attention_mask']: {type(train_dataset[0]['global_attention_mask'])} {len(train_dataset[0]['global_attention_mask'])}\n{train_dataset['global_attention_mask'][0]}")
#print(f"train_dataset[0]['labels']: {type(train_dataset[0]['labels'])} {len(train_dataset[0]['labels'])} {train_dataset[0]['labels']}")
#print(f"train_dataset['labels'][0]: {type(train_dataset['labels'][0])} {len(train_dataset['labels'][0])}\n{train_dataset['labels'][0]}")

encoded_dataset.set_format('torch')
train_dataset      = encoded_dataset['train']
validation_dataset = encoded_dataset['validation']
test_dataset       = encoded_dataset['test']
print(f"train_dataset_tensor: {type(train_dataset)} {train_dataset.shape} {train_dataset.features}\n{train_dataset}")
print(f"train_dataset_tensor['input_ids']:             {type(train_dataset['input_ids'])}             len={len(train_dataset['input_ids'])}             shape={train_dataset['input_ids'].shape}            ") #\n{train_dataset['input_ids']}")
print(f"train_dataset_tensor['attention_mask']:        {type(train_dataset['attention_mask'])}        len={len(train_dataset['attention_mask'])}        shape={train_dataset['attention_mask'].shape}       ") #\n{train_dataset['attention_mask']}")
print(f"train_dataset_tensor['global_attention_mask']: {type(train_dataset['global_attention_mask'])} len={len(train_dataset['global_attention_mask'])} shape={train_dataset['global_attention_mask'].shape}") #\n{train_dataset['global_attention_mask']}")
print(f"train_dataset_tensor['labels']:                {type(train_dataset['labels'])}                len={len(train_dataset['labels'])}                shape={train_dataset['labels'].shape}               ") #\n{train_dataset['labels']}")



Map:   0%|          | 0/9600 [00:00<?, ? examples/s]

labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([200, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([200, 6])


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([200, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([200, 6])
train_dataset_tensor: <class 'datasets.arrow_dataset.Dataset'> (9600, 4) {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'global_attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)}
Dataset({
    features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
    num_rows: 9600
})
train_dataset_tensor['input_ids']:             <class 'torch.Tensor'>             len=9600             shape=torch.Size([9600, 1024])            
train_dataset_tensor['attention_mask']:        <class 'torch.Tensor'>      

In [15]:
# Truncated part

def get_truncated_part(text):
  tokens = tokenizer(
      text,
      truncation                = True,
      padding                   = 'max_length',
      max_length                = max_length,
      return_overflowing_tokens = True,
      return_tensors            = None
      )
  print(f"tokens.keys(): {tokens.keys()}")

  # Get the truncated tokens
  truncated_ids = tokens["input_ids"][0]
  print(f"truncated_ids: {type(truncated_ids)} {truncated_ids}")
  #overflow_ids  = tokens["overflow_to_sample_mapping"][0]
  #print(f"overflow_ids: {type(overflow_ids)} {overflow_ids}")

  # Decode the tokens back to text
  truncated_text = tokenizer.decode(truncated_ids, skip_special_tokens=True)
  #overflow_text  = tokenizer.decode(overflow_ids, skip_special_tokens=True)

  print(f"original_text :\n{text}")
  print(f"truncated_text:\n{truncated_text}")
  #print(f"overflow_text:\n{overflow_text}")

  original_tokens  = tokenizer.tokenize(text)
  truncated_tokens = tokenizer.tokenize(truncated_text)
  #overflow_tokens  = tokenizer.tokenize(overflow_text)

  print(f"original_tokens count : {len(original_tokens)}")
  print(f"truncated_tokens count: {len(truncated_tokens)}")
  #print(f"overflow_tokens count: {len(overflow_tokens)}")


In [16]:
example_text = datasetDict['train'][0]['text']
#get_truncated_part(example_text)

inputs = tokenizer(
    example_text,
    truncation     = True,
    padding        = 'max_length',
    max_length     = max_length,
    return_tensors = 'pt'
)

# inputs: <class 'transformers.tokenization_utils_base.BatchEncoding'> dict_keys(['input_ids', 'attention_mask'])
#   {'input_ids': tensor([[
#   'attention_mask': tensor([[
#print(f"inputs: {type(inputs)} {inputs.keys()}") #\n{inputs}")
#print(f"inputs_ids: {type(inputs.input_ids)} {inputs.input_ids.shape}\n{inputs.input_ids}")
#print(f"attention_mask: {type(inputs.attention_mask)} {inputs.attention_mask.shape}\n{inputs.attention_mask}")
#print(f"labels: {inputs.labels.shape}")

In [17]:
# 4. Forward pass for multi-label classification

outputs = model(
    input_ids      = inputs.input_ids,
    attention_mask = inputs.attention_mask
    )

print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

# Logits (= raw model outputs)
logits = outputs.logits
print(f"logits: {type(logits)} {logits.shape}\n{logits}")

# Convert logits to probabilities
sigmoid = torch.nn.Sigmoid()
probs   = sigmoid(logits)
print(f"probs: {type(probs)} {probs.shape}\n{probs}")

Initializing global attention on CLS token...


outputs: <class 'transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput'> odict_keys(['logits'])
LongformerSequenceClassifierOutput(loss=None, logits=tensor([[-0.0251, -0.0806,  0.1326,  0.2702,  0.1157, -0.2547]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, global_attentions=None)
logits: <class 'torch.Tensor'> torch.Size([1, 6])
tensor([[-0.0251, -0.0806,  0.1326,  0.2702,  0.1157, -0.2547]],
       grad_fn=<AddmmBackward0>)
probs: <class 'torch.Tensor'> torch.Size([1, 6])
tensor([[0.4937, 0.4799, 0.5331, 0.5671, 0.5289, 0.4367]],
       grad_fn=<SigmoidBackward0>)


In [18]:
example = encoded_dataset['train'][0]

print(f"example: {type(example)} {example.keys()}\n{example}")
print()
#print(f"example['input_ids']: {type(example['input_ids'])} {len(example['input_ids'])}\n{example['input_ids']}")
#print(f"example['attention_mask']: {type(example['attention_mask'])} {len(example['attention_mask'])}\n{example['attention_mask']}")
#print(f"example['labels']:  {type(example['labels'])} {len(example['labels'])}\n{example['labels']}")

example: <class 'dict'> dict_keys(['input_ids', 'attention_mask', 'global_attention_mask', 'labels'])
{'input_ids': tensor([    0, 10653,  1178,  ...,     1,     1,     1]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]), 'global_attention_mask': tensor([1, 0, 0,  ..., 0, 0, 0]), 'labels': tensor([0., 0., 1., 1., 0., 0.])}



In [19]:
tokenizer.decode(example['input_ids'])

"<s>Proximus SpearIT - Senior Web Security Engineer F5, Security, Firewall, LTM, ASM Proximus SpearIT Job description Install, deploy, manage and operate security solutions based on Web Application Firewall systems Mandatory knowledge of F5 Technologies (LTM and ASM) Good Knowledge of Firewall, Proxy, Networking (Routing & Switching) systems and technologies Testing of new configuration, equipment and releases Reporting on network/security usage and performance Participate to project and deployment of new security architecture Deliver technical solutions and improve the level of protection of the system Develop the technical documentation regarding the operational procedures Be part in the security incident response process and participate to On-Duty staffing outside business hours. Job requirements Advanced skills in F5 (LTM and ASM) architectures, solutions and products Proven professional experience of several years (>4 years) in a similar operational environment Master's degree in 

In [20]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['137', '138']

In [21]:
# Set PyTorch format to ensures correctness and compatibility with PyTorch pipelines

# The 3 Hugging Face Dataset are formatted as PyTorch Dataset
encoded_dataset.set_format('torch')

In [22]:
batch_size  = batch_size
metric_name = "f1"

In [23]:
training_args = TrainingArguments(
    output_dir                  = './training_results',  # where model predictions and checkpoints will be written during training
    overwrite_output_dir        = True,
    logging_dir                 = './logs',
    logging_steps               = 50,
    save_steps                  = 500,
    save_total_limit            = 2,
    eval_strategy               = 'epoch',
    save_strategy               = 'epoch',
    learning_rate               = learning_rate,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    num_train_epochs            = epochs,
    weight_decay                = 0.01,
    load_best_model_at_end      = True,
    metric_for_best_model       = metric_name,
    run_name                    = run_name,
    fp16                        = fp16,
    #dataloader_num_workers      = dataloader_num_workers,
    report_to                  = 'wandb'
    )

In [24]:
# Metrics
#   source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/

def multi_label_metrics(predictions, labels):
    average = 'micro'    # 'micro' or 'weighted'

    # first, apply sigmoid on predictions whose shape is (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs   = sigmoid(torch.Tensor(predictions))

    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1

    # finally, compute metrics
    y_true               = labels
    f1                   = f1_score               (y_true=y_true, y_pred=y_pred, average=average)    #, zero_division=1)
    precision            = precision_score        (y_true=y_true, y_pred=y_pred, average=average)    #, zero_division=1)
    recall               = recall_score           (y_true=y_true, y_pred=y_pred, average=average)    #, zero_division=1)
    roc_auc              = roc_auc_score          (y_true=y_true, y_score=probs, average=average)
    precision_recall_auc = average_precision_score(y_true=y_true, y_score=probs, average=average)
    accuracy             = accuracy_score         (y_true=y_true, y_pred=y_pred)

    # return as dictionary
    metrics = {
        'f1'                  : f1,
        'precision'           : precision,
        'recall'              : recall,
        'roc_auc'             : roc_auc,
        'precision_recall_auc': precision_recall_auc,
        'accuracy'            : accuracy
        }

    return metrics

In [25]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(
        predictions = preds,
        labels      = p.label_ids
        )
    return result

Let's verify a batch as well as a forward pass:

In [26]:
print(f"input_ids:              {type(encoded_dataset['train']['input_ids'][0])}\t{encoded_dataset['train']['input_ids'][0].shape}")
print(f"attention_mask:         {type(encoded_dataset['train']['attention_mask'][0])}\t{encoded_dataset['train']['attention_mask'][0].shape}")
print(f"global_attention_mask:  {type(encoded_dataset['train']['global_attention_mask'][0])}\t{encoded_dataset['train']['global_attention_mask'][0].shape}")
print(f"labels:                 {type(encoded_dataset['train'][0]['labels'])}\t{encoded_dataset['train'][0]['labels'].shape}")

input_ids:              <class 'torch.Tensor'>	torch.Size([1024])
attention_mask:         <class 'torch.Tensor'>	torch.Size([1024])
global_attention_mask:  <class 'torch.Tensor'>	torch.Size([1024])
labels:                 <class 'torch.Tensor'>	torch.Size([6])


In [27]:
# Execute a forward pass for debugging or verification purposes (cf. BERT_3_1 in Notion BERT database)

outputs = model(
    input_ids      = encoded_dataset['train']['input_ids'][0].unsqueeze(0),
    attention_mask = encoded_dataset['train']['attention_mask'][0].unsqueeze(0),
    labels         = encoded_dataset['train'][0]['labels'].unsqueeze(0)
    )

print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

outputs: <class 'transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput'> odict_keys(['loss', 'logits'])
LongformerSequenceClassifierOutput(loss=tensor(0.6428, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.0251, -0.0806,  0.1326,  0.2702,  0.1157, -0.2547]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, global_attentions=None)


In [28]:
"""
# Define the weighted loss function

class_weights = torch.tensor([7.68, 2.15, 0.61, 0.47, 0.68, 6.26], dtype=torch.float32).to(device)
loss_fn       = BCEWithLogitsLoss(pos_weight=class_weights)  # For multi-label classification (binary classification per label)
"""

'\n# Define the weighted loss function\n\nclass_weights = torch.tensor([7.68, 2.15, 0.61, 0.47, 0.68, 6.26], dtype=torch.float32).to(device)\nloss_fn       = BCEWithLogitsLoss(pos_weight=class_weights)  # For multi-label classification (binary classification per label)\n'

## Class supports, class weigths, weighted loss function

Reminder:
*   df_jobs      : <class 'pandas.core.frame.DataFrame'>
*   df_jobs['id']: <class 'pandas.core.series.Series'>

dataset = Dataset.from_pandas(df_jobs)
*   dataset      : <class 'datasets.arrow_dataset.Dataset'>
*   dataset['id']: <class 'list'>

*   dataset_dict_jobs : <class 'datasets.dataset_dict.DatasetDict'>
*   train_dataset     : <class 'datasets.arrow_dataset.Dataset'>
*   validation_dataset: <class 'datasets.arrow_dataset.Dataset'>
*   test_dataset      : <class 'datasets.arrow_dataset.Dataset'>


We calculate the class supports for the train, validation and test datasets; the class weights and the weighted loss function are used for training only; the class supports of validation_dataset and test_dataset are calculated for information only.

In [29]:
"""
def get_train_class_weights(datasetDict, labels):
  print(f"datasetDict: {type(datasetDict)} shape={datasetDict.shape}\n{datasetDict}")
  print(f"labels: {type(labels)} len={len(labels)}\n{labels}")

  dataset_train      = datasetDict['train']
  dataset_validation = datasetDict['validation']
  dataset_test       = datasetDict['test']

  def calculate_class_supports(dataset, labels):
    class_supports = dataset.map(
        lambda example: {col: example[col] for col in labels},
        batched=True
    ).to_pandas()[labels].sum(axis=0)
    return class_supports

  class_supports = {}

  for split_name, split_dataset in datasetDict.items():
    class_supports[split_name] = calculate_class_supports(split_dataset, labels)

  for split_name, split_class_supports in class_supports.items():
    print(f"{split_name}: {type(split_class_supports)} len={len(split_class_supports)}\n{split_class_supports}")

  train_class_supports_list = class_supports['train'].tolist()
  print(f"train_class_supports_list: {type(train_class_supports_list)} len={len(train_class_supports_list)} {train_class_supports_list}")

  train_class_supports_tensor = torch.tensor(train_class_supports_list, dtype=torch.float32)
  print(f"train_class_supports_tensor: {type(train_class_supports_tensor)} len={len(train_class_supports_tensor)} {train_class_supports_tensor}")

  train_total_samples = dataset_train.num_rows
  print(f"train_total_samples: {train_total_samples}")

  number_of_classes = len(labels)
  print(f"number_of_classes: {number_of_classes}")

  train_class_weights = train_total_samples / (number_of_classes * train_class_supports_tensor)
  print(f"train_class_weights: {type(train_class_weights)} len={len(train_class_weights)} {train_class_weights}")

  train_class_weights_sum = train_class_weights.sum()
  print(f"train_class_weights_sum: {train_class_weights_sum}")

  normalized_train_class_weights = (train_class_weights / train_class_weights_sum) * number_of_classes
  print(f"normalized_train_class_weights: {type(normalized_train_class_weights)} len={len(normalized_train_class_weights)} {normalized_train_class_weights}")

  # Positives samples per label
  supports = train_class_supports_tensor
  print(f"supports: {type(supports)} {len(supports)} {supports}")

  # Negatives samples per label
  negatives = train_total_samples - supports
  print(f"negatives: {type(negatives)} {len(negatives)} {negatives}")

  # pos_weights = negative to positive ratios
  pos_weights = negatives/supports
  print(f"pos_weights: {type(pos_weights)} {len(pos_weights)} {pos_weights}")

  # Normalize using min-max scaling
  normalized_pos_weights_minmax = (pos_weights - pos_weights.min()) / (pos_weights.max() - pos_weights.min())
  print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

  # Normalize using z-score standardization
  normalized_pos_weights_zscore = (pos_weights - pos_weights.mean()) / pos_weights.std()
  print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

  # Normalize using min-max scaling
  normalized_pos_weights_minmax = (pos_weights - pos_weights.min()) / (pos_weights.max() - pos_weights.min())
  print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

  # Normalize using z-score standardization
  normalized_pos_weights_zscore = (pos_weights - pos_weights.mean()) / pos_weights.std()
  print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

  # Normalize using sum-to-one
  normalized_pos_weights_sum1 = pos_weights / pos_weights.sum()
  print(f"normalized_pos_weights_sum1: {type(normalized_pos_weights_sum1)} {len(normalized_pos_weights_sum1)} {normalized_pos_weights_sum1}")

  return normalized_pos_weights_minmax
  #return normalized_pos_weights_zscore
  #return normalized_pos_weights_sum1

pos_weights = get_train_class_weights(datasetDict, labels)

loss_fn = BCEWithLogitsLoss(pos_weight=pos_weights.to(device))  # For multi-label classification (binary classification per label)
print(f"loss_fn: {type(loss_fn)} {loss_fn}")
"""

'\ndef get_train_class_weights(datasetDict, labels):\n  print(f"datasetDict: {type(datasetDict)} shape={datasetDict.shape}\n{datasetDict}")\n  print(f"labels: {type(labels)} len={len(labels)}\n{labels}")\n\n  dataset_train      = datasetDict[\'train\']\n  dataset_validation = datasetDict[\'validation\']\n  dataset_test       = datasetDict[\'test\']\n\n  def calculate_class_supports(dataset, labels):\n    class_supports = dataset.map(\n        lambda example: {col: example[col] for col in labels},\n        batched=True\n    ).to_pandas()[labels].sum(axis=0)\n    return class_supports\n\n  class_supports = {}\n\n  for split_name, split_dataset in datasetDict.items():\n    class_supports[split_name] = calculate_class_supports(split_dataset, labels)\n\n  for split_name, split_class_supports in class_supports.items():\n    print(f"{split_name}: {type(split_class_supports)} len={len(split_class_supports)}\n{split_class_supports}")\n\n  train_class_supports_list = class_supports[\'train\'].to

In [30]:
def get_class_weights(labels=encoded_dataset['train']['labels']):
  print(f"labels: {type(labels)} len={len(labels)} shape={labels.shape}\n{labels}")

  num_samples, num_labels = labels.shape
  print(f"num_samples: {type(num_samples)} {num_samples}")
  print(f"num_labels:  {type(num_labels)}  {num_labels}")

  class_counts = labels.sum(dim=0)
  print(f"class_counts: {type(class_counts)} len={len(class_counts)}\n{class_counts}")

  pos_weights = (num_samples-class_counts) / class_counts
  print(f"pos_weights: {type(pos_weights)} len={len(pos_weights)}\n{pos_weights}")

  normalized_pos_weights_minmax = (pos_weights - pos_weights.min()) / (pos_weights.max() - pos_weights.min())
  print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

  normalized_pos_weights_zscore = (pos_weights - pos_weights.mean()) / pos_weights.std()
  print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

  normalized_pos_weights_sum1 = pos_weights / pos_weights.sum()
  print(f"normalized_pos_weights_sum1: {type(normalized_pos_weights_sum1)} {len(normalized_pos_weights_sum1)} {normalized_pos_weights_sum1}")

  #return pos_weights
  #return normalized_pos_weights_minmax
  #return normalized_pos_weights_zscore
  return normalized_pos_weights_sum1

pos_weights = get_class_weights()

loss_fn = BCEWithLogitsLoss(pos_weight=pos_weights.to(device))

#raise Exception("Stop here")

labels: <class 'torch.Tensor'> len=9600 shape=torch.Size([9600, 6])
tensor([[0., 0., 1., 1., 0., 0.],
        [1., 1., 0., 0., 0., 1.],
        [0., 1., 1., 1., 0., 0.],
        ...,
        [0., 0., 1., 1., 1., 0.],
        [0., 0., 1., 1., 1., 0.],
        [0., 0., 0., 1., 1., 0.]])
num_samples: <class 'int'> 9600
num_labels:  <class 'int'>  6
class_counts: <class 'torch.Tensor'> len=6
tensor([ 554., 1789., 6639., 8754., 6055.,  715.])
pos_weights: <class 'torch.Tensor'> len=6
tensor([16.3285,  4.3661,  0.4460,  0.0966,  0.5855, 12.4266])
normalized_pos_weights_minmax: <class 'torch.Tensor'> 6 tensor([1.0000, 0.2630, 0.0215, 0.0000, 0.0301, 0.7596])
normalized_pos_weights_zscore: <class 'torch.Tensor'> 6 tensor([ 1.5167, -0.1917, -0.7515, -0.8014, -0.7316,  0.9595])
normalized_pos_weights_sum1: <class 'torch.Tensor'> 6 tensor([0.4768, 0.1275, 0.0130, 0.0028, 0.0171, 0.3628])


In [31]:
# CustomTrainer

class CustomTrainer(Trainer):

  # No print in compute_loss because out of memory because prints are batch per batch
  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):

    #print(f"inputs passed to compute_loss: {inputs.keys()}")
    input_ids             = inputs['input_ids']                        # shape: batch_size, sequence_length
    attention_mask        = inputs['attention_mask']                   # shape: batch_size, sequence_length
    global_attention_mask = inputs.get('global_attention_mask', None)  # shape: batch_size, sequence_length; optional as LongFormer specific
    labels                = inputs.get('labels', None)                 # shape: batch_size, num_labels; needed for loss computation, not required by the model

    #outputs = model(**inputs, global_attention_mask=global_attention_mask)  # Forward pass
    # Forward pass
    outputs = model(
        input_ids             = input_ids,
        attention_mask        = attention_mask,
        global_attention_mask = global_attention_mask,
        labels                = labels
    )
    #print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")
    logits = outputs.logits  # shape: (batch_size, num_labels)

    # If labels are provided, compute loss
    if labels is not None:
      loss = loss_fn(logits, labels)  # Compute weighted loss
      return (loss, outputs) if return_outputs else loss

    # If no labels, return outputs only, for evaluation or inference
    return outputs

In [32]:
"""
Trainer
Epoch	Tra Loss	Val Loss	F1	      Precision	Recall	  Roc Auc	  Pr R Auc	Accuracy
    1	1.310600	0.323275	0.825071	0.728187	0.951692	0.934036	0.902197	0.268333
    2	1.056400	0.296025	0.828955	0.732896	0.953993	0.945496	0.923083	0.300833
    3	0.983800	0.302103	0.855444	0.798009	0.921788	0.945643	0.919572	0.438333
    4	0.939200	0.293572	0.852538	0.786986	0.930003	0.948216	0.924455	0.422500
    5	0.826300	0.294880	0.850751	0.783246	0.930989	0.948209	0.924764	0.415833

CustomTrainer without normalization: BETTER THAN WITH THE BEST NORMALISATION
Trainer
Epoch	Tra Loss	Val Loss	F1	      Precisio	Recall	  Roc Auc	  Pr R Auc	Accuracy
    1	2.403400	0.567468	0.739746	0.594843	0.977982	0.831007	0.676924	0.161667
    2	1.872100	0.577790	0.796023	0.686517	0.947092	0.899120	0.807780	0.286667
    3	1.665800	0.537207	0.798294	0.703407	0.922774	0.891882	0.793428	0.340000
    4 1.636200	0.547590	0.805169	0.703140	0.941834	0.899867	0.806801	0.330833
    5	1.285800	0.603059	0.825648	0.735181	0.941505	0.911542	0.825745	0.360833

CustomTrainer with min-max scaling normalization
Epoch	Tra Loss	Val Loss	F1	      Precisio	Recall	  Roc Auc	  Pr R Auc	Accuracy
    1	0.398400	0.092601	0.112784	0.555233	0.062767	0.714287	0.580253	0.014167
    2	0.322000	0.090992	0.107781	0.684615	0.058495	0.814680	0.694242	0.014167
    3	0.296200	0.091699	0.157151	0.752113	0.087742	0.833960	0.718631	0.018333
    4	0.283400	0.088504	0.213101	0.737354	0.124548	0.798356	0.688813	0.023333
    5	0.242600	0.091010	0.220592	0.778884	0.128492	0.825896	0.719419	0.025000

CustomTrainer with z-score standardization normalization
Epoch	Train Loss	 Val Loss	F1	      Precisio	Recall	  Roc Auc	  Pr R Auc	Accuracy
    1	-12.231400	-3.137438	0.000000	0.000000	0.000000	0.128440	0.302990	0.000000
    2	-16.298600	-4.164513	0.000000	0.000000	0.000000	0.128387	0.308852	0.000000
    3	-19.402800	-4.852346	0.000000	0.000000	0.000000	0.127796	0.308158	0.000000
    4	-20.877700	-5.259121	0.000000	0.000000	0.000000	0.127423	0.308258	0.000000
    5	-21.728900	-5.394069	0.000000	0.000000	0.000000	0.127423	0.308258	0.000000

CustomTrainer with sum-to-one normalization
Epoch	Tra Loss	Val Loss	F1	      Precisio	Recall	  Roc Auc	  Pr R Auc	Accuracy
    1	0.251400	0.061206	0.065810	0.709459	0.034505	0.880490	0.782563	0.011667
    2	0.209100	0.056683	0.085581	0.758242	0.045350	0.892519	0.798075	0.013333
    3	0.186600	0.057045	0.119952	0.723636	0.065396	0.896226	0.801348	0.016667
    4	0.176900	0.054687	0.174015	0.780105	0.097930	0.905384	0.818976	0.022500
    5	0.153500	0.056971	0.251534	0.830571	0.148209	0.912967	0.834056	0.026667
"""

'\nTrainer\nEpoch\tTra Loss\tVal Loss\tF1\t      Precision\tRecall\t  Roc Auc\t  Pr R Auc\tAccuracy\n    1\t1.310600\t0.323275\t0.825071\t0.728187\t0.951692\t0.934036\t0.902197\t0.268333\n    2\t1.056400\t0.296025\t0.828955\t0.732896\t0.953993\t0.945496\t0.923083\t0.300833\n    3\t0.983800\t0.302103\t0.855444\t0.798009\t0.921788\t0.945643\t0.919572\t0.438333\n    4\t0.939200\t0.293572\t0.852538\t0.786986\t0.930003\t0.948216\t0.924455\t0.422500\n    5\t0.826300\t0.294880\t0.850751\t0.783246\t0.930989\t0.948209\t0.924764\t0.415833\n\nCustomTrainer without normalization: BETTER THAN WITH THE BEST NORMALISATION\nTrainer\nEpoch\tTra Loss\tVal Loss\tF1\t      Precisio\tRecall\t  Roc Auc\t  Pr R Auc\tAccuracy\n    1\t2.403400\t0.567468\t0.739746\t0.594843\t0.977982\t0.831007\t0.676924\t0.161667\n    2\t1.872100\t0.577790\t0.796023\t0.686517\t0.947092\t0.899120\t0.807780\t0.286667\n    3\t1.665800\t0.537207\t0.798294\t0.703407\t0.922774\t0.891882\t0.793428\t0.340000\n    4 1.636200\t0.547590\t

In [33]:
# Create a Hugging Face's transformers trainer (which abstracts the training loop)

#trainer = CustomTrainer(
trainer = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = encoded_dataset["train"],
    eval_dataset    = encoded_dataset["validation"],
    compute_metrics = compute_metrics  # Optional: custom metrics function
)


In [34]:
# Train

trainer_train = trainer.train()
print(f"trainer_train: {type(trainer_train)} len={len(trainer_train)}\n{trainer_train}")

file_path = "trainer_train.json"
with open(file_path, "w") as f:
  json.dump(trainer_train, f)
print(f"Train output successfully saved to {file_path}.")


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Roc Auc,Precision Recall Auc,Accuracy
1,1.3582,0.333919,0.821393,0.722666,0.951364,0.929272,0.893141,0.265
2,1.1458,0.315505,0.823898,0.727708,0.949392,0.937921,0.90862,0.330833
3,1.0392,0.309815,0.849555,0.785435,0.925074,0.941537,0.912774,0.426667
4,1.0133,0.30581,0.850436,0.784346,0.928689,0.943676,0.916891,0.42
5,0.8823,0.305897,0.848895,0.782428,0.927703,0.944374,0.917742,0.415833


trainer_train: <class 'transformers.trainer_utils.TrainOutput'> len=3
TrainOutput(global_step=1500, training_loss=1.1479406331380209, metrics={'train_runtime': 2426.7431, 'train_samples_per_second': 19.78, 'train_steps_per_second': 0.618, 'total_flos': 3.1529784508416e+16, 'train_loss': 1.1479406331380209, 'epoch': 5.0})
Train output successfully saved to trainer_train.json.


In [35]:
print("Training successfully completed.")

Training successfully completed.


## Upload model, tokenizer, train results, evaluate results

In [36]:
# Save model to /content

model_path = "model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.json',
 'model/merges.txt',
 'model/added_tokens.json',
 'model/tokenizer.json')

In [37]:
# Upload model and tokenizer to HF repo_id_model

tokenizer = LongformerTokenizerFast.from_pretrained(model_path)
model     = LongformerForSequenceClassification.from_pretrained(model_path)

tokenizer.push_to_hub(repo_id_model)
model.push_to_hub(repo_id_model)


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/595M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/claudelepere/skill_classification/commit/0c6cf37eaf196bc97af62b6f3d0711a02e496d5b', commit_message='Upload LongformerForSequenceClassification', commit_description='', oid='0c6cf37eaf196bc97af62b6f3d0711a02e496d5b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/claudelepere/skill_classification', endpoint='https://huggingface.co', repo_type='model', repo_id='claudelepere/skill_classification'), pr_revision=None, pr_num=None)

In [38]:
# Upload Train results to HF repo_id_dataset

# Train
upload_file(
    path_or_fileobj = 'trainer_train.json',
    path_in_repo    = 'trainer_train.json',
    repo_id         = HF_name,
    repo_type       = 'dataset'
)

CommitInfo(commit_url='https://huggingface.co/datasets/claudelepere/skill_classification/commit/86464ac70ce89594608540d7536d4ce494a1105a', commit_message='Upload trainer_train.json with huggingface_hub', commit_description='', oid='86464ac70ce89594608540d7536d4ce494a1105a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/claudelepere/skill_classification', endpoint='https://huggingface.co', repo_type='dataset', repo_id='claudelepere/skill_classification'), pr_revision=None, pr_num=None)

To Get Results of Evaluation and Test

In [39]:
def get_results(model, dataset, batch_size, threshold, phase):
  # Clear GPU cache
  torch.cuda.empty_cache()

  # Set the model to evaluation mode to disable dropout and other training-specific behaviors
  model.eval()

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  dataLoader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

  all_preds       = []
  all_probs       = []
  all_true_labels = []

  for batch in tqdm(dataLoader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
      outputs = model(**batch)
    logits = outputs.logits

    # Convert logits to probabilities and probabilities to predictions
    sigmoid = torch.nn.Sigmoid()
    probs   = sigmoid(logits).cpu().numpy()    # Convert to Numpy
    preds   = (probs > threshold).astype(int)  # Convert to binary Numpy array

    # Accumulate probabilities, predictions and labels
    all_probs.append(probs)
    all_preds.append(preds)
    all_true_labels.append(batch['labels'].cpu().numpy())

  # Concatenate results from all batches
  all_probs       = np.concatenate(all_probs, axis=0)        # shape: [num_samples, num_labels]
  all_preds       = np.concatenate(all_preds, axis=0)        # shape: [num_samples, num_labels]
  all_true_labels = np.concatenate(all_true_labels, axis=0)  # shape: [num_samples, num_labels]

  print(f"all_probs:       {type(all_probs)} {all_probs.shape}")
  print(f"all_preds:       {type(all_preds)} {all_preds.shape}")
  print(f"all_true_labels: {type(all_true_labels)} {all_true_labels.shape}")
  results_df = pd.DataFrame({
      'predictions'  : [list(pred)  for pred  in all_preds],
      'probabilities': [list(prob)  for prob  in all_probs],
      'true_labels'  : [list(label) for label in all_true_labels]
  })
  results_file_path = f"{phase}_results.csv"
  results_df.to_csv(results_file_path, index=False)

  # Classification report for precision, recall, F1 score
  report = classification_report(
      y_true        = all_true_labels,
      y_pred        = all_preds,
      target_names  = labels,
      zero_division = 0,
      output_dict   = True
  )
  print(f"Classification Report:\n{report}")

  # ROC AUC for multi-label classification
  roc_auc = roc_auc_score(
      y_true  = all_true_labels,
      y_score = all_probs,
      average = 'micro'
  )
  print(f"ROC AUC: {roc_auc}")

  metrics = {
      'classification_report': report,
      'roc_auc'              : roc_auc
  }
  metrics_file_path = f"{phase}_metrics.json"
  with open(metrics_file_path, "w") as f:
    json.dump(metrics, f, indent=4)

  print(f"{phase} Results Saved to {results_file_path}")
  print(f"{phase} Metrics Saved to {metrics_file_path}")

## Evaluate

After training, we evaluate our model on the validation set.

In [40]:
# First evaluate results

phase_evaluate = 'evaluate_model_eval'

get_results(
    model      = model,
    dataset    = validation_dataset,
    batch_size = batch_size,
    threshold  = threshold,
    phase      = phase_evaluate
)

  0%|          | 0/150 [00:00<?, ?it/s]

all_probs:       <class 'numpy.ndarray'> (1200, 6)
all_preds:       <class 'numpy.ndarray'> (1200, 6)
all_true_labels: <class 'numpy.ndarray'> (1200, 6)
Classification Report:
{'135': {'precision': 0.5694444444444444, 'recall': 0.6212121212121212, 'f1-score': 0.5942028985507246, 'support': 66.0}, '136': {'precision': 0.6162790697674418, 'recall': 0.673728813559322, 'f1-score': 0.6437246963562753, 'support': 236.0}, '137': {'precision': 0.8364030335861322, 'recall': 0.9234449760765551, 'f1-score': 0.877771461057419, 'support': 836.0}, '138': {'precision': 0.9293103448275862, 'recall': 0.9953831948291783, 'f1-score': 0.96121266161391, 'support': 1083.0}, '139': {'precision': 0.6657559198542805, 'recall': 0.9865047233468286, 'f1-score': 0.7949972811310495, 'support': 741.0}, '390': {'precision': 0.4891304347826087, 'recall': 0.5555555555555556, 'f1-score': 0.5202312138728323, 'support': 81.0}, 'micro avg': {'precision': 0.7843463780183181, 'recall': 0.9286887939533355, 'f1-score': 0.85043

In [41]:
print("First evaluation successfully completed.")

First evaluation successfully completed.


In [42]:
# Second evaluate results

trainer_evaluate = trainer.evaluate()
print(f"trainer_evaluate: {type(trainer_evaluate)} len={len(trainer_evaluate)}\n{trainer_evaluate}")

file_path = "trainer_evaluate.json"
with open(file_path, "w") as f:
  json.dump(trainer_evaluate, f)
print(f"Evaluate output successfully saved to {file_path}.")

trainer_evaluate: <class 'dict'> len=11
{'eval_loss': 0.3058101534843445, 'eval_f1': 0.8504363526933494, 'eval_precision': 0.7843463780183181, 'eval_recall': 0.9286887939533355, 'eval_roc_auc': 0.943675808322235, 'eval_precision_recall_auc': 0.9168913102427908, 'eval_accuracy': 0.42, 'eval_runtime': 18.1579, 'eval_samples_per_second': 66.087, 'eval_steps_per_second': 8.261, 'epoch': 5.0}
Evaluate output successfully saved to trainer_evaluate.json.


In [43]:
print("Second evaluation successfully completed.")

Second evaluation successfully completed.


In [44]:
# Upload Evaluate Results to HF repo_id_dataset

upload_file(
    path_or_fileobj = f"{phase_evaluate}_results.csv",
    path_in_repo    = f"{phase_evaluate}_results.csv",
    repo_id         = HF_name,
    repo_type       = 'dataset'
)
upload_file(
    path_or_fileobj = f"{phase_evaluate}_metrics.json",
    path_in_repo    = f"{phase_evaluate}_metrics.json",
    repo_id         = HF_name,
    repo_type       = 'dataset'
)
upload_file(
    path_or_fileobj = 'trainer_evaluate.json',
    path_in_repo    = 'trainer_evaluate.json',
    repo_id         = HF_name,
    repo_type       = 'dataset'
)


CommitInfo(commit_url='https://huggingface.co/datasets/claudelepere/skill_classification/commit/cb4efdbebb03f24ebe552ca8075f18f0e8c00a10', commit_message='Upload trainer_evaluate.json with huggingface_hub', commit_description='', oid='cb4efdbebb03f24ebe552ca8075f18f0e8c00a10', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/claudelepere/skill_classification', endpoint='https://huggingface.co', repo_type='dataset', repo_id='claudelepere/skill_classification'), pr_revision=None, pr_num=None)

## Test

In [45]:
# First test results

phase_test = 'test_model_eval'

get_results(
    model      = model,
    dataset    = test_dataset,
    batch_size = batch_size,
    threshold  = threshold,
    phase      = phase_test
)

  0%|          | 0/150 [00:00<?, ?it/s]

all_probs:       <class 'numpy.ndarray'> (1200, 6)
all_preds:       <class 'numpy.ndarray'> (1200, 6)
all_true_labels: <class 'numpy.ndarray'> (1200, 6)
Classification Report:
{'135': {'precision': 0.7093023255813954, 'recall': 0.7530864197530864, 'f1-score': 0.7305389221556886, 'support': 81.0}, '136': {'precision': 0.5842696629213483, 'recall': 0.6782608695652174, 'f1-score': 0.6277665995975855, 'support': 230.0}, '137': {'precision': 0.825668449197861, 'recall': 0.947239263803681, 'f1-score': 0.8822857142857143, 'support': 815.0}, '138': {'precision': 0.9225413402959095, 'recall': 0.9943714821763602, 'f1-score': 0.9571106094808126, 'support': 1066.0}, '139': {'precision': 0.6910798122065728, 'recall': 0.983957219251337, 'f1-score': 0.8119139547710976, 'support': 748.0}, '390': {'precision': 0.6371681415929203, 'recall': 0.6792452830188679, 'f1-score': 0.6575342465753424, 'support': 106.0}, 'micro avg': {'precision': 0.7903181189488243, 'recall': 0.937951411687459, 'f1-score': 0.8578

In [46]:
print("First test successfully completed.")

First test successfully completed.


In [47]:
# Second test results

trainer_predict = trainer.predict(test_dataset)
print(f"trainer_predict: {type(trainer_predict)} len={len(trainer_predict)}\n{trainer_predict}")
print(f"trainer_predict.predictions: {type(trainer_predict.predictions)} shape={trainer_predict.predictions.shape}")  # Model logits
print(f"trainer_predict.label_ids: {type(trainer_predict.label_ids)} shape={trainer_predict.label_ids.shape}")        # Ground truth labels
print(f"trainer_predict.metrics: {type(trainer_predict.metrics)} len={len(trainer_predict.metrics)}")

trainer_predict_json_serializable = {
    'predictions': trainer_predict.predictions.tolist(),  # Convert Numpy array to list
    'label_ids'  : trainer_predict.label_ids.tolist(),    # Convert Numpy array to list
    'metrics'    : trainer_predict.metrics                # Dictionary is already serializable
}

file_path = "trainer_predict.json"
with open(file_path, "w") as f:
  json.dump(trainer_predict_json_serializable, f)
print(f"Test output successfully saved to {file_path}.")

trainer_predict: <class 'transformers.trainer_utils.PredictionOutput'> len=3
PredictionOutput(predictions=array([[-5.1640625e+00, -2.0937500e+00,  2.1308594e+00,  4.0546875e+00,
         5.6982422e-01, -4.4375000e+00],
       [ 9.8339844e-01,  3.3671875e+00,  2.0156250e+00, -1.2275391e+00,
        -3.3945312e+00,  1.6269531e+00],
       [-5.1835938e+00, -1.8720703e+00,  2.1875000e+00,  4.1289062e+00,
         1.0751953e+00, -4.4101562e+00],
       ...,
       [-4.7812500e+00, -4.2812500e+00, -1.5234375e+00,  3.3359375e+00,
         2.8691406e+00, -4.8867188e+00],
       [-4.7812500e+00, -3.9433594e+00, -1.6210938e+00,  2.9648438e+00,
         2.6933594e+00, -4.7656250e+00],
       [-4.8203125e+00, -3.3496094e+00,  2.9628906e+00,  3.6347656e+00,
        -2.4299622e-03, -4.3867188e+00]], dtype=float32), label_ids=array([[0., 1., 1., 1., 0., 0.],
       [1., 1., 1., 0., 0., 1.],
       [0., 1., 1., 1., 1., 0.],
       ...,
       [0., 0., 1., 1., 1., 0.],
       [0., 0., 0., 1., 1., 0.],


In [48]:
print("Second test successfully completed.")

Second test successfully completed.


In [49]:
# Upload Test Results to HF repo_id_dataset

upload_file(
    path_or_fileobj = f"{phase_test}_results.csv",
    path_in_repo    = f"{phase_test}_results.csv",
    repo_id         = HF_name,
    repo_type       = 'dataset'
)
upload_file(
    path_or_fileobj = f"{phase_test}_metrics.json",
    path_in_repo    = f"{phase_test}_metrics.json",
    repo_id         = HF_name,
    repo_type       = 'dataset'
)
upload_file(
    path_or_fileobj = 'trainer_predict.json',
    path_in_repo    = 'trainer_predict.json',
    repo_id         = HF_name,
    repo_type       = 'dataset'
)

CommitInfo(commit_url='https://huggingface.co/datasets/claudelepere/skill_classification/commit/4f822d5beaad7f86c1c2e6094523803bf2e43200', commit_message='Upload trainer_predict.json with huggingface_hub', commit_description='', oid='4f822d5beaad7f86c1c2e6094523803bf2e43200', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/claudelepere/skill_classification', endpoint='https://huggingface.co', repo_type='dataset', repo_id='claudelepere/skill_classification'), pr_revision=None, pr_num=None)

In [50]:
print("It's the end")

It's the end
