<a href="https://colab.research.google.com/github/claudelepere/ML_GitHub/blob/main/jobs_EN_11_0_24000_thresholds_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q accelerate
!pip install -q huggingface_hub
!pip install -q scikit-learn
!pip install -q transformers datasets  # transformers and datasets are Hugging Face libraries
!pip install -q wandb

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m116.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import datetime
import json
import numpy as np
import os
import pandas as pd
import pickle
import sys
import time
import torch
import torch.nn.functional as F
import wandb

from datasets              import DatasetDict
from google.colab          import auth, drive, files, userdata
from huggingface_hub       import create_branch, create_repo, HfApi, login, upload_file, hf_hub_download, whoami
from huggingface_hub.utils import RepositoryNotFoundError
from sklearn.metrics       import accuracy_score, average_precision_score, classification_report, f1_score, precision_score, precision_recall_curve, precision_recall_fscore_support, recall_score, roc_auc_score
from torch.utils.data      import DataLoader
from tqdm.auto             import tqdm
from transformers          import AdamW, EvalPrediction, LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments
from torch.nn              import BCEWithLogitsLoss, Module

## Google Cloud authentication

In [3]:
auth.authenticate_user()  # user = c.lepere@ictjob.be

## Get skills and jobs

In [4]:
skills         = 11
all_rows_low   = 0
all_rows_high  = 24000
num_datapoints = all_rows_high - all_rows_low

datasetDict_zip_file_name = f"dataset_EN_{skills}_{all_rows_low}_{all_rows_high}.zip"
datasetDict_dir_name      = os.path.splitext(datasetDict_zip_file_name)[0]

print(f"datasetDict_zip_file_name: {datasetDict_zip_file_name}")
print(f"datasetDict_dir_name     : {datasetDict_dir_name}")
print()

datasetDict_zip_file_name: dataset_EN_11_0_24000.zip
datasetDict_dir_name     : dataset_EN_11_0_24000



## Upload to HF Hub?

In [5]:
upload_to_HF = True

## Hugging Face Hub (HF Hub) authenticate

In [6]:
if upload_to_HF:

  os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")    # Store the key in os.environ
  hf_token               = os.environ.get('HF_TOKEN')

  login(token=hf_token)

  # Check
  user = whoami(token=hf_token)
  assert user['name'] == 'claudelepere', f"{user['name']} is not claudelepere"
  print(f"user: {user}")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


user: {'type': 'user', 'id': '66ec3d5f61228b02f8780beb', 'name': 'claudelepere', 'fullname': 'Claude Lepère', 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/66ec3d5f61228b02f8780beb/gvnf9pvm2KvE90ETMUQo3.jpeg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'jobs_token', 'role': 'fineGrained', 'createdAt': '2025-01-04T17:44:35.493Z', 'fineGrained': {'canReadGatedRepos': False, 'global': [], 'scoped': [{'entity': {'_id': '66ec3d5f61228b02f8780beb', 'type': 'user', 'name': 'claudelepere'}, 'permissions': ['repo.content.read', 'repo.write']}]}}}}


## repo_id, branch, model and dataset repos on HF Hub
**1 repo = 1 model and 1 tokenizer**

**branch = revision**

In [7]:
if upload_to_HF:

  repo_id   = 'claudelepere/jobs_EN_11_0_24000_tuned_thresholds'
  timestamp = f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"

  model_repoUrl   = create_repo(repo_id=repo_id, repo_type="model",   private=True, exist_ok=True)
  dataset_repoUrl = create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)

  #create_branch(repo_id=repo_id, repo_type="model",   branch=branch, exist_ok=True)
  #create_branch(repo_id=repo_id, repo_type="dataset", branch=branch, exist_ok=True)

  print(f"Model Repo Url: {model_repoUrl} created successfully as a private repo")
  print(f"Dataset Repo Url: {dataset_repoUrl} created successfully as a private repo")

Model Repo Url: https://huggingface.co/claudelepere/jobs_EN_11_0_24000_tuned_thresholds created successfully as a private repo
Dataset Repo Url: https://huggingface.co/datasets/claudelepere/jobs_EN_11_0_24000_tuned_thresholds created successfully as a private repo


## HF model card
Model card here => README.md on the HF Hub.

In [8]:
model_card = """
---
tags:
- "24000"
---
# Model
Model fine-tuned on higly-imbalanced multilabel classification.

## Model details
- Language: English
- Task: Multilabel classification
- Architecture: Longformer
- Pretrained model: [allenai/longformer-base-4096](https://huggingface.co/allenai/longformer-base-4096)
- Framework: Pytorch
- Version 1.0.0

## Training Data
- skills: 11
- 24000 job datapoints

## Fine-tuning parameters
- batch size: 8
- gradient accumulation: 4
- fp16 precision
- input tokens max length: 1024
- epochs: 5
- learning rate: 1e-5
- tuned thresholds
- attention window size: 1024
"""

## Save locally and upload model card to HF Hub

In [9]:
if upload_to_HF:

  with open("model_card.md", "w") as f:
    f.write(model_card)

  upload_file(
      path_or_fileobj = 'model_card.md',  # path to a file or a file-like object
      path_in_repo    = 'README.md',
      repo_id         = repo_id
  )

In [10]:
"""
# Check the Python version
print(sys.version)
print()

# Get the installed packages (you can see that conda is not installed (do not install it))
!pip list
print()

# Check system information
!cat /etc/os-release
!uname -m
print()

# Check the GPU details (only if the runtime type is T4 GPU)
#!nvidia-smi
#print()

# Check RAM
!free -h
print()

# Check disk space
!df -h
print()

# Get environment variables
for key, value in os.environ.items():
    print(f"{key}: {value}")
"""
!python -V

Python 3.11.11


In [11]:
print(f"currentdir: {os.getcwd()}")

currentdir: /content


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

device: cuda


## Out Of Memory (OOM)

### OOM: reduce batch size
      small sizes (1 to 32):            PROs: better generalization in some cases
                                        CONs: may produce noisier gradients
      large sizes (128, 256, or higer): PROs: gradients are smoother, leading to more stable training
                                        CONs: poorer generalization (overfitting) in some cases
      intermediate sizes (32, 64):      combines the benefits of small and large sizes

In [13]:
batch_size = 8

### OOM: enable gradient accumulation

* compensate for smaller batch sizes by accumulating gradients over several steps
* **effective batch size** = per-device batch size x gradient acumulation steps
* in each iteration, the model computes the gradients, these gradients are immediately used to update the model parameters

WARNING: gradient_accumulation_steps may not be None => comment it in TrainingArguments

In [14]:
gradient_accumulation_steps = 4

### OOM: use PYTORCH_CUDA_ALLOC_CONF to handle memory fragmentation

In [15]:
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

### OOM: check for and kill zombie processes

In [16]:
!ps aux | grep python
!kill -9 <PID>
if torch.cuda.is_available():
  !nvidia-smi
  print(torch.cuda.memory_summary())

root          84  0.8  0.0      0     0 ?        Z    15:00   0:12 [python3] <defunct>
root          85  0.0  0.0  63648 50780 ?        S    15:00   0:00 python3 /usr/local/bin/colab-file
root         134  0.2  0.1 775752 122672 ?       Sl   15:00   0:04 /usr/bin/python3 /usr/local/bin/j
root        5417  7.4  1.5 12074156 1316184 ?    Ssl  15:21   0:20 /usr/bin/python3 -m colab_kernel_
root        5472  0.2  0.0 545964 21184 ?        Sl   15:21   0:00 /usr/bin/python3 /usr/local/lib/p
root        6798  0.0  0.0   7376  3444 ?        S    15:26   0:00 /bin/bash -c ps aux | grep python
root        6800  0.0  0.0   6484  2292 ?        S    15:26   0:00 grep python
/bin/bash: -c: line 1: syntax error near unexpected token `newline'
/bin/bash: -c: line 1: `kill -9 <PID>'
Wed Feb 12 15:26:14 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----

### OOM: use fp16 (half precision) mixed precision training
reduces memory requirements by up to 50%

In [17]:
fp = 'fp16'

### OOM: limit the number of GPU workers:
* 0 (default) or 1
* in Colab dataloader_num_workers = 1

### OOM: reduce model size or input tokens
* LongformerTokenizer.from_pretrained('allenai/longformer-base/large-4096'): large/base: 435M/149M parameters
* max_length: 4096 max for Longformer
* a single word can be equal to several tokens; stop words are **NOT discarded**!
* word_text_length_counts_sorted:
      jobs count                 : 50000
      jobs count under  512 words: 44794  89.59%
      jobs count under  640 words: 47894  95.79%
      jobs count under  768 words: 49123  98.25%
      jobs count under  896 words: 49691  99.38%
      jobs count under 1024 words: 49917  99.83%
      jobs count under 2048 words: 50000 100.00%
      jobs count under 4096 words: 50000 100.00%

In [18]:
#max_length =  768    #      37 min    #
max_length = 1024    #      38 min    # GPU RAM: 12.2 / 40 GB
#max_length = 2048    # 1 hr 10 min    # GPU RAM: 21.4 / 40 GB
#max_length = 4096    # 2 hr 10 min    # GPU RAM: 39.5 / 40 GB => OutOfMemoryError

### OOM: free up GPU memory

In [19]:
torch.cuda.empty_cache()

### OOM: reduce the number of transformers layers

In [20]:
# hidden_layers = 6  # default:12

## epoch
* 1 epoch is a complete pass through the entire training dataset
* with n datapoints and batch size = b, n/b iterations to complete 1 epoch
* 1 iteration is a single update of the model's parameters

In [21]:
epochs = 5

## learning rate
* A common rule is to scale the learning rate proportionaly with the effective batch size
* **note: get_linear_schedule_with_warmup**

In [22]:
learning_rate = 1e-5  # 1e-5 x 32/8

## threshold
default: 0.5

In [23]:
threshold = 0.5

## attention window size

In [24]:
attention_window = 1024 #512

## Upload and unzip job dataset

In [26]:
def upload_unzip_dataset(file_name=datasetDict_zip_file_name):
  # Check if the file exists
  if not os.path.exists(file_name):
    print(f"'{file_name}' not found in /content. Uploading...")
    uploaded_files = files.upload()                              # Prompt file upload dialog
    if file_name not in uploaded_files:
      raise FileNotFoundError(f"'{file_name}' was not uploaded. Please try again.")
    print(f"'{file_name}' successfully uploaded to /content")
    uploaded_file_name = list(uploaded_files.keys())[0]          # Get the name of the uploaded file

    !unzip {uploaded_file_name}

    unzipped_dir_name = os.path.splitext(uploaded_file_name)[0]
    assert unzipped_dir_name==datasetDict_dir_name, "unzipped_dir_name != datasetDict_dir_name"
  else:
    print(f"'{file_name}' already exists in /content.")

upload_unzip_dataset(datasetDict_zip_file_name)

'dataset_EN_11_0_24000.zip' not found in /content. Uploading...


Saving dataset_EN_11_0_24000.zip to dataset_EN_11_0_24000.zip
'dataset_EN_11_0_24000.zip' successfully uploaded to /content
Archive:  dataset_EN_11_0_24000.zip
  inflating: dataset_EN_11_0_24000/dataset_dict.json  
  inflating: dataset_EN_11_0_24000/test/data-00000-of-00001.arrow  
  inflating: dataset_EN_11_0_24000/test/dataset_info.json  
  inflating: dataset_EN_11_0_24000/test/state.json  
  inflating: dataset_EN_11_0_24000/train/data-00000-of-00001.arrow  
  inflating: dataset_EN_11_0_24000/train/dataset_info.json  
  inflating: dataset_EN_11_0_24000/train/state.json  
  inflating: dataset_EN_11_0_24000/validation/data-00000-of-00001.arrow  
  inflating: dataset_EN_11_0_24000/validation/dataset_info.json  
  inflating: dataset_EN_11_0_24000/validation/state.json  


## W&B initialization (not used now)

In [27]:
run_name = f"EN_{skills}_{all_rows_low}_{all_rows_high}_ml{max_length}_ep{epochs}_lr{learning_rate}_th{threshold}_at{attention_window}_{fp}"

if 'gradient_accumulation_steps' not in globals():
  run_name = f"{run_name}_ba{batch_size}"
else:
  run_name = f"{run_name}_ba{batch_size}x{gradient_accumulation_steps}"

print(f"run_name: {run_name}")

run_name: EN_11_0_24000_ml1024_ep5_lr1e-05_th0.5_at1024_fp16_ba8x4


In [28]:
os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")        # Store the key in os.environ
wandb_api_key               = os.environ.get('WANDB_API_KEY')
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mclaudelepere[0m ([33mclaudelepere-c-cile-cy[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [29]:
try:
  wandb.init(
      project = "skill_classification",
      name    = run_name,
      entity  = "claudelepere-c-cile-cy",
      config  = {
          "learning_rate": learning_rate,
          "epochs"       : epochs,
          "batch_size"   : batch_size
      }
  )
except wandb.errors.CommError as err:
  print(f"CommError: {err}")
except Exception as exc:
  print(f"Exception: {exc}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


## Create datasetDict (HF DatasetDict) = 3 HF Dataset, train, validation and test

In [30]:
datasetDict = DatasetDict.load_from_disk(datasetDict_dir_name)

In [31]:
print(f"datasetDict: {type(datasetDict)} {datasetDict.shape}\n{datasetDict}")
print(f"datasetDict.keys(): {datasetDict.keys()}")
print(f"datasetDict['train']:      {type(datasetDict['train'])}      {datasetDict['train'].shape}")
print(f"datasetDict['validation']: {type(datasetDict['validation'])} {datasetDict['validation'].shape}")
print(f"datasetDict['test']:       {type(datasetDict['test'])}       {datasetDict['test'].shape}")

datasetDict: <class 'datasets.dataset_dict.DatasetDict'> {'train': (19200, 8), 'validation': (2400, 8), 'test': (2400, 8)}
DatasetDict({
    train: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 19200
    })
    validation: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 2400
    })
})
datasetDict.keys(): dict_keys(['train', 'validation', 'test'])
datasetDict['train']:      <class 'datasets.arrow_dataset.Dataset'>      (19200, 8)
datasetDict['validation']: <class 'datasets.arrow_dataset.Dataset'> (2400, 8)
datasetDict['test']:       <class 'datasets.arrow_dataset.Dataset'>       (2400, 8)


In [32]:
example = datasetDict['train'][0]
print(f"datasetDict['train'][0]: {type(example)} {example.keys()}\n{example}")

datasetDict['train'][0]: <class 'dict'> dict_keys(['id', 'text', '390', '135', '136', '137', '138', '139'])
{'id': 196243, 'text': "Vivid Resourcing - Software Engineer   React.JS Node.js Vue.js Nest.JS   Vivid Resourcing   I'm partnered with a startup based in Brussels who are looking for an experienced Software Engineer to strengthen it's development team. The client created an AI-enabled Intelligence Platform for business enterprises. This platform analyses significant competitors, industry trends, market dynamics, new technologies, and business ecosystem evolutions to ensure that companies remain constantly up date. You'll be responsible for managing the interchange of data between the server and the users. Your key tasks will be developing the server-side logic, defining and maintaining the core database, and guaranteeing front-end performance and responsiveness. You'll work closely with other teams such as Product Managers and Data Engineers. Your profile At least 3+ Years of exp

## Create labels (list), id2label (dict) and label2id (dict).
* dataset 7_1000_125_125  ,  48 labels
* dataset 7_128_18_54     ,  42 labels
* dataset 8910_1087_68_204, 206 labels
* dataset 11_1000         ,   6 labels

In [33]:
labels = [label for label in datasetDict['train'].features.keys() if label not in ['id', 'text']]
labels.sort()
print(f"labels: {type(labels)} {len(labels)}\n{labels}")

num_labels = len(labels)

id2label = {idx: label for idx, label in enumerate(labels)}
print(f"id2label: {type(id2label)} {len(id2label)}\n{id2label}")

label2id = {label: idx for idx, label in enumerate(labels)}
print(f"label2id: {type(label2id)} {len(label2id)}\n{label2id}")

labels: <class 'list'> 6
['135', '136', '137', '138', '139', '390']
id2label: <class 'dict'> 6
{0: '135', 1: '136', 2: '137', 3: '138', 4: '139', 5: '390'}
label2id: <class 'dict'> 6
{'135': 0, '136': 1, '137': 2, '138': 3, '139': 4, '390': 5}


## Load the pretrained tokenizer and the model

In [34]:
model_name = "allenai/longformer-base-4096"

In [35]:
tokenizer = LongformerTokenizerFast.from_pretrained(model_name)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [36]:
model = LongformerForSequenceClassification.from_pretrained(
    model_name,
    num_labels   = num_labels,
    id2label     = id2label,
    label2id     = label2id,
    problem_type = 'multi_label_classification'
)

# Configure attention window size
model.config.attention_window = attention_window

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
optimizer = AdamW(model.parameters(), lr=learning_rate)



## Tokenize ('input_ids' and 'attention_mask'), add 'global_attention_mask' (for Longformer), add 'labels'

In [38]:
def preprocess_data(examples, indices):
  # Step 1: Extract text and tokenize
  text = examples['text']             # Batch of texts
  encoding = tokenizer(
      text,                           # Tokenize text
      truncation     = True,
      padding        = 'max_length',
      max_length     = max_length,
      return_tensors = 'pt'           # Return PyTorch tensors
  )

  # Step 2: Create and add the global attention mask
  global_attention_mask             = torch.zeros_like(encoding['input_ids'])  # Initialize global attention mask with zeros (same shape as input_ids)
  global_attention_mask[:, 0]       = 1                                        # Set global attention on the first token ([CLS], token ID=0) in each sequence
  encoding['global_attention_mask'] = global_attention_mask                    # Add the global_attention_mask to the batch

  # Step 3: Create and populate the label matrix
  labels_matrix = torch.zeros((len(text), len(labels)), dtype=torch.float32)   # Create an empty label matrix
  #print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")
  #---------Populate label matrix
  for idx, label in enumerate(labels):
    #print(f"idx:{idx} label:{label}")
    if label in examples:
      labels_matrix[:, idx] = torch.tensor(
          [1.0 if val else 0.0 for val in examples[label]],
          dtype=torch.float32
          )
  print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")

  encoding['labels'] = labels_matrix                                           # Add labels to the encoding
  print(f"encoding['labels']: {type(encoding['labels'])} {encoding['labels'].shape}")

  # encoding: <class 'transformers.tokenization_utils_base.BatchEncoding'> dict_keys(['input_ids', 'attention_mask', 'global_attention_mask', 'labels'])
  #   'input_ids': tensor([[
  #   'attention_mask': tensor([[
  #   'global_attention_mask': tensor([[
  #   'labels': tensor([[
  #print(f"1 preprocess_data call: encoding: {type(encoding)} {encoding.keys()}")

  return encoding

## Create encoded_dataset (datasets.dataset_dict.DatasetDict) = 3 encoded datasets.arrow_dataset.Dataset, train, validation and test

In [39]:
encoded_dataset = datasetDict.map(
    preprocess_data,
    batched        = True,
    remove_columns = datasetDict['train'].column_names,
    with_indices   = True
)

print(f"Zencoded_dataset: {type(encoded_dataset)} shape={encoded_dataset.shape}")

Map:   0%|          | 0/19200 [00:00<?, ? examples/s]

labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])


model.safetensors:   0%|          | 0.00/597M [00:00<?, ?B/s]

labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([400, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([400, 6])


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([400, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([400, 6])
Zencoded_dataset: <class 'datasets.dataset_dict.DatasetDict'> shape={'train': (19200, 4), 'validation': (2400, 4), 'test': (2400, 4)}


In [40]:
encoded_dataset.set_format('torch')
train_dataset      = encoded_dataset['train']
validation_dataset = encoded_dataset['validation']
test_dataset       = encoded_dataset['test']

print(f"train_dataset_tensor:                          {type(train_dataset)}                              {train_dataset.shape} {train_dataset.features}\n{train_dataset}")
print(f"train_dataset_tensor['input_ids']:             {type(train_dataset['input_ids'])}             len={len(train_dataset['input_ids'])}             shape={train_dataset['input_ids'].shape}            ") #\n{train_dataset['input_ids']}")
print(f"train_dataset_tensor['attention_mask']:        {type(train_dataset['attention_mask'])}        len={len(train_dataset['attention_mask'])}        shape={train_dataset['attention_mask'].shape}       ") #\n{train_dataset['attention_mask']}")
print(f"train_dataset_tensor['global_attention_mask']: {type(train_dataset['global_attention_mask'])} len={len(train_dataset['global_attention_mask'])} shape={train_dataset['global_attention_mask'].shape}") #\n{train_dataset['global_attention_mask']}")
print(f"train_dataset_tensor['labels']:                {type(train_dataset['labels'])}                len={len(train_dataset['labels'])}                shape={train_dataset['labels'].shape}               ") #\n{train_dataset['labels']}")

train_dataset_tensor:                          <class 'datasets.arrow_dataset.Dataset'>                              (19200, 4) {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'global_attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)}
Dataset({
    features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
    num_rows: 19200
})
train_dataset_tensor['input_ids']:             <class 'torch.Tensor'>             len=19200             shape=torch.Size([19200, 1024])            
train_dataset_tensor['attention_mask']:        <class 'torch.Tensor'>        len=19200        shape=torch.Size([19200, 1024])       
train_dataset_tensor['global_attention_mask']: <class 'torch.Tensor'> len=19200 shape=torch.Size([19200, 1024])
train_dataset_tensor

## Truncated part

In [41]:
def get_truncated_part(text):
  tokens = tokenizer(
      text,
      truncation                = True,
      padding                   = 'max_length',
      max_length                = max_length,
      return_overflowing_tokens = True,
      return_tensors            = None
  )
  print(f"tokens.keys(): {tokens.keys()}")

  # Get the truncated tokens
  truncated_ids = tokens["input_ids"][0]
  print(f"truncated_ids: {type(truncated_ids)} {truncated_ids}")
  #overflow_ids  = tokens["overflow_to_sample_mapping"][0]
  #print(f"overflow_ids: {type(overflow_ids)} {overflow_ids}")

  # Decode the tokens back to text
  truncated_text = tokenizer.decode(truncated_ids, skip_special_tokens=True)
  #overflow_text  = tokenizer.decode(overflow_ids, skip_special_tokens=True)

  print(f"original_text :\n{text}")
  print(f"truncated_text:\n{truncated_text}")
  #print(f"overflow_text:\n{overflow_text}")

  original_tokens  = tokenizer.tokenize(text)
  truncated_tokens = tokenizer.tokenize(truncated_text)
  #overflow_tokens  = tokenizer.tokenize(overflow_text)

  print(f"original_tokens count : {len(original_tokens)}")
  print(f"truncated_tokens count: {len(truncated_tokens)}")
  #print(f"overflow_tokens count: {len(overflow_tokens)}")

In [42]:
example_text = datasetDict['train'][0]['text']
#get_truncated_part(example_text)

In [43]:
inputs = tokenizer(
    example_text,
    truncation     = True,
    padding        = 'max_length',
    max_length     = max_length,
    return_tensors = 'pt'
)

## Forward pass for multi-label classification

In [44]:
outputs = model(
    input_ids      = inputs.input_ids,
    attention_mask = inputs.attention_mask
)

Initializing global attention on CLS token...


In [45]:
print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

outputs: <class 'transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput'> odict_keys(['logits'])
LongformerSequenceClassifierOutput(loss=None, logits=tensor([[-0.0985, -0.1789, -0.2517, -0.0088,  0.0251, -0.0361]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, global_attentions=None)


In [46]:
# Logits (= raw model outputs)
logits = outputs.logits
print(f"logits: {type(logits)} {logits.shape}\n{logits}")

logits: <class 'torch.Tensor'> torch.Size([1, 6])
tensor([[-0.0985, -0.1789, -0.2517, -0.0088,  0.0251, -0.0361]],
       grad_fn=<AddmmBackward0>)


In [47]:
# Convert logits to probabilities
sigmoid = torch.nn.Sigmoid()
probs   = sigmoid(logits)
print(f"probs: {type(probs)} {probs.shape}\n{probs}")

probs: <class 'torch.Tensor'> torch.Size([1, 6])
tensor([[0.4754, 0.4554, 0.4374, 0.4978, 0.5063, 0.4910]],
       grad_fn=<SigmoidBackward0>)


In [48]:
example = encoded_dataset['train'][0]

In [49]:
print(f"example: {type(example)} {example.keys()}\n{example}")
print()
#print(f"example['input_ids']: {type(example['input_ids'])} {len(example['input_ids'])}\n{example['input_ids']}")
#print(f"example['attention_mask']: {type(example['attention_mask'])} {len(example['attention_mask'])}\n{example['attention_mask']}")
#print(f"example['labels']:  {type(example['labels'])} {len(example['labels'])}\n{example['labels']}")

example: <class 'dict'> dict_keys(['input_ids', 'attention_mask', 'global_attention_mask', 'labels'])
{'input_ids': tensor([   0,  846, 6837,  ...,    1,    1,    1]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]), 'global_attention_mask': tensor([1, 0, 0,  ..., 0, 0, 0]), 'labels': tensor([0., 0., 1., 1., 1., 0.])}



In [50]:
tokenizer.decode(example['input_ids'])

"<s>Vivid Resourcing - Software Engineer   React.JS Node.js Vue.js Nest.JS   Vivid Resourcing   I'm partnered with a startup based in Brussels who are looking for an experienced Software Engineer to strengthen it's development team. The client created an AI-enabled Intelligence Platform for business enterprises. This platform analyses significant competitors, industry trends, market dynamics, new technologies, and business ecosystem evolutions to ensure that companies remain constantly up date. You'll be responsible for managing the interchange of data between the server and the users. Your key tasks will be developing the server-side logic, defining and maintaining the core database, and guaranteeing front-end performance and responsiveness. You'll work closely with other teams such as Product Managers and Data Engineers. Your profile At least 3+ Years of experience with JavaScript. Extensive experience developing frontend applications using React and related libraries. Experience wit

In [51]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['137', '138', '139']

## Set PyTorch format to ensures correctness and compatibility with PyTorch pipelines
The 3 Hugging Face Dataset are formatted as PyTorch Dataset.

In [52]:
encoded_dataset.set_format('torch')

## Workflow

- 3 steps: training, evaluation, prediction
- 3 datasets: train, validation, test
- 3 Trainer functions: train, evaluate, predict
---
* training uses train_dataset
* evaluation uses validation_dataset
* test uses test_dataset

## Training step


In [53]:
batch_size  = batch_size
metric_name = "f1"

### Metrics
  source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/

In [54]:
def multi_label_metrics(logits, true_labels):
  """
  logits => sigmoid => probabilities => predictions

  Args:
    logits     : raw, unnormalized scores outputted by the model  (numpy array of shape (batch_size, num_labels))
    true_labels: actual labels for the data                       (numpy array of shape (batch_size, num_labels))

  Returns:
    metrics: dictionary of scores
  """
  average = 'micro'    # 'micro' or 'weighted'

  sigmoid = torch.nn.Sigmoid()
  probs   = sigmoid(torch.Tensor(logits))
  # next, use threshold to turn them into integer predictions
  preds = np.zeros(probs.shape)
  preds[np.where(probs > threshold)] = 1

  # compute metrics
  f1                   = f1_score               (y_true=true_labels, y_pred=preds,  average=average)    #, zero_division=1)
  precision            = precision_score        (y_true=true_labels, y_pred=preds,  average=average)    #, zero_division=1)
  recall               = recall_score           (y_true=true_labels, y_pred=preds,  average=average)    #, zero_division=1)
  roc_auc              = roc_auc_score          (y_true=true_labels, y_score=probs, average=average)
  precision_recall_auc = average_precision_score(y_true=true_labels, y_score=probs, average=average)
  accuracy             = accuracy_score         (y_true=true_labels, y_pred=preds)

  metrics = {

      'f1'                  : f1,
      'precision'           : precision,
      'recall'              : recall,
      'roc_auc'             : roc_auc,
      'precision_recall_auc': precision_recall_auc,
      'accuracy'            : accuracy
  }

  return metrics

In [55]:
def compute_metrics(p: EvalPrediction):
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  result = multi_label_metrics(logits=preds,true_labels=p.label_ids)   # true_labels=p.label_ids<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

  return result

In [56]:
print(f"input_ids:              {type(encoded_dataset['train']['input_ids'][0])}\t{encoded_dataset['train']['input_ids'][0].shape}")
print(f"attention_mask:         {type(encoded_dataset['train']['attention_mask'][0])}\t{encoded_dataset['train']['attention_mask'][0].shape}")
print(f"global_attention_mask:  {type(encoded_dataset['train']['global_attention_mask'][0])}\t{encoded_dataset['train']['global_attention_mask'][0].shape}")
print(f"labels:                 {type(encoded_dataset['train'][0]['labels'])}\t{encoded_dataset['train'][0]['labels'].shape}")

input_ids:              <class 'torch.Tensor'>	torch.Size([1024])
attention_mask:         <class 'torch.Tensor'>	torch.Size([1024])
global_attention_mask:  <class 'torch.Tensor'>	torch.Size([1024])
labels:                 <class 'torch.Tensor'>	torch.Size([6])


### Execute a forward pass for debugging or verification purposes (cf. BERT_3_1 in Notion BERT database)

In [57]:
outputs = model(
    input_ids      = encoded_dataset['train']['input_ids'][0].unsqueeze(0),
    attention_mask = encoded_dataset['train']['attention_mask'][0].unsqueeze(0),
    labels         = encoded_dataset['train'][0]['labels'].unsqueeze(0)
)

In [58]:
print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

outputs: <class 'transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput'> odict_keys(['loss', 'logits'])
LongformerSequenceClassifierOutput(loss=tensor(0.6889, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.0985, -0.1789, -0.2517, -0.0088,  0.0251, -0.0361]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, global_attentions=None)


### Weighted loss function

In [59]:
"""# Define the weighted loss function

class_weights = torch.tensor([7.68, 2.15, 0.61, 0.47, 0.68, 6.26], dtype=torch.float32).to(device)
loss_fn       = BCEWithLogitsLoss(pos_weight=class_weights)  # For multi-label classification (binary classification per label)

## Class supports, class weigths, weighted loss function

Reminder:
*   df_jobs      : <class 'pandas.core.frame.DataFrame'>
*   df_jobs['id']: <class 'pandas.core.series.Series'>

dataset = Dataset.from_pandas(df_jobs)
*   dataset      : <class 'datasets.arrow_dataset.Dataset'>
*   dataset['id']: <class 'list'>

*   dataset_dict_jobs : <class 'datasets.dataset_dict.DatasetDict'>
*   train_dataset     : <class 'datasets.arrow_dataset.Dataset'>
*   validation_dataset: <class 'datasets.arrow_dataset.Dataset'>
*   test_dataset      : <class 'datasets.arrow_dataset.Dataset'>


We calculate the class supports for the train, validation and test datasets; the class weights and the weighted loss function are used for training only; the class supports of validation_dataset and test_dataset are calculated for information only.

def get_train_class_weights(datasetDict, labels):
  print(f"datasetDict: {type(datasetDict)} shape={datasetDict.shape}\n{datasetDict}")
  print(f"labels: {type(labels)} len={len(labels)}\n{labels}")

  dataset_train      = datasetDict['train']
  dataset_validation = datasetDict['validation']
  dataset_test       = datasetDict['test']

  def calculate_class_supports(dataset, labels):
    class_supports = dataset.map(
        lambda example: {col: example[col] for col in labels},
        batched=True
    ).to_pandas()[labels].sum(axis=0)
    return class_supports

  class_supports = {}

  for split_name, split_dataset in datasetDict.items():
    class_supports[split_name] = calculate_class_supports(split_dataset, labels)

  for split_name, split_class_supports in class_supports.items():
    print(f"{split_name}: {type(split_class_supports)} len={len(split_class_supports)}\n{split_class_supports}")

  train_class_supports_list = class_supports['train'].tolist()
  print(f"train_class_supports_list: {type(train_class_supports_list)} len={len(train_class_supports_list)} {train_class_supports_list}")

  train_class_supports_tensor = torch.tensor(train_class_supports_list, dtype=torch.float32)
  print(f"train_class_supports_tensor: {type(train_class_supports_tensor)} len={len(train_class_supports_tensor)} {train_class_supports_tensor}")

  train_total_samples = dataset_train.num_rows
  print(f"train_total_samples: {train_total_samples}")

  number_of_classes = len(labels)
  print(f"number_of_classes: {number_of_classes}")

  train_class_weights = train_total_samples / (number_of_classes * train_class_supports_tensor)
  print(f"train_class_weights: {type(train_class_weights)} len={len(train_class_weights)} {train_class_weights}")

  train_class_weights_sum = train_class_weights.sum()
  print(f"train_class_weights_sum: {train_class_weights_sum}")

  normalized_train_class_weights = (train_class_weights / train_class_weights_sum) * number_of_classes
  print(f"normalized_train_class_weights: {type(normalized_train_class_weights)} len={len(normalized_train_class_weights)} {normalized_train_class_weights}")

  # Positives samples per label
  supports = train_class_supports_tensor
  print(f"supports: {type(supports)} {len(supports)} {supports}")

  # Negatives samples per label
  negatives = train_total_samples - supports
  print(f"negatives: {type(negatives)} {len(negatives)} {negatives}")

  # pos_weights = negative to positive ratios
  pos_weights = negatives/supports
  print(f"pos_weights: {type(pos_weights)} {len(pos_weights)} {pos_weights}")

  # Normalize using min-max scaling
  normalized_pos_weights_minmax = (pos_weights - pos_weights.min()) / (pos_weights.max() - pos_weights.min())
  print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

  # Normalize using z-score standardization
  normalized_pos_weights_zscore = (pos_weights - pos_weights.mean()) / pos_weights.std()
  print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

  # Normalize using min-max scaling
  normalized_pos_weights_minmax = (pos_weights - pos_weights.min()) / (pos_weights.max() - pos_weights.min())
  print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

  # Normalize using z-score standardization
  normalized_pos_weights_zscore = (pos_weights - pos_weights.mean()) / pos_weights.std()
  print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

  # Normalize using sum-to-one
  normalized_pos_weights_sum1 = pos_weights / pos_weights.sum()
  print(f"normalized_pos_weights_sum1: {type(normalized_pos_weights_sum1)} {len(normalized_pos_weights_sum1)} {normalized_pos_weights_sum1}")

  return normalized_pos_weights_minmax
  #return normalized_pos_weights_zscore
  #return normalized_pos_weights_sum1

pos_weights = get_train_class_weights(datasetDict, labels)

loss_fn = BCEWithLogitsLoss(pos_weight=pos_weights.to(device))  # For multi-label classification (binary classification per label)
print(f"loss_fn: {type(loss_fn)} {loss_fn}")
"""

'# Define the weighted loss function\n\nclass_weights = torch.tensor([7.68, 2.15, 0.61, 0.47, 0.68, 6.26], dtype=torch.float32).to(device)\nloss_fn       = BCEWithLogitsLoss(pos_weight=class_weights)  # For multi-label classification (binary classification per label)\n\n## Class supports, class weigths, weighted loss function\n\nReminder:\n*   df_jobs      : <class \'pandas.core.frame.DataFrame\'>\n*   df_jobs[\'id\']: <class \'pandas.core.series.Series\'>\n\ndataset = Dataset.from_pandas(df_jobs)\n*   dataset      : <class \'datasets.arrow_dataset.Dataset\'>\n*   dataset[\'id\']: <class \'list\'>\n\n*   dataset_dict_jobs : <class \'datasets.dataset_dict.DatasetDict\'>\n*   train_dataset     : <class \'datasets.arrow_dataset.Dataset\'>\n*   validation_dataset: <class \'datasets.arrow_dataset.Dataset\'>\n*   test_dataset      : <class \'datasets.arrow_dataset.Dataset\'>\n\n\nWe calculate the class supports for the train, validation and test datasets; the class weights and the weighted l

In [60]:
def get_class_weights(labels=encoded_dataset['train']['labels']):
  print(f"labels: {type(labels)} len={len(labels)} shape={labels.shape}\n{labels}")

  num_samples, num_labels = labels.shape
  print(f"num_samples: {type(num_samples)} {num_samples}")
  print(f"num_labels:  {type(num_labels)}  {num_labels}")

  class_counts = labels.sum(dim=0)
  print(f"class_counts: {type(class_counts)} len={len(class_counts)}\n{class_counts}")

  pos_weights = (num_samples-class_counts) / class_counts
  print(f"pos_weights: {type(pos_weights)} len={len(pos_weights)}\n{pos_weights}")

  normalized_pos_weights_minmax = (pos_weights - pos_weights.min()) / (pos_weights.max() - pos_weights.min())
  print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

  normalized_pos_weights_zscore = (pos_weights - pos_weights.mean()) / pos_weights.std()
  print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

  normalized_pos_weights_sum1 = pos_weights / pos_weights.sum()
  print(f"normalized_pos_weights_sum1: {type(normalized_pos_weights_sum1)} {len(normalized_pos_weights_sum1)} {normalized_pos_weights_sum1}")

  #return pos_weights
  #return normalized_pos_weights_minmax
  #return normalized_pos_weights_zscore
  return normalized_pos_weights_sum1

In [61]:
pos_weights = get_class_weights()

labels: <class 'torch.Tensor'> len=19200 shape=torch.Size([19200, 6])
tensor([[0., 0., 1., 1., 1., 0.],
        [0., 0., 0., 1., 1., 0.],
        [0., 1., 1., 1., 0., 0.],
        ...,
        [0., 0., 1., 1., 1., 0.],
        [0., 0., 1., 1., 1., 0.],
        [0., 0., 0., 1., 1., 0.]])
num_samples: <class 'int'> 19200
num_labels:  <class 'int'>  6
class_counts: <class 'torch.Tensor'> len=6
tensor([ 1153.,  4075., 13850., 17561., 13651.,  1560.])
pos_weights: <class 'torch.Tensor'> len=6
tensor([15.6522,  3.7117,  0.3863,  0.0933,  0.4065, 11.3077])
normalized_pos_weights_minmax: <class 'torch.Tensor'> 6 tensor([1.0000, 0.2326, 0.0188, 0.0000, 0.0201, 0.7208])
normalized_pos_weights_zscore: <class 'torch.Tensor'> 6 tensor([ 1.5633, -0.2328, -0.7331, -0.7771, -0.7300,  0.9098])
normalized_pos_weights_sum1: <class 'torch.Tensor'> 6 tensor([0.4960, 0.1176, 0.0122, 0.0030, 0.0129, 0.3583])


In [62]:
loss_fn = BCEWithLogitsLoss(pos_weight=pos_weights.to(device))

### FocalLoss

In [63]:
class FocalLoss(Module):
  """
  Focal Loss implementation
  """
  def __init__(self, alpha=1.0, gamma=2.0, logits=False, reduce=True):
    super(FocalLoss, self).__init__()
    self.alpha   = alpha
    self.gamma   = gamma
    self.logits  = logits  # This flag is to indicate whether input is logits or probability
    self.reduce  = reduce

  # inputs  = model's predictions: PyTorch tensor, shape=(batch_size, num_classes)
  # targets = ground truth labels: PyTorch tensor, shape=same as inputs shape
  def forward(self, inputs, targets):
    # Here, we check if input is probability or logits
    if self.logits:
      # Input is logits
      BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
    else:
      # Input is probability
      BCE_loss = F.binary_cross_entropy(inputs, targets, reduction='none')
    pt = torch.exp(-BCE_loss)
    F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

    if self.reduce:
      return torch.mean(F_loss)
    else:
      return F_loss

  def __repr__(self):
    return f"FocalLoss(alpha={self.alpha}, gamma={self.gamma}, logits={self.logits}, reduce={self.reduce})"

  def __str__(self):
    return f"FocalLoss(alpha={self.alpha}, gamma={self.gamma}, logits={self.logits}, reduce={self.reduce})"

  def __call__(self, inputs, targets):
    return self.forward(inputs, targets)

In [64]:
focal_loss_fn = FocalLoss(alpha=0.5, gamma=4.0, logits=True, reduce=True)
print(f"focal_loss_fn: {type(focal_loss_fn)} {focal_loss_fn}")

focal_loss_fn: <class '__main__.FocalLoss'> FocalLoss(alpha=0.5, gamma=4.0, logits=True, reduce=True)


### HF transformer Trainer and CustomTrainer
Abstracts the training loop.

In [65]:
training_args = TrainingArguments(
    output_dir                  = './training_results',  # where model predictions and checkpoints will be written during training
    overwrite_output_dir        = True,
    logging_dir                 = './logs',
    logging_steps               = 50,
    save_steps                  = 500,
    save_total_limit            = 2,
    eval_strategy               = 'epoch',
    save_strategy               = 'epoch',
    learning_rate               = learning_rate,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    num_train_epochs            = epochs,
    weight_decay                = 0.01,
    load_best_model_at_end      = True,
    metric_for_best_model       = metric_name,
    run_name                    = run_name,
    fp16                        = fp,
    report_to                  = 'wandb'
)

In [66]:
class CustomTrainer(Trainer):

  def __init__(self, *args, loss_fn=None, **kwargs):
    super().__init__(*args, **kwargs)
    self.loss_fn = loss_fn

  """
  # No print in compute_loss because out of memory because prints are batch per batch
  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):

    #print(f"inputs passed to compute_loss: {inputs.keys()}")
    #input_ids             = inputs['input_ids']                        # shape: batch_size, sequence_length
    #attention_mask        = inputs['attention_mask']                   # shape: batch_size, sequence_length
    #global_attention_mask = inputs.get('global_attention_mask', None)  # shape: batch_size, sequence_length; optional as LongFormer specific
    labels                = inputs.pop('labels', None)                 # shape: batch_size, num_labels; needed for loss computation, not required by the model

    #outputs = model(**inputs, global_attention_mask=global_attention_mask)  # Forward pass
    # Forward pass
    #outputs = model(
    #    input_ids             = input_ids,
    #    attention_mask        = attention_mask,
    #    global_attention_mask = global_attention_mask,
    #    labels                = labels
    #)
    outputs = model(**inputs, labels=labels)
    #print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")
    logits = outputs.logits  # shape: (batch_size, num_labels)

    # If labels are provided, compute loss
    if labels is not None:
      # Use the custom loss function if provided
      if self.loss_fn is not None:
        loss = self.loss_fn(logits, labels)  # Compute weighted loss
      else:
        # Default loss: BCEWithLogitsLoss
        loss_fn = BCEWithLogitsLoss()
        loss    = loss_fn(logits, labels)    # Compute loss
      return (loss, outputs) if return_outputs else loss

    # If no labels, return outputs only, for evaluation or prediction
    return outputs
    """
  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    labels  = inputs.pop('labels', None)
    outputs = model(**inputs, labels=labels)
    logits  = outputs.logits

    if labels is not None:
      if self.loss_fn is not None:
        loss = self.loss_fn(logits, labels)
      else:
        loss_fn = BCEWithLogitsLoss()
        loss    = loss_fn(logits, labels)
      return (loss, outputs) if return_outputs else loss

    return outputs

In [67]:
"""
trainer = CustomTrainer(
    model           = model,
    args            = training_args,
    train_dataset   = encoded_dataset["train"],
    eval_dataset    = encoded_dataset["validation"],
    compute_metrics = compute_metrics,                # custom metrics function
    loss_fn         = focal_loss_fn,
)
"""

trainer = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = encoded_dataset["train"],
    eval_dataset    = encoded_dataset["validation"],
    compute_metrics = compute_metrics,                # custom metrics function
)

In [68]:
"""
predictions_output: <class 'transformers.trainer_utils.PredictionOutput'> len=3:
- predictions: np.ndarray          raw model outputs = logits
- label_ids  : np.ndarray or None: true labels corresponding to the predictions, if available
- metrics    : dict              : metrics computed during prediction step
"""
"""
predictions_output = trainer.predict(test_dataset)
print(f"predictions_output: {type(predictions_output)} len={len(predictions_output)}\n{predictions_output}")  # <class 'transformers.trainer_utils.PredictionOutput'> len=3
logits = predictions_output.predictions
print(f"logits: {type(logits)} len={len(logits)} shape={logits.shape}\n{logits}")


raise Exception("I'm here")
"""

'\npredictions_output = trainer.predict(test_dataset)\nprint(f"predictions_output: {type(predictions_output)} len={len(predictions_output)}\n{predictions_output}")  # <class \'transformers.trainer_utils.PredictionOutput\'> len=3\nlogits = predictions_output.predictions\nprint(f"logits: {type(logits)} len={len(logits)} shape={logits.shape}\n{logits}")\n\n\nraise Exception("I\'m here")\n'

### trainer.train

In [69]:
trainer_train_results = trainer.train()

print(f"trainer_train_results: {type(trainer_train_results)} len={len(trainer_train_results)}\n{trainer_train_results}")
print()
print(f"trainer_train_results.metrics: {type(trainer_train_results.metrics)} len={len(trainer_train_results.metrics)}\n{json.dumps(trainer_train_results.metrics, indent=4)}")

Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Roc Auc,Precision Recall Auc,Accuracy
1,0.306,0.300233,0.863459,0.852435,0.874771,0.944436,0.928424,0.51125
2,0.286,0.287859,0.871516,0.86427,0.878885,0.949883,0.935046,0.529167
3,0.2643,0.28357,0.870792,0.87664,0.865021,0.951897,0.938496,0.520833
4,0.2511,0.279155,0.877861,0.870562,0.885283,0.953416,0.940264,0.534167
5,0.2301,0.277958,0.878573,0.87211,0.885131,0.953454,0.940711,0.53625


trainer_train_results: <class 'transformers.trainer_utils.TrainOutput'> len=3
TrainOutput(global_step=3000, training_loss=0.280786548614502, metrics={'train_runtime': 4810.34, 'train_samples_per_second': 19.957, 'train_steps_per_second': 0.624, 'total_flos': 6.3059569016832e+16, 'train_loss': 0.280786548614502, 'epoch': 5.0})

trainer_train_results.metrics: <class 'dict'> len=6
{
    "train_runtime": 4810.34,
    "train_samples_per_second": 19.957,
    "train_steps_per_second": 0.624,
    "total_flos": 6.3059569016832e+16,
    "train_loss": 0.280786548614502,
    "epoch": 5.0
}


In [70]:
print("trainer.train successfully completed")

trainer.train successfully completed


### trainer.train results: save locally and upload to HF Hub

In [71]:
if upload_to_HF:

  name               = "trainer_train"
  trainer_train_path = f"{name}.json"

  with open(trainer_train_path, "w") as f:
    json.dump(trainer_train_results, f)

  print(f"{name} results successfully saved locally to {trainer_train_path}")

  upload_file(
      path_or_fileobj = trainer_train_path,
      path_in_repo    = trainer_train_path,
      repo_id         = repo_id,
      repo_type       = 'dataset',
      commit_message  = f"{name}_{timestamp}"
  )

  print(f"{name} results successfully uploaded to HF Hub as {trainer_train_path}")

trainer_train results successfully saved locally to trainer_train.json
trainer_train results successfully uploaded to HF Hub as trainer_train.json


### trainer.train results: check that the uploaded file can be downloaded
File locally downloaded to:
/root/.cache/huggingface/hub/datasets-claudelepere-skill_classification/snapshots/full_commit_hash/trainer_train_results.json

In [72]:
if upload_to_HF:

  file_path = hf_hub_download(repo_type="dataset", repo_id=repo_id, filename=trainer_train_path)

  print(f"file_path: {file_path}")

trainer_train.json:   0%|          | 0.00/204 [00:00<?, ?B/s]

file_path: /root/.cache/huggingface/hub/datasets--claudelepere--jobs_EN_11_0_24000_tuned_thresholds/snapshots/62a98fed979319f50af344a09440f911a6e01ded/trainer_train.json


## Evaluation step

### Evaluation: trainer.evaluate
trainer.evaluate uses a fixed threshold of 0.5 to convert logits into binary labels, which is often suboptimal for imbalanced data.

In [73]:
evaluation_trainer_evaluate_results = trainer.evaluate(
    #eval_dataset=encoded_dataset["validation"],  # by default, trainer.evaluate() evaluates the dataset passed as eval_dataset during training
    metric_key_prefix="eval"                     # prefix for the evaluation metrics
)

print(f"evaluation_trainer_evaluate_results: {type(evaluation_trainer_evaluate_results)} len={len(evaluation_trainer_evaluate_results)}\n{json.dumps(evaluation_trainer_evaluate_results, indent=4)}")


evaluation_trainer_evaluate_results: <class 'dict'> len=11
{
    "eval_loss": 0.27795812487602234,
    "eval_f1": 0.8785725086949947,
    "eval_precision": 0.8721104773341339,
    "eval_recall": 0.8851310176721511,
    "eval_roc_auc": 0.9534538341453793,
    "eval_precision_recall_auc": 0.9407111366154768,
    "eval_accuracy": 0.53625,
    "eval_runtime": 36.1258,
    "eval_samples_per_second": 66.435,
    "eval_steps_per_second": 8.304,
    "epoch": 5.0
}


In [74]:
print("evaluation_trainer.evaluate successfully completed")

evaluation_trainer.evaluate successfully completed


### Evaluation: trainer.evaluate results: save locally and upload to HF Hub

In [75]:
if upload_to_HF:

  name                             = "evaluation_trainer_evaluate"
  evaluation_trainer_evaluate_path = f"{name}.json"

  with open(evaluation_trainer_evaluate_path, "w") as f:
    json.dump(evaluation_trainer_evaluate_results, f)

  print(f"{name} successfully saved locally to {evaluation_trainer_evaluate_path}")

  upload_file(
      path_or_fileobj = evaluation_trainer_evaluate_path,
      path_in_repo    = evaluation_trainer_evaluate_path,
      repo_id         = repo_id,
      repo_type       = 'dataset',
      commit_message  = f"{name}_{timestamp}"
  )

print(f"{name} successfully uploaded to HF Hub as {evaluation_trainer_evaluate_path}")

evaluation_trainer_evaluate successfully saved locally to evaluation_trainer_evaluate.json
evaluation_trainer_evaluate successfully uploaded to HF Hub as evaluation_trainer_evaluate.json


### Evaluation: trainer.predict

In [76]:
def tune_thresholds(true_labels, probs, id2label):
  """
  Tune thresholds for each label to maximize F1 alone, as F1 balances precision and recall into a single metric.

  Args:
    true_labels: actual labels for the data                                      (numpy array of shape (num_samples, num_labels))
    probs      : predicted probabilities                                         (numpy array of shape (num_samples, num_labels))
    id2label   : dictionary mapping label indices (int) to label names (string)

  Returns:
    best_thresholds: best threshold for each label                                                      (numpy array of shape (num_labels,))
    best_metrics   : dictionary of best F1, precision_for_best_f1 and recall_for_best_f1 for each label (dictionary of numpy arrays)
  """
  thresholds      = np.linspace(0.1, 0.9, 9)
  best_thresholds = np.zeros(len(id2label))
  best_metrics    = {label: {'f1': 0.0, 'precision': 0.0, 'recall': 0.0} for label in id2label.values()}

  for label_idx, label in id2label.items():
    for threshold in thresholds:
      pred                     = (probs[:, label_idx] > threshold).astype(int)
      precision, recall, f1, _ = precision_recall_fscore_support(true_labels[:, label_idx], pred, average='binary', zero_division=0)
      if f1 > best_metrics[label]['f1']:
        best_thresholds[label_idx]       = threshold
        best_metrics[label]['f1']        = f1
        best_metrics[label]['precision'] = precision
        best_metrics[label]['recall']    = recall

  print("==== tune_thresholds ====")
  print(f"best_thresholds: {type(best_thresholds)} shape={best_thresholds.shape}\n{best_thresholds}")
  print(f"best_metrics   : {type(best_metrics)}    len={len(best_metrics)}      \n{json.dumps(best_metrics, indent=4)}")
  print("=========================")
  print()

  return best_thresholds, best_metrics


In [77]:
def compute_metrics_with_threshold(probs, label_ids, thresholds, id2label):
  """
  Compute metrics during evaluation or test, by applying tuned thresholds
  """
  #logits  = eval_preds.predictions
  #labels  = eval_preds.label_ids
  #sigmoid = torch.nn.Sigmoid  # Sigmoid or numpy?
  #probs   = sigmoid(logits).cpu().numpy()
  preds   = np.zeros_like(probs)

  # Apply threshold per label
  for label_idx in id2label.keys():
    preds[:, label_idx] = (probs[:, label_idx] > thresholds[label_idx]).astype(int)

  # Compute metrics
  f1                    = f1_score               (label_ids, preds, average='micro')
  precision             = precision_score        (label_ids, preds, average='micro')
  recall                = recall_score           (label_ids, preds, average='micro')
  accuracy              = accuracy_score         (label_ids, preds)
  roc_auc               = roc_auc_score          (label_ids, probs, average='micro')  # multi_class="ovr" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  precision_recall_auc  = average_precision_score(label_ids, probs, average='micro')

  # Use id2label for target_names
  report = classification_report(label_ids, preds, target_names=id2label.values(), zero_division=0)

  metrics = {
      'f1'                   : f1,
      'precision'            : precision,
      'recall'               : recall,
      'accuracy'             : accuracy,
      'roc_auc'              : roc_auc,
      'precision_recall_auc' : precision_recall_auc,
      'thresholds'           : thresholds.tolist(),
      'classification_report': report
  }

  return metrics


In [78]:
def predict_with_thresholds(trainer, dataset, id2label, threshold_tuning=False, thresholds=None):
    """
    Predicts using trainer.predict(), with optional thresholds tuning.

    Parameters:
      - trainer         : Hugging Face Trainer or CustomTrainer instance
      - dataset         : dataset to predict on
      - id2label        : dictionary mapping label indices (int) to label names (string)
      - threshold_tuning: boolean to enable thresholds tuning (True if evaluation, False if prediction)
      - thresholds      :                                     (None if evaluation, tuned thresholds if prediction)

    Returns:
      - best_thresholds (if threshold_tuning=True): optimized threshold for each label
      - best_metrics                              : computed with tuned thresholds whether for evaluation or prediction
    """
    # Predict
    predictions_output = trainer.predict(dataset)        # <class 'transformers.trainer_utils.PredictionOutput'> len   = 3
    predictions        = predictions_output.predictions  # <class 'numpy.ndarray'>                               shape = (1200, 6)
    label_ids          = predictions_output.label_ids    # <class 'numpy.ndarray'>                               shape = (1200, 6)
    metrics            = predictions_output.metrics      # <class 'dict'> = trainer.evaluate results             len   = 10

    print("==== predict_with_thresholds ====")
    print(f"predictions: {type(predictions)} shape={predictions.shape}               \n{predictions}")
    print(f"label_ids  : {type(label_ids)}   shape={label_ids.shape}                 \n{label_ids}")
    print(f"metrics    : {type(metrics)}     len={len(metrics)}    = trainer.evaluate\n{json.dumps(metrics, indent=4)}")

    logits             = predictions
    true_labels        = label_ids

    # Convert logits to probabilities
    probs = 1 / (1 + np.exp(-logits))                    # <class 'numpy.ndarray'>                               shape = (1200, 6)
    print(f"probs: {type(probs)} shape={probs.shape}")
    print("=================================")
    print()

    if threshold_tuning:
      best_thresholds, best_metrics = tune_thresholds(true_labels, probs, id2label)
      metrics                       = compute_metrics_with_threshold(probs, label_ids, best_thresholds, id2label)
      return best_thresholds, metrics
    else:
      metrics = compute_metrics_with_threshold(probs, label_ids, thresholds, id2label)
      return metrics


In [79]:
best_thresholds, evaluation_trainer_predict_results = predict_with_thresholds(trainer, validation_dataset, id2label, threshold_tuning=True, thresholds=None)

except_report = {k: v for k, v in evaluation_trainer_predict_results.items() if k!='classification_report'}
report        = evaluation_trainer_predict_results['classification_report']

print("==== predict_with_thresholds ====")
print(f"evaluation_trainer_predict_results:                          {type(except_report)} len={len(except_report)}\n{json.dumps(except_report, indent=4)}")
print(f"evaluation_trainer_predict_results['classification_report']: {type(report)}        len={len(report)}\n       {report}")
print("=================================")
print()


==== predict_with_thresholds ====
predictions: <class 'numpy.ndarray'> shape=(2400, 6)               
[[-5.234375   -3.6757812   2.9941406   4.4257812   1.4599609  -5.0078125 ]
 [-5.2226562  -3.2246094   1.3759766   4.5703125   1.9560547  -4.7890625 ]
 [-2.0878906   1.4111328   3.921875    2.2792969  -0.4416504  -1.2460938 ]
 ...
 [-3.5292969  -0.04400635  3.1308594   2.5742188  -0.57421875 -2.5625    ]
 [-4.984375   -3.109375    3.8476562   4.171875    0.8173828  -4.765625  ]
 [-0.95214844  2.3886719   3.3007812   1.2050781  -0.9423828  -0.02017212]]
label_ids  : <class 'numpy.ndarray'>   shape=(2400, 6)                 
[[0. 0. 1. 1. 1. 0.]
 [0. 0. 1. 1. 0. 0.]
 [0. 1. 1. 0. 0. 1.]
 ...
 [0. 1. 1. 1. 0. 1.]
 [0. 0. 1. 1. 1. 0.]
 [0. 1. 1. 1. 0. 1.]]
metrics    : <class 'dict'>     len=10    = trainer.evaluate
{
    "test_loss": 0.27795812487602234,
    "test_f1": 0.8785725086949947,
    "test_precision": 0.8721104773341339,
    "test_recall": 0.8851310176721511,
    "test_roc_auc": 0

## Evaluation: best thresholds: save locally (as a dict) and upload to HF Hub (as a JSON file)
best_thresholds: <class 'numpy.ndarray'> shape=(6,) but JSON doesn't support Numpy types

In [80]:
best_thresholds_list = best_thresholds.tolist()
best_thresholds_dict = {label: threshold for label, threshold in zip(id2label.values(), best_thresholds_list)}

print(f"best_thresholds_dict: {type(best_thresholds_dict)} len={len(best_thresholds_dict)}\n{best_thresholds_dict}")

name                 = "best_thresholds"
best_thresholds_path = f"{name}.json"

with open(best_thresholds_path, "w") as f:
  json.dump(best_thresholds_dict, f, indent=4)

print(f"{best_thresholds} successfully saved locally to {best_thresholds_path}")

path_in_repo = "tuned_thresholds.json"

upload_file(
    path_or_fileobj = best_thresholds_path,
    path_in_repo    = path_in_repo,
    repo_id         = repo_id,
    repo_type       = 'model',
    commit_message  = f"tuned_thresholds_{timestamp}"
)

print(f"{name} successfully uploaded to HF Hub as {path_in_repo}")

best_thresholds_dict: <class 'dict'> len=6
{'135': 0.5, '136': 0.4, '137': 0.4, '138': 0.4, '139': 0.4, '390': 0.30000000000000004}
[0.5 0.4 0.4 0.4 0.4 0.3] successfully saved locally to best_thresholds.json
best_thresholds successfully uploaded to HF Hub as tuned_thresholds.json


In [81]:
print("evaluation_trainer.predict successfully completed")

evaluation_trainer.predict successfully completed


### Evaluation: trainer.predict results: save locally and upload to HF Hub

In [82]:
if upload_to_HF:

  name                            = "evaluation_trainer_predict"
  evaluation_trainer_predict_path = f"{name}.json"

  with open(evaluation_trainer_predict_path, "w") as f:
    json.dump(evaluation_trainer_predict_results, f)

  print(f"{name} results successfully saved locally to {evaluation_trainer_predict_path}")

  upload_file(
      path_or_fileobj = evaluation_trainer_predict_path,
      path_in_repo    = evaluation_trainer_predict_path,
      repo_id         = repo_id,
      repo_type       = 'dataset',
      commit_message  = f"{name}_{timestamp}"
  )

  print(f"{name} results successfully uploaded to HF Hub as {evaluation_trainer_evaluate_path}")

evaluation_trainer_predict results successfully saved locally to evaluation_trainer_predict.json
evaluation_trainer_predict results successfully uploaded to HF Hub as evaluation_trainer_evaluate.json


## Prediction step

### Prediction: trainer.evaluate

In [83]:
prediction_trainer_evaluate_results = trainer.evaluate(
    eval_dataset=encoded_dataset['test'],  # by default, trainer.evaluate() evaluates the dataset passed as eval_dataset during training
    metric_key_prefix='test'
)

print(f"prediction_trainer_evaluate_results: {type(prediction_trainer_evaluate_results)} len={len(prediction_trainer_evaluate_results)}\n{json.dumps(prediction_trainer_evaluate_results, indent=4)}")


prediction_trainer_evaluate_results: <class 'dict'> len=11
{
    "test_loss": 0.2718951404094696,
    "test_f1": 0.876100099487258,
    "test_precision": 0.8677986658580958,
    "test_recall": 0.8845618915159944,
    "test_roc_auc": 0.9553717441177797,
    "test_precision_recall_auc": 0.943119318999831,
    "test_accuracy": 0.5266666666666666,
    "test_runtime": 36.0847,
    "test_samples_per_second": 66.51,
    "test_steps_per_second": 8.314,
    "epoch": 5.0
}


In [84]:
print("prediction_trainer.evaluate successfully completed")

prediction_trainer.evaluate successfully completed


### Prediction: trainer.evaluate results: save locally and upload to HF Hub

In [85]:
if upload_to_HF:

  name                             = "prediction_trainer_evaluate"
  prediction_trainer_evaluate_path = f"{name}.json"

  with open(prediction_trainer_evaluate_path, "w") as f:
    json.dump(prediction_trainer_evaluate_results, f)

  print(f"{name} results successfully saved locally to {prediction_trainer_evaluate_path}")

  upload_file(
      path_or_fileobj = prediction_trainer_evaluate_path,
      path_in_repo    = prediction_trainer_evaluate_path,
      repo_id         = repo_id,
      repo_type       = 'dataset',
      commit_message  = f"{name}_{timestamp}"
  )

  print(f"{name} results successfully uploaded to HF Hub as {prediction_trainer_evaluate_path}")

prediction_trainer_evaluate results successfully saved locally to prediction_trainer_evaluate.json
prediction_trainer_evaluate results successfully uploaded to HF Hub as prediction_trainer_evaluate.json


### Prediction: trainer.predict

In [86]:
prediction_trainer_predict_results = predict_with_thresholds(trainer, test_dataset, id2label, threshold_tuning=False, thresholds=best_thresholds)

except_report = {k: v for k, v in prediction_trainer_predict_results.items() if k!='classification_report'}
report        = prediction_trainer_predict_results['classification_report']
print(f"prediction_trainer_predict_results: {type(except_report)} len={len(except_report)}\n{json.dumps(except_report, indent=4)}")
print(f"prediction_trainer_predict_results['classification_report']: {type(report)} len={len(report)}\n{report}")


==== predict_with_thresholds ====
predictions: <class 'numpy.ndarray'> shape=(2400, 6)               
[[-2.1328125   1.7099609   3.9199219   2.4628906   0.14575195 -1.2011719 ]
 [-5.0625     -4.28125    -0.92089844  3.7070312   3.1914062  -5.0351562 ]
 [-4.90625    -2.859375    4.1054688   4.0664062   0.6230469  -4.6445312 ]
 ...
 [-4.9140625  -2.3359375   0.92626953  3.8886719   1.4853516  -4.3007812 ]
 [-2.5566406  -0.60839844  1.9287109   2.2050781   0.12561035 -2.1230469 ]
 [-5.296875   -3.21875     0.3305664   3.9375      1.9169922  -4.734375  ]]
label_ids  : <class 'numpy.ndarray'>   shape=(2400, 6)                 
[[0. 1. 1. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0.]
 [0. 0. 1. 1. 1. 0.]
 ...
 [0. 0. 0. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1.]
 [0. 0. 1. 1. 1. 0.]]
metrics    : <class 'dict'>     len=10    = trainer.evaluate
{
    "test_loss": 0.2718951404094696,
    "test_f1": 0.876100099487258,
    "test_precision": 0.8677986658580958,
    "test_recall": 0.8845618915159944,
    "test_roc_auc": 0.9

In [87]:
print("prediction_trainer.predict successfully completed")

prediction_trainer.predict successfully completed


### Prediction: trainer.predict results: save locally and upload to HF Hub

In [88]:
if upload_to_HF:

  name                            = "prediction_trainer_predict"
  prediction_trainer_predict_path = f"{name}.json"

  with open(prediction_trainer_predict_path, "w") as f:
    json.dump(prediction_trainer_predict_results, f)

  print(f"{name} results successfully saved locally to {prediction_trainer_predict_path}")

  upload_file(
      path_or_fileobj = prediction_trainer_predict_path,
      path_in_repo    = prediction_trainer_predict_path,
      repo_id         = repo_id,
      repo_type       = 'dataset',
      commit_message  = f"{name}_{timestamp}"
  )

  print(f"{name} results successfully uploaded to HF Hub as {prediction_trainer_evaluate_path}")

prediction_trainer_predict results successfully saved locally to prediction_trainer_predict.json
prediction_trainer_predict results successfully uploaded to HF Hub as prediction_trainer_evaluate.json


## Upload tokenizer and model to HF Hub and check

In [89]:
if upload_to_HF:

  # Upload
  commit_message = f"'model_tokenizer'_{timestamp}"

  tokenizer.push_to_hub(repo_id, commit_message=commit_message)  # commit_message as named parameter
  model.push_to_hub(    repo_id, commit_message=commit_message)  # commit_message as named parameter

  print(f"tokenizer and model successfully uploaded to HF Hub at {repo_id}")

  # Check
  def check_upload(repo_id):
    print()
    print("Tokenizer")
    tokenizer = LongformerTokenizerFast.from_pretrained(repo_id)
    print()
    print("Model")
    model = LongformerForSequenceClassification.from_pretrained(repo_id)
    print()

    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)

    print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

  # To check if the upload was successful, download the tokenizer and the model
  check_upload(repo_id)

README.md:   0%|          | 0.00/593 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/595M [00:00<?, ?B/s]

tokenizer and model successfully uploaded to HF Hub at claudelepere/jobs_EN_11_0_24000_tuned_thresholds

Tokenizer


tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]


Model


config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/595M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Input ids are automatically padded to be a multiple of `config.attention_window`: 1024



outputs: <class 'transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput'> odict_keys(['logits'])
LongformerSequenceClassifierOutput(loss=None, logits=tensor([[-0.8075,  1.2086,  1.4755,  0.1166, -1.2017, -0.1298]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, global_attentions=None)


In [90]:
print("It's the end")

It's the end


In [91]:
raise Exception("I stop here")

Exception: I stop here

===========================================================================================================

In [None]:
example = datasetDict['test'][0]
print(f"datasetDict['test'][0]: {type(example)} {example.keys()}\n{example}")

In [None]:
inputs = tokenizer(
    example['text'],
    truncation     = True,
    padding        = 'max_length',
    max_length     = max_length,
    return_tensors = 'pt'
)

In [None]:
with torch.no_grad():  # Disable gradient calculation during prediction
    outputs = model(
        input_ids=inputs.input_ids.to(device),
        attention_mask=inputs.attention_mask.to(device)
    )


In [None]:
probs = torch.sigmoid(outputs.logits)


In [None]:
best_thresholds = [0.4, 0.3, 0.3, 0.3, 0.4, 0.5]
preds = np.zeros_like(probs)  # Initialize predictions array
for label_idx in range(num_labels):
  preds[:, label_idx] = (probs[:, label_idx] > best_thresholds[label_idx])  #.astype(int)

In [None]:
print(f"probs: {type(probs)} shape={probs.shape}\n{probs}")
print(f"preds: {type(preds)} shape={preds.shape}\n{preds}")

In [None]:
print(f"labels: {type(labels)} {len(labels)}\n{labels}")
# '390': False, '135': False, '136': True, '137': True, '138': True, '139': False}