<a href="https://colab.research.google.com/github/claudelepere/ML_GitHub/blob/main/Trainer_02_2e_5_1024_FocalLoss_alpha05_gamma4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q accelerate
!pip install -q huggingface_hub
!pip install -q scikit-learn

# transformers and datasets are Hugging Face libraries
!pip install -q transformers datasets

!pip install -q wandb



In [None]:
import json
import numpy as np
import os
import pandas as pd
import pickle
import sys
import time
import torch
import torch.nn.functional as F
import wandb

In [None]:
from datasets              import DatasetDict
from google.colab          import auth, drive, files, userdata
from huggingface_hub       import create_repo, login, upload_file, hf_hub_download
from huggingface_hub.utils import RepositoryNotFoundError
from sklearn.metrics       import accuracy_score, average_precision_score, classification_report, f1_score, precision_score, precision_recall_curve, precision_recall_fscore_support, recall_score, roc_auc_score
from torch.utils.data      import DataLoader
from tqdm.auto             import tqdm
from transformers          import AdamW, EvalPrediction, LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments
from torch.nn              import BCEWithLogitsLoss, Module

In [None]:
import os
from google.colab    import userdata
from huggingface_hub import login, hf_hub_download

In [None]:
# Hugging Face Authenticate
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")    # Store the key in os.environ
hf_token               = os.environ.get('HF_TOKEN')
login(token=hf_token)

In [None]:
# Verify
!huggingface-cli whoami

In [None]:
file_path = hf_hub_download(
    repo_id   ="claudelepere/skill_classification",
    repo_type = "dataset",
    filename  = "test_model_eval_results.csv"
)
print(f"file_path: {file_path}")  # /root/.cache/huggingface/hub/datasets--claudelepere--skill_classification/snapshots/51ead81f69b1689fc19694b3f034585cde9f56e1/test_model_eval_results.csv

Next, open in Colab or download to local

In [None]:
"""
# Check the Python version
print(sys.version)
print()

# Get the installed packages (you can see that conda is not installed (do not install it))
!pip list
print()

# Check system information
!cat /etc/os-release
!uname -m
print()

# Check the GPU details (only if the runtime type is T4 GPU)
#!nvidia-smi
#print()

# Check RAM
!free -h
print()

# Check disk space
!df -h
print()

# Get environment variables
for key, value in os.environ.items():
    print(f"{key}: {value}")
"""
!python -V

In [None]:
print(f"currentdir: {os.getcwd()}")

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

In [None]:
datasetDict_zip_file_name = "dataset_11_12000.zip"
datasetDict_dir_name      = os.path.splitext(datasetDict_zip_file_name)[0]
print(f"datasetDict_zip_file_name: {datasetDict_zip_file_name}")
print(f"datasetDict_dir_name     : {datasetDict_dir_name}")
print()

In [None]:
# OOM: reduce batch size
#      small sizes (1 to 32):            PROs: better generalization in some cases
#                                        CONs: may produce noisier gradients
#      large sizes (128, 256, or higher): PROs: gradients are smoother, leading to more stable training
#                                        CONs: poorer generalization (overfitting) in some cases
#      intermediate sizes (32, 64):      combines the benefits of small and large sizes
batch_size = 8

In [None]:
# OOM: enable gradient accumulation to compensate for smaller batch sizes by accumulating gradients over several steps
#      effective batch size = per-device batch size x gradient accumulation steps;
#      without accumulation, every iteration computes the gradients and immediately uses them to update the model parameters;
#      with accumulation, the gradients of several iterations are summed before a single parameter update
gradient_accumulation_steps = 4  #<<<<<<<<<<<<<<<<<<< gradient_accumulation_steps may not be None => comment it in TrainingArguments

In [None]:
# OOM: use PYTORCH_CUDA_ALLOC_CONF to handle memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
# OOM: check for zombie processes
if torch.cuda.is_available():
  !nvidia-smi
  torch.cuda.memory_summary()
!ps aux | grep python
!kill -9 <PID>
!nvidia-smi     # Checked if killed

In [None]:
# OOM: use fp16 (half precision) mixed precision training
#      reduces memory requirements by up to 50%
fp16 = True

OOM: limit the number of DataLoader worker processes: 0 (default) or 1 in Colab
dataloader_num_workers = 1

In [None]:
# OOM: reduce model size or input tokens
#      1) LongformerTokenizer.from_pretrained('allenai/longformer-base/large-4096'): large/base: 435M/149M parameters
#      2) max_length: 4096 max for Longformer; 1 word can give several tokens, stop words are NOT discarded!
#         word_text_length_counts_sorted: jobs count                 : 50000
#                                         jobs count under  512 words: 44794  89.59%
#                                         jobs count under  640 words: 47894  95.79%
#                                         jobs count under  768 words: 49123  98.25%
#                                         jobs count under  896 words: 49691  99.38%
#                                         jobs count under 1024 words: 49917  99.83%
#                                         jobs count under 2048 words: 50000 100.00%
#                                         jobs count under 4096 words: 50000 100.00%
#max_length =  768    #      37 min    #
max_length = 1024    #      38 min    # GPU RAM: 12.2 / 40 GB
#max_length = 2048    # 1 hr 10 min    # GPU RAM: 21.4 / 40 GB
#max_length = 4096    # 2 hr 10 min    # GPU RAM: 39.5 / 40 GB => OutOfMemoryError

In [None]:
# OOM: free up GPU memory
torch.cuda.empty_cache()

In [None]:
# OOM: monitor GPU memory usage
!nividia-smi

In [None]:
# 1 epoch is a complete pass through the entire training dataset;
# with n datapoints and batch size = b, n/b iterations to complete 1 epoch;
# 1 iteration is a single update of the model's parameters
epochs = 5

In [None]:
# A common rule is to scale the learning rate proportionally with the effective batch size
# note: get_linear_schedule_with_warmup <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
learning_rate = 2e-5  # 1e-5 x 32/8

Reduce the number of transformer layers
hidden_layers = 12    # 12 (default) or 6

In [None]:
# Threshold: 0.5 (default)
threshold = 0.2

In [None]:
if fp16:
  _fp = "fp16"
else:
  _fp = "fp32"

In [None]:
if 'gradient_accumulation_steps' not in globals():
  run_name = f"Longformer-multilabel-{datasetDict_dir_name}-length{max_length}-batch{batch_size}-epochs{epochs}-lr{learning_rate}-{_fp}-threshold{threshold}"
else:
  run_name = f"Longformer-multilabel-{datasetDict_dir_name}-length{max_length}-batch{batch_size}x{gradient_accumulation_steps}-epochs{epochs}-lr{learning_rate}-{_fp}-threshold{threshold}"
print(f"run_name                 : {run_name}")

In [None]:
def upload_unzip_dataset(file_name=datasetDict_zip_file_name):
  """Make sure the dataset directory matching `file_name` is extracted in the working directory.

  The directory name is `file_name` without its extension. Steps:
    1. if that directory already exists, do nothing;
    2. otherwise, if the zip archive is not on disk, open the Colab upload dialog;
    3. extract the archive and check that the expected directory appeared.

  Args:
    file_name: name of the zip archive (e.g. "dataset_11_12000.zip").

  Raises:
    FileNotFoundError: the upload dialog was closed without providing `file_name`.
    AssertionError: the archive did not contain a directory named after it.
  """
  import zipfile  # local import: only this helper needs it

  dir_name = os.path.splitext(file_name)[0]

  # Test for the extracted directory, not for the archive: an archive that is
  # present but not yet extracted must still be unpacked.
  if os.path.isdir(dir_name):
    print(f"'{dir_name}' already exists in /content.")
    return

  if not os.path.exists(file_name):
    print(f"'{file_name}' not found in /content. Uploading...")
    uploaded_files = files.upload()                              # Prompt file upload dialog
    if file_name not in uploaded_files:
      raise FileNotFoundError(f"'{file_name}' was not uploaded. Please try again.")
    print(f"'{file_name}' successfully uploaded to /content")

  # Extract the requested archive (not "the first uploaded file", which may be another one)
  with zipfile.ZipFile(file_name) as archive:
    archive.extractall()

  assert os.path.isdir(dir_name), f"'{file_name}' did not contain the expected directory '{dir_name}'"

In [None]:
upload_unzip_dataset(datasetDict_zip_file_name)

Hugging Face Authenticate

In [None]:
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")    # Store the key in os.environ
hf_token               = os.environ.get('HF_TOKEN')
login(token=hf_token)

In [None]:
# Verify
!huggingface-cli whoami

Create the skill_classification repo on the Hugging Face Hub

In [None]:
HF_name         = "claudelepere/skill_classification"
repo_id_model   = HF_name
repo_id_dataset = HF_name

In [None]:
repo_model_url = create_repo(
    repo_id   = repo_id_model,
    repo_type = "model",
    private   = True,
    exist_ok  = True
)
print(f"Repo model url: {repo_model_url} created successfully as a private repo.")

In [None]:
repo_dataset_url = create_repo(
    repo_id   = repo_id_dataset,
    repo_type = "dataset",
    private   = True,
    exist_ok  = True
)
print(f"Repo datasets url: {repo_dataset_url} created successfully as a private repo.")

In [None]:
repo_id_dataset = f"datasets/{HF_name}"

In [None]:
print(f"repo_id_model: {repo_id_model}")
print(f"repo_id_dataset: {repo_id_dataset}")

W&B initialization

In [None]:
os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")        # Store the key in os.environ
wandb_api_key               = os.environ.get('WANDB_API_KEY')
wandb.login(key=wandb_api_key)

In [None]:
# Start the W&B run; the logged config must mirror the hyperparameters actually used for training
try:
  wandb.init(
      project = "skill_classification",
      name    = run_name,
      entity  = "claudelepere-c-cile-cy",
      config  = {
          "learning_rate": learning_rate,
          "epochs"       : epochs,        # use the configured value instead of a hard-coded 5
          "batch_size"   : batch_size
      }
  )
except wandb.errors.CommError as err:
  print(f"CommError: {err}")
except Exception as exc:
  print(f"Exception: {exc}")

Load the dataset: 3 Hugging Face Datasets (train, validation, test) in a Hugging Face DatasetDict

In [None]:
datasetDict = DatasetDict.load_from_disk(datasetDict_dir_name)

In [None]:
print(f"datasetDict: {type(datasetDict)} {datasetDict.shape}\n{datasetDict}")
print(f"datasetDict.keys(): {datasetDict.keys()}")
print(f"datasetDict['train']: {type(datasetDict['train'])} {datasetDict['train'].shape}")
print(f"datasetDict['validation']: {type(datasetDict['validation'])} {datasetDict['validation'].shape}")
print(f"datasetDict['test']: {type(datasetDict['test'])} {datasetDict['test'].shape}")

In [None]:
example = datasetDict['train'][0]
print(f"datasetDict['train'][0]: {type(example)} {example.keys()}\n{example}")

Create the label list and the id2label and label2id mappings.

In [None]:
"""
dataset 7_1000_125_125  ,  48 labels
dataset 7_128_18_54     ,  42 labels
dataset 8910_1087_68_204, 206 labels
dataset 11_1000         ,   6 labels
"""

In [None]:
labels = [label for label in datasetDict['train'].features.keys() if label not in ['id', 'text']]
labels.sort()
print(f"labels: {type(labels)} {len(labels)}\n{labels}")

In [None]:
id2label = {idx:label for idx, label in enumerate(labels)}
print(f"id2label: {type(id2label)} {len(id2label)}\n{id2label}")

In [None]:
label2id = {label:idx for idx, label in enumerate(labels)}
print(f"label2id: {type(label2id)} {len(label2id)}\n{label2id}")

Load tokenizer and model

In [None]:
model_name = "allenai/longformer-base-4096"

In [None]:
tokenizer = LongformerTokenizerFast.from_pretrained(model_name)

In [None]:
model = LongformerForSequenceClassification.from_pretrained(
    model_name,
    num_labels        = len(labels),
#    num_hidden_layers = hidden_layers,
    problem_type      = 'multi_label_classification')

In [None]:
# Configure attention window size
model.config.attention_window = 512

In [None]:
optimizer = AdamW(model.parameters(), lr=learning_rate)

Tokenize ('input_ids' and 'attention_mask'), add 'global_attention_mask' (for Longformer), add 'labels'

In [None]:
def preprocess_data(examples, indices):
  """Turn a batch of raw examples into model inputs for multi-label Longformer training.

  Args:
    examples: batch as a mapping of column name -> list of values; must contain 'text'
              and may contain one column per entry of the global `labels` list.
    indices : row indices of the batch (not used by this function).

  Returns:
    The tokenizer's encoding ('input_ids', 'attention_mask') extended with
    'global_attention_mask' and a float32 'labels' matrix of shape (batch, len(labels)).
  """
  batch_texts = examples['text']

  # Tokenize: every sequence is truncated/padded to exactly max_length tokens
  encoding = tokenizer(
      batch_texts,
      truncation     = True,
      padding        = 'max_length',
      max_length     = max_length,
      return_tensors = 'pt'
  )

  # Longformer global attention: enabled on the first position of each sequence only
  first_position_only               = torch.zeros_like(encoding['input_ids'])
  first_position_only[:, 0]         = 1
  encoding['global_attention_mask'] = first_position_only

  # Multi-hot targets: one column per label, 1.0 where the example's label value is truthy
  labels_matrix = torch.zeros((len(batch_texts), len(labels)), dtype=torch.float32)
  for column, label_name in enumerate(labels):
    if label_name not in examples:
      continue                        # label absent from this batch: column stays at zero
    column_values = [1.0 if flag else 0.0 for flag in examples[label_name]]
    labels_matrix[:, column] = torch.tensor(column_values, dtype=torch.float32)
  print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")

  encoding['labels'] = labels_matrix
  print(f"encoding['labels']: {type(encoding['labels'])} {encoding['labels'].shape}")

  return encoding

Create the 3 encoded datasets, train, validation and test

In [None]:
encoded_dataset = datasetDict.map(
    preprocess_data,
    batched        = True,
    remove_columns = datasetDict['train'].column_names,
    with_indices   = True
)
# Bind the three splits here: the statements below use them, and on a fresh kernel
# they would otherwise be undefined (NameError) at this point of the notebook.
train_dataset      = encoded_dataset['train']
validation_dataset = encoded_dataset['validation']
test_dataset       = encoded_dataset['test']

# The 'labels' column is a tensor only once the torch format is set; before that it has no
# .tolist(), so fall back to list() to make this cell independent of the execution order.
train_labels               = train_dataset['labels']
train_labels_list_of_lists = train_labels.tolist() if hasattr(train_labels, 'tolist') else list(train_labels)
print("=============================================")
print(f"encoded_dataset: {type(encoded_dataset)} {encoded_dataset.shape}\n{encoded_dataset}")
print(f"train_dataset: {type(train_dataset)} {train_dataset.shape} {train_dataset.features}\n{train_dataset}")
print(f"validation_dataset: {type(validation_dataset)} {validation_dataset.shape} {validation_dataset.features}")
print(f"test_dataset: {type(test_dataset)} {test_dataset.shape} {test_dataset.features}")
print("---")
print(f"train_dataset['labels']: {type(train_dataset['labels'])} len={len(train_dataset['labels'])}\n{train_dataset['labels']}")
print("---")
print(f"train_dataset[0]['input_ids']: {type(train_dataset[0]['input_ids'])} {len(train_dataset[0]['input_ids'])}\n{train_dataset['input_ids'][0]}")
print(f"train_dataset[0]['attention_mask']: {type(train_dataset[0]['attention_mask'])} {len(train_dataset[0]['attention_mask'])}\n{train_dataset['attention_mask'][0]}")
print(f"train_dataset[0]['global_attention_mask']: {type(train_dataset[0]['global_attention_mask'])} {len(train_dataset[0]['global_attention_mask'])}\n{train_dataset['global_attention_mask'][0]}")
print(f"train_dataset[0]['labels']: {type(train_dataset[0]['labels'])} {len(train_dataset[0]['labels'])} {train_dataset[0]['labels']}")
print(f"train_dataset['labels'][0]: {type(train_dataset['labels'][0])} {len(train_dataset['labels'][0])}\n{train_dataset['labels'][0]}")

In [None]:
encoded_dataset.set_format('torch')
train_dataset      = encoded_dataset['train']
validation_dataset = encoded_dataset['validation']
test_dataset       = encoded_dataset['test']
print(f"train_dataset_tensor: {type(train_dataset)} {train_dataset.shape} {train_dataset.features}\n{train_dataset}")
print(f"train_dataset_tensor['input_ids']:             {type(train_dataset['input_ids'])}             len={len(train_dataset['input_ids'])}             shape={train_dataset['input_ids'].shape}            ") #\n{train_dataset['input_ids']}")
print(f"train_dataset_tensor['attention_mask']:        {type(train_dataset['attention_mask'])}        len={len(train_dataset['attention_mask'])}        shape={train_dataset['attention_mask'].shape}       ") #\n{train_dataset['attention_mask']}")
print(f"train_dataset_tensor['global_attention_mask']: {type(train_dataset['global_attention_mask'])} len={len(train_dataset['global_attention_mask'])} shape={train_dataset['global_attention_mask'].shape}") #\n{train_dataset['global_attention_mask']}")
print(f"train_dataset_tensor['labels']:                {type(train_dataset['labels'])}                len={len(train_dataset['labels'])}                shape={train_dataset['labels'].shape}               ") #\n{train_dataset['labels']}")

Truncated part

In [None]:
def get_truncated_part(text):
  """Print a text next to the part of it that is kept after tokenizing to max_length.

  Debug helper: it prints the tokenizer output keys, the kept token ids, the original and
  the decoded (kept) text, and the token counts of both. Nothing is returned.

  Args:
    text: a single raw text.
  """
  encoded = tokenizer(
      text,
      truncation                = True,
      padding                   = 'max_length',
      max_length                = max_length,
      return_overflowing_tokens = True,
      return_tensors            = None
      )
  print(f"tokens.keys(): {encoded.keys()}")

  # Entry 0 of 'input_ids' -- presumably the first max_length window of the text (TODO confirm)
  kept_ids = encoded["input_ids"][0]
  print(f"truncated_ids: {type(kept_ids)} {kept_ids}")

  # Back to text, dropping the special tokens
  kept_text = tokenizer.decode(kept_ids, skip_special_tokens=True)

  print(f"original_text :\n{text}")
  print(f"truncated_text:\n{kept_text}")

  # Compare the sizes in tokens of the original and of the kept text
  original_count = len(tokenizer.tokenize(text))
  kept_count     = len(tokenizer.tokenize(kept_text))

  print(f"original_tokens count : {original_count}")
  print(f"truncated_tokens count: {kept_count}")

In [None]:
example_text = datasetDict['train'][0]['text']
#get_truncated_part(example_text)

In [None]:
inputs = tokenizer(
    example_text,
    truncation     = True,
    padding        = 'max_length',
    max_length     = max_length,
    return_tensors = 'pt'
)

inputs: <class 'transformers.tokenization_utils_base.BatchEncoding'> dict_keys(['input_ids', 'attention_mask'])
  {'input_ids': tensor([[
  'attention_mask': tensor([[
print(f"inputs: {type(inputs)} {inputs.keys()}") #\n{inputs}")
print(f"inputs_ids: {type(inputs.input_ids)} {inputs.input_ids.shape}\n{inputs.input_ids}")
print(f"attention_mask: {type(inputs.attention_mask)} {inputs.attention_mask.shape}\n{inputs.attention_mask}")
print(f"labels: {inputs.labels.shape}")

4. Forward pass for multi-label classification

In [None]:
outputs = model(
    input_ids      = inputs.input_ids,
    attention_mask = inputs.attention_mask
    )

In [None]:
print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

In [None]:
# Logits (= raw model outputs)
logits = outputs.logits
print(f"logits: {type(logits)} {logits.shape}\n{logits}")

In [None]:
# Convert logits to probabilities
sigmoid = torch.nn.Sigmoid()
probs   = sigmoid(logits)
print(f"probs: {type(probs)} {probs.shape}\n{probs}")

In [None]:
example = encoded_dataset['train'][0]

In [None]:
print(f"example: {type(example)} {example.keys()}\n{example}")
print()
#print(f"example['input_ids']: {type(example['input_ids'])} {len(example['input_ids'])}\n{example['input_ids']}")
#print(f"example['attention_mask']: {type(example['attention_mask'])} {len(example['attention_mask'])}\n{example['attention_mask']}")
#print(f"example['labels']:  {type(example['labels'])} {len(example['labels'])}\n{example['labels']}")

In [None]:
tokenizer.decode(example['input_ids'])

In [None]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

Set the PyTorch format to ensure correctness and compatibility with PyTorch pipelines

In [None]:
# The 3 Hugging Face Dataset are formatted as PyTorch Dataset
encoded_dataset.set_format('torch')

In [None]:
batch_size  = batch_size
metric_name = "f1"

In [None]:
training_args = TrainingArguments(
    output_dir                  = './training_results',  # where model predictions and checkpoints will be written during training
    overwrite_output_dir        = True,
    logging_dir                 = './logs',
    logging_steps               = 50,
    save_steps                  = 500,
    save_total_limit            = 2,
    eval_strategy               = 'epoch',
    save_strategy               = 'epoch',
    learning_rate               = learning_rate,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    num_train_epochs            = epochs,
    weight_decay                = 0.01,
    load_best_model_at_end      = True,
    metric_for_best_model       = metric_name,
    run_name                    = run_name,
    fp16                        = fp16,
    #dataloader_num_workers      = dataloader_num_workers,
    report_to                  = 'wandb'
    )

Metrics
  source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/

In [None]:
def multi_label_metrics(predictions, labels):
    """Compute the multi-label evaluation metrics from raw logits.

    Args:
        predictions: logits, shape (batch_size, num_labels).
        labels     : ground-truth multi-hot labels, same shape as `predictions`.

    Returns:
        dict with 'f1', 'precision', 'recall', 'roc_auc', 'precision_recall_auc'
        (all averaged with `averaging`) and 'accuracy'.
    """
    averaging = 'micro'    # 'micro' or 'weighted'

    # Logits -> per-label probabilities
    probs = torch.nn.Sigmoid()(torch.Tensor(predictions))

    # Probabilities -> 0/1 decisions using the global `threshold`
    y_pred = np.where((probs >= threshold).numpy(), 1.0, 0.0)

    # Arguments shared by the threshold-based and by the score-based metrics
    decided = dict(y_true=labels, y_pred=y_pred)
    scored  = dict(y_true=labels, y_score=probs)

    return {
        'f1'                  : f1_score(average=averaging, **decided),
        'precision'           : precision_score(average=averaging, **decided),
        'recall'              : recall_score(average=averaging, **decided),
        'roc_auc'             : roc_auc_score(average=averaging, **scored),
        'precision_recall_auc': average_precision_score(average=averaging, **scored),
        'accuracy'            : accuracy_score(**decided)
        }

In [None]:
def compute_metrics(p: EvalPrediction):
    """Adapter passed to the Trainer: extract the logits and delegate to multi_label_metrics.

    Args:
        p: evaluation output exposing `predictions` and `label_ids`.

    Returns:
        The metrics dictionary produced by multi_label_metrics.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        # When the predictions come as a tuple, its first element is used
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

In [None]:
"""Let's verify a batch as well as a forward pass:"""

In [None]:
print(f"input_ids:              {type(encoded_dataset['train']['input_ids'][0])}\t{encoded_dataset['train']['input_ids'][0].shape}")
print(f"attention_mask:         {type(encoded_dataset['train']['attention_mask'][0])}\t{encoded_dataset['train']['attention_mask'][0].shape}")
print(f"global_attention_mask:  {type(encoded_dataset['train']['global_attention_mask'][0])}\t{encoded_dataset['train']['global_attention_mask'][0].shape}")
print(f"labels:                 {type(encoded_dataset['train'][0]['labels'])}\t{encoded_dataset['train'][0]['labels'].shape}")

Execute a forward pass for debugging or verification purposes (cf. BERT_3_1 in Notion BERT database)

In [None]:
outputs = model(
    input_ids      = encoded_dataset['train']['input_ids'][0].unsqueeze(0),
    attention_mask = encoded_dataset['train']['attention_mask'][0].unsqueeze(0),
    labels         = encoded_dataset['train'][0]['labels'].unsqueeze(0)
    )

In [None]:
print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

In [None]:
"""# Define the weighted loss function

class_weights = torch.tensor([7.68, 2.15, 0.61, 0.47, 0.68, 6.26], dtype=torch.float32).to(device)
loss_fn       = BCEWithLogitsLoss(pos_weight=class_weights)  # For multi-label classification (binary classification per label)

## Class supports, class weigths, weighted loss function

Reminder:
*   df_jobs      : <class 'pandas.core.frame.DataFrame'>
*   df_jobs['id']: <class 'pandas.core.series.Series'>

dataset = Dataset.from_pandas(df_jobs)
*   dataset      : <class 'datasets.arrow_dataset.Dataset'>
*   dataset['id']: <class 'list'>

*   dataset_dict_jobs : <class 'datasets.dataset_dict.DatasetDict'>
*   train_dataset     : <class 'datasets.arrow_dataset.Dataset'>
*   validation_dataset: <class 'datasets.arrow_dataset.Dataset'>
*   test_dataset      : <class 'datasets.arrow_dataset.Dataset'>


We calculate the class supports for the train, validation and test datasets; the class weights and the weighted loss function are used for training only; the class supports of validation_dataset and test_dataset are calculated for information only.

def get_train_class_weights(datasetDict, labels):
  print(f"datasetDict: {type(datasetDict)} shape={datasetDict.shape}\n{datasetDict}")
  print(f"labels: {type(labels)} len={len(labels)}\n{labels}")

  dataset_train      = datasetDict['train']
  dataset_validation = datasetDict['validation']
  dataset_test       = datasetDict['test']

  def calculate_class_supports(dataset, labels):
    class_supports = dataset.map(
        lambda example: {col: example[col] for col in labels},
        batched=True
    ).to_pandas()[labels].sum(axis=0)
    return class_supports

  class_supports = {}

  for split_name, split_dataset in datasetDict.items():
    class_supports[split_name] = calculate_class_supports(split_dataset, labels)

  for split_name, split_class_supports in class_supports.items():
    print(f"{split_name}: {type(split_class_supports)} len={len(split_class_supports)}\n{split_class_supports}")

  train_class_supports_list = class_supports['train'].tolist()
  print(f"train_class_supports_list: {type(train_class_supports_list)} len={len(train_class_supports_list)} {train_class_supports_list}")

  train_class_supports_tensor = torch.tensor(train_class_supports_list, dtype=torch.float32)
  print(f"train_class_supports_tensor: {type(train_class_supports_tensor)} len={len(train_class_supports_tensor)} {train_class_supports_tensor}")

  train_total_samples = dataset_train.num_rows
  print(f"train_total_samples: {train_total_samples}")

  number_of_classes = len(labels)
  print(f"number_of_classes: {number_of_classes}")

  train_class_weights = train_total_samples / (number_of_classes * train_class_supports_tensor)
  print(f"train_class_weights: {type(train_class_weights)} len={len(train_class_weights)} {train_class_weights}")

  train_class_weights_sum = train_class_weights.sum()
  print(f"train_class_weights_sum: {train_class_weights_sum}")

  normalized_train_class_weights = (train_class_weights / train_class_weights_sum) * number_of_classes
  print(f"normalized_train_class_weights: {type(normalized_train_class_weights)} len={len(normalized_train_class_weights)} {normalized_train_class_weights}")

  # Positives samples per label
  supports = train_class_supports_tensor
  print(f"supports: {type(supports)} {len(supports)} {supports}")

  # Negatives samples per label
  negatives = train_total_samples - supports
  print(f"negatives: {type(negatives)} {len(negatives)} {negatives}")

  # pos_weights = negative to positive ratios
  pos_weights = negatives/supports
  print(f"pos_weights: {type(pos_weights)} {len(pos_weights)} {pos_weights}")

  # Normalize using min-max scaling
  normalized_pos_weights_minmax = (pos_weights - pos_weights.min()) / (pos_weights.max() - pos_weights.min())
  print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

  # Normalize using z-score standardization
  normalized_pos_weights_zscore = (pos_weights - pos_weights.mean()) / pos_weights.std()
  print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

  # Normalize using min-max scaling
  normalized_pos_weights_minmax = (pos_weights - pos_weights.min()) / (pos_weights.max() - pos_weights.min())
  print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

  # Normalize using z-score standardization
  normalized_pos_weights_zscore = (pos_weights - pos_weights.mean()) / pos_weights.std()
  print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

  # Normalize using sum-to-one
  normalized_pos_weights_sum1 = pos_weights / pos_weights.sum()
  print(f"normalized_pos_weights_sum1: {type(normalized_pos_weights_sum1)} {len(normalized_pos_weights_sum1)} {normalized_pos_weights_sum1}")

  return normalized_pos_weights_minmax
  #return normalized_pos_weights_zscore
  #return normalized_pos_weights_sum1

pos_weights = get_train_class_weights(datasetDict, labels)

loss_fn = BCEWithLogitsLoss(pos_weight=pos_weights.to(device))  # For multi-label classification (binary classification per label)
print(f"loss_fn: {type(loss_fn)} {loss_fn}")
"""

In [None]:
def get_class_weights(labels=None):
  """Compute per-label positive weights (negatives / positives), normalized to sum to 1.

  The raw weights and two alternative normalizations (min-max, z-score) are printed for
  information; only the sum-to-one normalization is returned.

  Args:
    labels: 2-D multi-hot tensor, one row per sample and one column per label.
            Defaults to encoded_dataset['train']['labels'], looked up at call time.

  Returns:
    1-D tensor with one weight per label, summing to 1.
  """
  # Resolve the default lazily: a default evaluated in the signature would be frozen at
  # definition time and would not follow a re-created or re-formatted encoded_dataset.
  if labels is None:
    labels = encoded_dataset['train']['labels']
  print(f"labels: {type(labels)} len={len(labels)} shape={labels.shape}\n{labels}")

  num_samples, num_labels = labels.shape
  print(f"num_samples: {type(num_samples)} {num_samples}")
  print(f"num_labels:  {type(num_labels)}  {num_labels}")

  # Number of positive samples per label
  class_counts = labels.sum(dim=0)
  print(f"class_counts: {type(class_counts)} len={len(class_counts)}\n{class_counts}")

  # negatives / positives; a label without any positive sample would divide by zero
  # (inf weight, then NaN after normalization), so the denominator is floored at 1
  pos_weights = (num_samples-class_counts) / class_counts.clamp(min=1)
  print(f"pos_weights: {type(pos_weights)} len={len(pos_weights)}\n{pos_weights}")

  normalized_pos_weights_minmax = (pos_weights - pos_weights.min()) / (pos_weights.max() - pos_weights.min())
  print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

  normalized_pos_weights_zscore = (pos_weights - pos_weights.mean()) / pos_weights.std()
  print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

  normalized_pos_weights_sum1 = pos_weights / pos_weights.sum()
  print(f"normalized_pos_weights_sum1: {type(normalized_pos_weights_sum1)} {len(normalized_pos_weights_sum1)} {normalized_pos_weights_sum1}")

  #return pos_weights
  #return normalized_pos_weights_minmax
  #return normalized_pos_weights_zscore
  return normalized_pos_weights_sum1

In [None]:
pos_weights = get_class_weights()

In [None]:
loss_fn = BCEWithLogitsLoss(pos_weight=pos_weights.to(device))

raise Exception("Stop here")

In [None]:
class FocalLoss(Module):
  """
  Focal loss for binary / multi-label targets, built on element-wise binary cross-entropy.

  For each element: loss = alpha * (1 - pt)**gamma * BCE, with pt = exp(-BCE).
  Elements whose BCE is already small (pt close to 1) are down-weighted by (1 - pt)**gamma.

  Args:
    alpha : Constant factor applied to every element (same value for positive and negative targets).
    gamma : Exponent of the (1 - pt) modulating factor.
    logits: True if `inputs` are raw logits, False if they are probabilities.
    reduce: True to return the mean over all elements, False to return the element-wise loss tensor.

  Instances are invoked through torch.nn.Module.__call__ (no custom __call__), so
  forward/backward hooks registered on the module are honoured.
  """
  def __init__(self, alpha=1.0, gamma=2.0, logits=False, reduce=True):
    super(FocalLoss, self).__init__()
    self.alpha   = alpha
    self.gamma   = gamma
    self.logits  = logits  # Whether forward() receives logits (True) or probabilities (False)
    self.reduce  = reduce

  def forward(self, inputs, targets):
    """
    Compute the focal loss.

    Args:
      inputs : Model predictions, tensor of shape (batch_size, num_classes).
      targets: Ground-truth labels, tensor of the same shape as `inputs`.

    Returns:
      Scalar tensor (mean) if self.reduce, otherwise a tensor shaped like `inputs`.
    """
    # The BCE functions require floating-point targets; integer multi-hot labels are cast
    targets = targets.to(inputs.dtype)

    if self.logits:
      bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
    else:
      bce_loss = F.binary_cross_entropy(inputs, targets, reduction='none')

    # exp(-BCE) recovers the probability assigned to the true class of each element
    pt         = torch.exp(-bce_loss)
    focal_loss = self.alpha * (1-pt)**self.gamma * bce_loss

    return torch.mean(focal_loss) if self.reduce else focal_loss

  def __repr__(self):
    # str() falls back to __repr__, so a separate __str__ is not needed
    return f"FocalLoss(alpha={self.alpha}, gamma={self.gamma}, logits={self.logits}, reduce={self.reduce})"

In [None]:
# Focal loss instance used for training: takes raw logits (logits=True) and returns the mean over all elements (reduce=True)
focal_loss_fn = FocalLoss(alpha=0.5, gamma=4.0, logits=True, reduce=True)
print(f"focal_loss_fn: {type(focal_loss_fn)} {focal_loss_fn}")

CustomTrainer

In [None]:
class CustomTrainer(Trainer):
  """
  Trainer whose loss is computed by a pluggable loss function instead of the model's own loss.

  Args:
    *args, **kwargs: Forwarded unchanged to transformers.Trainer.
    loss_fn        : Callable invoked as loss_fn(logits, labels). When None, an unweighted
                     BCEWithLogitsLoss is used.
  """

  def __init__(self, *args, loss_fn=None, **kwargs):
    super().__init__(*args, **kwargs)
    self.loss_fn = loss_fn

  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """
    Forward the batch through the model and compute the loss from its logits.

    Args:
      model             : The model being trained / evaluated.
      inputs            : Batch dict; its 'labels' entry (shape: batch_size, num_labels) is removed
                          from the dict and used only for the loss.
      return_outputs    : If True, return (loss, outputs) instead of the loss alone.
      num_items_in_batch: Accepted for signature compatibility with Trainer; not used here.

    Returns:
      loss, or (loss, outputs) when return_outputs is True.
      When the batch has no 'labels', the raw model outputs are returned instead.
    """
    # No print in this method: it runs once per batch and the accumulated output exhausted memory.
    labels = inputs.pop('labels', None)

    # Labels are needed for the loss only, not by the model: leaving them out of the forward
    # pass avoids computing the model's built-in loss just to discard it.
    outputs = model(**inputs)
    logits  = outputs.logits  # shape: (batch_size, num_labels)

    if labels is None:
      # No labels: nothing to compute a loss from, hand back the outputs (inference)
      return outputs

    if self.loss_fn is not None:
      loss = self.loss_fn(logits, labels)
    else:
      # Default loss: unweighted BCEWithLogitsLoss, which requires floating-point targets
      loss = BCEWithLogitsLoss()(logits, labels.to(logits.dtype))

    return (loss, outputs) if return_outputs else loss

Create a Hugging Face `transformers` Trainer (which abstracts the training loop)

In [None]:
# CustomTrainer is used instead of the stock Trainer so that the loss comes from focal_loss_fn
trainer = CustomTrainer(
    # (replace CustomTrainer with Trainer and drop loss_fn to use the stock Trainer)
    model           = model,
    args            = training_args,
    train_dataset   = encoded_dataset["train"],
    eval_dataset    = encoded_dataset["validation"],
    compute_metrics = compute_metrics,                # Optional: custom metrics function
    loss_fn         = focal_loss_fn,                  # Loss applied to the logits in CustomTrainer.compute_loss
)

Train

In [None]:
# Run the training loop and keep its return value, which is printed here and dumped to JSON in the next cell
trainer_train = trainer.train()
print(f"trainer_train: {type(trainer_train)} len={len(trainer_train)}\n{trainer_train}")

In [None]:
# Persist the training output as JSON in the working directory (uploaded to the Hub further down)
file_path = "trainer_train.json"
with open(file_path, "w") as f:
  f.write(json.dumps(trainer_train))
print(f"Train output successfully saved to {file_path}.")

In [None]:
# Progress marker for the notebook log
print("Training successfully completed.")

In [None]:
"""## Upload model, tokenizer, train results, evaluate results"""

Save model to /content

In [None]:
# Save the trained model and the tokenizer side by side in one local directory
model_path = "model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

Upload model and tokenizer to HF repo_id_model

In [None]:
# Reload both artifacts from the local directory; from here on `tokenizer` and `model`
# refer to the reloaded objects (the trainer keeps its own reference to the model it trained)
tokenizer = LongformerTokenizerFast.from_pretrained(model_path)
model     = LongformerForSequenceClassification.from_pretrained(model_path)

In [None]:
# Publish tokenizer and model to the Hub repository named by repo_id_model
tokenizer.push_to_hub(repo_id_model)
model.push_to_hub(repo_id_model)

Upload Train results to HF repo_id_dataset

In [None]:
# Train
upload_file(
    path_or_fileobj = 'trainer_train.json',
    path_in_repo    = 'trainer_train.json',
    repo_id         = HF_name,
    repo_type       = 'dataset'
)

In [None]:
"""To Get Results of Evaluation and Test"""

In [None]:
def get_results(model, dataset, batch_size, threshold, phase):
  """
  Run `model` over `dataset`, threshold the sigmoid probabilities and save predictions and metrics.

  Two files are written to the current working directory:
    {phase}_results.csv  : one row per sample with its predictions, probabilities and true labels
    {phase}_metrics.json : the classification report and the micro-averaged ROC AUC

  Args:
    model     : Model called as model(**batch); its output must expose `.logits`.
    dataset   : Dataset whose batches are dicts of tensors that include a 'labels' entry.
    batch_size: Batch size for the DataLoader.
    threshold : A probability strictly greater than this value counts as a positive prediction.
    phase     : Prefix of the two output file names.

  The module-level `labels` list provides the target names of the classification report.
  """
  # Free cached GPU memory, then switch off dropout and other training-only behaviour
  torch.cuda.empty_cache()
  model.eval()

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  to_probs     = torch.nn.Sigmoid()
  prob_chunks  = []
  pred_chunks  = []
  label_chunks = []

  for batch in tqdm(DataLoader(dataset, batch_size=batch_size, shuffle=False)):
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
      logits = model(**batch).logits

    # logits -> probabilities -> 0/1 predictions, all as NumPy arrays
    batch_probs = to_probs(logits).cpu().numpy()
    prob_chunks.append(batch_probs)
    pred_chunks.append((batch_probs > threshold).astype(int))
    label_chunks.append(batch['labels'].cpu().numpy())

  # Stitch the per-batch arrays together; each result has shape [num_samples, num_labels]
  all_probs       = np.concatenate(prob_chunks, axis=0)
  all_preds       = np.concatenate(pred_chunks, axis=0)
  all_true_labels = np.concatenate(label_chunks, axis=0)

  print(f"all_probs:       {type(all_probs)} {all_probs.shape}")
  print(f"all_preds:       {type(all_preds)} {all_preds.shape}")
  print(f"all_true_labels: {type(all_true_labels)} {all_true_labels.shape}")

  results_file_path = f"{phase}_results.csv"
  pd.DataFrame({
      'predictions'  : list(map(list, all_preds)),
      'probabilities': list(map(list, all_probs)),
      'true_labels'  : list(map(list, all_true_labels))
  }).to_csv(results_file_path, index=False)

  # Per-label precision, recall and F1 score
  report = classification_report(
      y_true        = all_true_labels,
      y_pred        = all_preds,
      target_names  = labels,
      zero_division = 0,
      output_dict   = True
  )
  print(f"Classification Report:\n{report}")

  # Micro-averaged ROC AUC over all labels
  roc_auc = roc_auc_score(
      y_true  = all_true_labels,
      y_score = all_probs,
      average = 'micro'
  )
  print(f"ROC AUC: {roc_auc}")

  metrics_file_path = f"{phase}_metrics.json"
  with open(metrics_file_path, "w") as metrics_file:
    json.dump(
        {
            'classification_report': report,
            'roc_auc'              : roc_auc
        },
        metrics_file,
        indent=4
    )

  print(f"{phase} Results Saved to {results_file_path}")
  print(f"{phase} Metrics Saved to {metrics_file_path}")

In [None]:
def get_results_with_threshold_tuning(model, dataset, batch_size, tune_thresholds=False, average='micro', phase='eval'):
  """
  Evaluates a model on a given dataset and optionally tunes thresholds for multi-label classification.

  Two files are written to the current working directory: {phase}_results.csv (predictions,
  probabilities and true labels per sample) and {phase}_metrics.json (the returned metrics).
  The module-level `labels` list provides the target names of the classification report.

  Args:
    model          : The trained model to evaluate.
    dataset        : The dataset to evaluate (validation or test).
    batch_size     : Batch size for DataLoader.
    tune_thresholds: Whether to tune thresholds per label (default = False).
                     When True, each label gets the threshold that maximizes its F1 score on `dataset`.
    average        : The averaging method for ROC AUC and average precision
                     ('micro', 'macro', 'weighted', etc.; default = 'micro').
    phase          : The phase (eval or test) for saving results (default = 'eval').

  Returns:
    A dictionary with metrics and optionally tuned thresholds.
  """

  # Clear GPU cache
  torch.cuda.empty_cache()

  # Set the model to eval mode to disable dropout and other training-specific behaviors
  model.eval()

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  # Create DataLoader
  dataLoader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

  all_probs       = []
  all_true_labels = []

  for batch in tqdm(dataLoader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
      outputs = model(**batch)

    # Accumulate inside the loop so that every batch (not only the last one) is kept
    all_probs.append(torch.sigmoid(outputs.logits).cpu().numpy())  # logits -> probabilities (NumPy)
    all_true_labels.append(batch['labels'].cpu().numpy())

  # Concatenate results from all batches
  all_probs       = np.concatenate(all_probs, axis=0)        # shape: [num_samples, num_labels]
  all_true_labels = np.concatenate(all_true_labels, axis=0)  # shape: [num_samples, num_labels]

  # Initialize thresholds (default = 0.5)
  thresholds = np.full(all_probs.shape[1], 0.5)  # shape: [num_labels]

  if tune_thresholds:
    # Tune thresholds for each label
    print(f"Tuning thresholds for {phase} phase")
    for label_idx in range(all_probs.shape[1]):
      precision, recall, thresholds_label = precision_recall_curve(all_true_labels[:, label_idx], all_probs[:, label_idx])
      # precision and recall have one more element than thresholds_label (a final point with
      # no associated threshold); drop it so that argmax always indexes a real threshold.
      precision = precision[:-1]
      recall    = recall[:-1]
      f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)  # Avoid division by zero
      thresholds[label_idx] = thresholds_label[np.argmax(f1_scores)]      # Best threshold for this label

    # precision_recall_curve scores a threshold with "probability >= threshold",
    # so the tuned thresholds are applied with the same inclusive comparison.
    all_preds = (all_probs >= thresholds).astype(int)
  else:
    # Default 0.5 threshold: strict comparison
    all_preds = (all_probs > thresholds).astype(int)

  # Save predictions, probabilities, and true labels to a DataFrame
  results_df = pd.DataFrame({
      'predictions'  : [list(pred)  for pred  in all_preds],
      'probabilities': [list(prob)  for prob  in all_probs],
      'true_labels'  : [list(label) for label in all_true_labels]
  })
  results_file_path = f"{phase}_results.csv"
  results_df.to_csv(results_file_path, index=False)

  # Compute metrics
  print(f"Computing metrics for {phase} phase")
  classification_report_dict = classification_report(
      y_true        = all_true_labels,
      y_pred        = all_preds,
      target_names  = labels,
      zero_division = 0,
      output_dict   = True
  )

  # Compute roc_auc
  roc_auc = roc_auc_score(
      y_true  = all_true_labels,
      y_score = all_probs,
      average = average
  )

  # Compute precision_recall_auc (average precision)
  precision_recall_auc = average_precision_score(all_true_labels, all_probs, average=average)

  metrics = {
      'classification_report': classification_report_dict,
      'roc_auc'              : roc_auc,
      'precision_recall_auc' : precision_recall_auc,
      'thresholds'           : thresholds.tolist() if tune_thresholds else "Default (0.5)",  # Convert numpy array to list
  }

  metrics_file_path = f"{phase}_metrics.json"
  with open(metrics_file_path, "w") as f:
    json.dump(metrics, f, indent=4)

  print(f"{phase} Results Saved to {results_file_path}")
  print(f"{phase} Metrics Saved to {metrics_file_path}")

  return metrics


In [None]:
"""## Evaluate

After training, we evaluate our model on the validation set.
"""

First evaluation results (custom `get_results` loop)

In [None]:
# File-name prefix for the validation results/metrics files written by get_results
phase_evaluate = 'evaluate_model_eval'

In [None]:
# Custom evaluation loop on the validation set; writes
# {phase_evaluate}_results.csv and {phase_evaluate}_metrics.json
get_results(
    model      = model,
    dataset    = validation_dataset,
    batch_size = batch_size,
    threshold  = threshold,
    phase      = phase_evaluate
)

In [None]:
# Progress marker for the notebook log
print("First evaluation successfully completed.")

Second evaluation results (`trainer.evaluate()`)

In [None]:
# Evaluation through the Trainer API; no dataset is passed, so the trainer's own
# eval_dataset (encoded_dataset["validation"] at construction) is the one evaluated
trainer_evaluate = trainer.evaluate()
print(f"trainer_evaluate: {type(trainer_evaluate)} len={len(trainer_evaluate)}\n{trainer_evaluate}")

In [None]:
# Persist the Trainer evaluation output as JSON in the working directory
file_path = "trainer_evaluate.json"
with open(file_path, "w") as f:
  f.write(json.dumps(trainer_evaluate))
print(f"Evaluate output successfully saved to {file_path}.")

In [None]:
# Progress marker for the notebook log
print("Second evaluation successfully completed.")

Upload Evaluate Results to HF repo_id_dataset

In [None]:
# Upload the three evaluation artifacts to the Hub dataset repository, keeping their local file names.
# 1) Per-sample predictions / probabilities / true labels written by get_results
upload_file(
    path_or_fileobj = f"{phase_evaluate}_results.csv",
    path_in_repo    = f"{phase_evaluate}_results.csv",
    repo_id         = HF_name,
    repo_type       = 'dataset'
)
# 2) Classification report and ROC AUC written by get_results
upload_file(
    path_or_fileobj = f"{phase_evaluate}_metrics.json",
    path_in_repo    = f"{phase_evaluate}_metrics.json",
    repo_id         = HF_name,
    repo_type       = 'dataset'
)
# 3) Output of trainer.evaluate()
upload_file(
    path_or_fileobj = 'trainer_evaluate.json',
    path_in_repo    = 'trainer_evaluate.json',
    repo_id         = HF_name,
    repo_type       = 'dataset'
)

In [None]:
"""## Test"""

First test results

In [None]:
# File-name prefix for the test-set results/metrics files written by get_results
phase_test = 'test_model_eval'

In [None]:
# Same custom loop as for validation, now on the test set; writes
# {phase_test}_results.csv and {phase_test}_metrics.json
get_results(
    model      = model,
    dataset    = test_dataset,
    batch_size = batch_size,
    threshold  = threshold,
    phase      = phase_test
)

In [None]:
# Progress marker for the notebook log
print("First test successfully completed.")

Second test results

In [None]:
# Test-set inference through the Trainer API; the result bundles predictions, label_ids and metrics
trainer_predict = trainer.predict(test_dataset)
print(f"trainer_predict: {type(trainer_predict)} len={len(trainer_predict)}\n{trainer_predict}")
print(f"trainer_predict.predictions: {type(trainer_predict.predictions)} shape={trainer_predict.predictions.shape}")  # Model logits
print(f"trainer_predict.label_ids: {type(trainer_predict.label_ids)} shape={trainer_predict.label_ids.shape}")        # Ground truth labels
print(f"trainer_predict.metrics: {type(trainer_predict.metrics)} len={len(trainer_predict.metrics)}")

In [None]:
# NumPy arrays cannot be written by json.dump directly, so they are converted to nested lists first
trainer_predict_json_serializable = {
    'predictions': trainer_predict.predictions.tolist(),  # Convert Numpy array to list
    'label_ids'  : trainer_predict.label_ids.tolist(),    # Convert Numpy array to list
    'metrics'    : trainer_predict.metrics                # Dictionary is already serializable
}

In [None]:
# Persist the JSON-ready prediction output in the working directory
file_path = "trainer_predict.json"
with open(file_path, "w") as f:
  f.write(json.dumps(trainer_predict_json_serializable))
print(f"Test output successfully saved to {file_path}.")

In [None]:
# Progress marker for the notebook log
print("Second test successfully completed.")

Upload Test Results to HF repo_id_dataset

In [None]:
# Upload the three test artifacts to the Hub dataset repository, keeping their local file names.
# 1) Per-sample predictions / probabilities / true labels written by get_results
upload_file(
    path_or_fileobj = f"{phase_test}_results.csv",
    path_in_repo    = f"{phase_test}_results.csv",
    repo_id         = HF_name,
    repo_type       = 'dataset'
)
# 2) Classification report and ROC AUC written by get_results
upload_file(
    path_or_fileobj = f"{phase_test}_metrics.json",
    path_in_repo    = f"{phase_test}_metrics.json",
    repo_id         = HF_name,
    repo_type       = 'dataset'
)
# 3) Output of trainer.predict(test_dataset)
upload_file(
    path_or_fileobj = 'trainer_predict.json',
    path_in_repo    = 'trainer_predict.json',
    repo_id         = HF_name,
    repo_type       = 'dataset'
)

In [None]:
# Final progress marker: the last cell of the notebook
print("It's the end")