<a href="https://colab.research.google.com/github/claudelepere/ML_GitHub/blob/main/Trainer_02_2e_5_1024_FocalLoss_alpha05_gamma4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q accelerate
!pip install -q huggingface_hub
!pip install -q scikit-learn

# transformers and datasets are Hugging Face libraries
!pip install -q transformers datasets

!pip install -q wandb

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import json
import numpy as np
import os
import pandas as pd
import pickle
import sys
import time
import torch
import torch.nn.functional as F
import wandb

In [3]:
from datasets              import DatasetDict
from google.colab          import auth, drive, files, userdata
from huggingface_hub       import create_repo, login, upload_file, hf_hub_download
from huggingface_hub.utils import RepositoryNotFoundError
from sklearn.metrics       import accuracy_score, average_precision_score, classification_report, f1_score, precision_score, precision_recall_curve, precision_recall_fscore_support, recall_score, roc_auc_score
from torch.utils.data      import DataLoader
from tqdm.auto             import tqdm
from transformers          import AdamW, EvalPrediction, LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments
from torch.nn              import BCEWithLogitsLoss, Module

In [4]:
import os
from google.colab    import userdata
from huggingface_hub import login, hf_hub_download

In [5]:
# Hugging Face Authenticate
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")    # Store the key in os.environ
hf_token               = os.environ.get('HF_TOKEN')
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [6]:
# Verify
!huggingface-cli whoami

claudelepere


In [7]:
file_path = hf_hub_download(
    repo_id   ="claudelepere/skill_classification",
    repo_type = "dataset",
    filename  = "test_model_eval_results.csv"
)
print(f"file_path: {file_path}")  # /root/.cache/huggingface/hub/datasets--claudelepere--skill_classification/snapshots/51ead81f69b1689fc19694b3f034585cde9f56e1/test_model_eval_results.csv

test_model_eval_results.csv:   0%|          | 0.00/152k [00:00<?, ?B/s]

file_path: /root/.cache/huggingface/hub/datasets--claudelepere--skill_classification/snapshots/2337401f7b26cd3281321f7f99cbb58c11510b92/test_model_eval_results.csv


Next, open in Colab or download to local

In [8]:
"""
# Check the Python version
print(sys.version)
print()

# Get the installed packages (you can see that conda is not installed (do not install it))
!pip list
print()

# Check system information
!cat /etc/os-release
!uname -m
print()

# Check the GPU details (only if the runtime type is T4 GPU)
#!nvidia-smi
#print()

# Check RAM
!free -h
print()

# Check disk space
!df -h
print()

# Get environment variables
for key, value in os.environ.items():
    print(f"{key}: {value}")
"""
!python -V

Python 3.11.11


In [9]:
print(f"currentdir: {os.getcwd()}")

currentdir: /content


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

device: cuda


In [11]:
datasetDict_zip_file_name = "dataset_11_24000.zip"
datasetDict_dir_name      = os.path.splitext(datasetDict_zip_file_name)[0]
print(f"datasetDict_zip_file_name: {datasetDict_zip_file_name}")
print(f"datasetDict_dir_name     : {datasetDict_dir_name}")
print()

datasetDict_zip_file_name: dataset_11_24000.zip
datasetDict_dir_name     : dataset_11_24000



In [12]:
# OOM: reduce batch size
#      small sizes (1 to 32):            PROs: better generalization in some cases
#                                        CONs: may produce noisier gradients
#      large sizes (128, 256, or higer): PROs: gradients are smoother, leading to more stable training
#                                        CONs: poorer generalization (overfitting) in some cases
#      intermediate sizes (32, 64):      combines the benefits of small and large sizes
batch_size = 8

In [13]:
# OOM: enable gradient accumulation to compensate for smaller batch sizes by accumulating gradients over several steps
#      effective batch size = per-device batch size x gradient accumulation steps;
#      in each iteration, the model computes the gradients, these gradients are immediately used to update the model parameters
gradient_accumulation_steps = 4  #<<<<<<<<<<<<<<<<<<< gradient_accumulation_steps may not be None => comment it in TrainingArguments

In [14]:
# OOM: use PYTORCH_CUDA_ALLOC_CONF to handle memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [15]:
# OOM: check for zombie processes
if torch.cuda.is_available():
  !nvidia-smi
  torch.cuda.memory_summary()
!ps aux | grep python
!kill -9 <PID>
!nvidia-smi     # Checked if killed

Wed Jan 22 20:20:47 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              46W / 400W |      5MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [16]:
# OOM: use fp16 (half precision) mixed precision training
#      reduces memory requirements by up to 50%
fp16 = True

OOM: limit the number of GPU workers: 0 (default) or 1 in Colab
dataloader_num_workers = 1

In [17]:
# OOM: reduce model size or input tokens
#      1) LongformerTokenizer.from_pretrained('allenai/longformer-base/large-4096'): large/base: 435M/149M parameters
#      2) max_length: 4096 max for Longformer; 1 word can give several tokens, stop words are NOT discarded!
#         word_text_length_counts_sorted: jobs count                 : 50000
#                                         jobs count under  512 words: 44794  89.59%
#                                         jobs count under  640 words: 47894  95.79%
#                                         jobs count under  768 words: 49123  98.25%
#                                         jobs count under  896 words: 49691  99.38%
#                                         jobs count under 1024 words: 49917  99.83%
#                                         jobs count under 2048 words: 50000 100.00%
#                                         jobs count under 4096 words: 50000 100.00%
#max_length =  768    #      37 min    #
max_length = 1024    #      38 min    # GPU RAM: 12.2 / 40 GB
#max_length = 2048    # 1 hr 10 min    # GPU RAM: 21.4 / 40 GB
#max_length = 4096    # 2 hr 10 min    # GPU RAM: 39.5 / 40 GB => OutOfMemoryError

In [18]:
# OOM: free up GPU memory
torch.cuda.empty_cache()

In [19]:
# OOM: monitor GPU memory usage
!nividia-smi

/bin/bash: line 1: nividia-smi: command not found


In [20]:
# 1 epoch is a complete pass through the entire training dataset;
# with n datapoints and batch size = b, n/b iterations to complete 1 epoch;
# 1 iteration is a single update of the model's parameters
epochs = 5

In [21]:
# A common rule is to scale the learning rate proportionaly with the effective batch size
# note: get_linear_schedule_with_warmup <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
learning_rate = 2e-5  # 1e-5 x 32/8

Reduce the number of transformers layers
hidden_layers = 12    # 12 (default) or 6

In [22]:
# Threshold: 0.5 (default)
threshold = 0.2

In [23]:
if fp16:
  _fp = "fp16"
else:
  _fp = "fp32"

In [24]:
if 'gradient_accumulation_steps' not in globals():
  run_name = f"Longformer-multilabel-{datasetDict_dir_name}-length{max_length}-batch{batch_size}-epochs{epochs}-lr{learning_rate}-{_fp}-threshold{threshold}"
else:
  run_name = f"Longformer-multilabel-{datasetDict_dir_name}-length{max_length}-batch{batch_size}x{gradient_accumulation_steps}-epochs{epochs}-lr{learning_rate}-{_fp}-threshold{threshold}"
print(f"run_name                 : {run_name}")

run_name                 : Longformer-multilabel-dataset_11_24000-length1024-batch8x4-epochs5-lr2e-05-fp16-threshold0.2


In [25]:
def upload_unzip_dataset(file_name=datasetDict_zip_file_name):
  # Check if the file exists
  if not os.path.exists(file_name):
    print(f"'{file_name}' not found in /content. Uploading...")
    uploaded_files = files.upload()                              # Prompt file upload dialog
    if file_name not in uploaded_files:
      raise FileNotFoundError(f"'{file_name}' was not uploaded. Please try again.")
    print(f"'{file_name}' successfully uploaded to /content")
    uploaded_file_name = list(uploaded_files.keys())[0]          # Get the name of the uploaded file

    !unzip {uploaded_file_name}

    unzipped_dir_name = os.path.splitext(uploaded_file_name)[0]
    assert unzipped_dir_name==datasetDict_dir_name, "unzipped_dir_name != datasetDict_dir_name"
  else:
    print(f"'{datasetDict_dir_name}' already exists in /content.")

In [26]:
upload_unzip_dataset(datasetDict_zip_file_name)

'dataset_11_24000.zip' not found in /content. Uploading...


Saving dataset_11_24000.zip to dataset_11_24000.zip
'dataset_11_24000.zip' successfully uploaded to /content
Archive:  dataset_11_24000.zip
  inflating: dataset_11_24000/dataset_dict.json  
  inflating: dataset_11_24000/test/data-00000-of-00001.arrow  
  inflating: dataset_11_24000/test/dataset_info.json  
  inflating: dataset_11_24000/test/state.json  
  inflating: dataset_11_24000/train/data-00000-of-00001.arrow  
  inflating: dataset_11_24000/train/dataset_info.json  
  inflating: dataset_11_24000/train/state.json  
  inflating: dataset_11_24000/validation/data-00000-of-00001.arrow  
  inflating: dataset_11_24000/validation/dataset_info.json  
  inflating: dataset_11_24000/validation/state.json  


Hugging Face Authenticate

In [27]:
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")    # Store the key in os.environ
hf_token               = os.environ.get('HF_TOKEN')
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [28]:
# Verify
!huggingface-cli whoami

claudelepere


Create the skill_classification repo on the Hugging Face Hub

In [29]:
HF_name         = "claudelepere/skill_classification"
repo_id_model   = HF_name
repo_id_dataset = HF_name

In [30]:
repo_model_url = create_repo(
    repo_id   = repo_id_model,
    repo_type = "model",
    private   = True,
    exist_ok  = True
)
print(f"Repo model url: {repo_model_url} created successfully as a private repo.")

Repo model url: https://huggingface.co/claudelepere/skill_classification created successfully as a private repo.


In [31]:
repo_dataset_url = create_repo(
    repo_id   = repo_id_dataset,
    repo_type = "dataset",
    private   = True,
    exist_ok  = True
)
print(f"Repo datasets url: {repo_dataset_url} created successfully as a private repo.")

Repo datasets url: https://huggingface.co/datasets/claudelepere/skill_classification created successfully as a private repo.


In [32]:
repo_id_dataset = f"datasets/{HF_name}"

In [33]:
print(f"repo_id_model: {repo_id_model}")
print(f"repo_id_dataset: {repo_id_dataset}")

repo_id_model: claudelepere/skill_classification
repo_id_dataset: datasets/claudelepere/skill_classification


W&B initialization

In [34]:
os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")        # Store the key in os.environ
wandb_api_key               = os.environ.get('WANDB_API_KEY')
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: Currently logged in as: [33mclaudelepere[0m ([33mclaudelepere-c-cile-cy[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [35]:
try:
  wandb.init(
      project = "skill_classification",
      name    = run_name,
      entity  = "claudelepere-c-cile-cy",
      config  = {
          "learning_rate": learning_rate,
          "epochs"       : 5,
          "batch_size"   : batch_size
      }
  )
except wandb.errors.CommError as err:
  print(f"CommError: {err}")
except Exception as exc:
  print(f"Exception: {exc}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create the dataset: 3 Hugging Face Dataset in a Hugging Face DatasetDict

In [36]:
datasetDict = DatasetDict.load_from_disk(datasetDict_dir_name)

In [37]:
print(f"datasetDict: {type(datasetDict)} {datasetDict.shape}\n{datasetDict}")
print(f"datasetDict.keys(): {datasetDict.keys()}")
print(f"datasetDict['train']: {type(datasetDict['train'])} {datasetDict['train'].shape}")
print(f"datasetDict['validation']: {type(datasetDict['validation'])} {datasetDict['validation'].shape}")
print(f"datasetDict['test']: {type(datasetDict['test'])} {datasetDict['test'].shape}")

datasetDict: <class 'datasets.dataset_dict.DatasetDict'> {'train': (19200, 8), 'validation': (2400, 8), 'test': (2400, 8)}
DatasetDict({
    train: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 19200
    })
    validation: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 2400
    })
})
datasetDict.keys(): dict_keys(['train', 'validation', 'test'])
datasetDict['train']: <class 'datasets.arrow_dataset.Dataset'> (19200, 8)
datasetDict['validation']: <class 'datasets.arrow_dataset.Dataset'> (2400, 8)
datasetDict['test']: <class 'datasets.arrow_dataset.Dataset'> (2400, 8)


In [38]:
example = datasetDict['train'][0]
print(f"datasetDict['train'][0]: {type(example)} {example.keys()}\n{example}")

datasetDict['train'][0]: <class 'dict'> dict_keys(['id', 'text', '390', '135', '136', '137', '138', '139'])
{'id': 196243, 'text': "Vivid Resourcing - Software Engineer React.JS Node.js Vue.js Nest.JS Vivid Resourcing I'm partnered with a startup based in Brussels who are looking for an experienced Software Engineer to strengthen it's development team. The client created an AI-enabled Intelligence Platform for business enterprises. This platform analyses significant competitors, industry trends, market dynamics, new technologies, and business ecosystem evolutions to ensure that companies remain constantly up date. You'll be responsible for managing the interchange of data between the server and the users. Your key tasks will be developing the server-side logic, defining and maintaining the core database, and guaranteeing front-end performance and responsiveness. You'll work closely with other teams such as Product Managers and Data Engineers. Your profile At least 3+ Years of experienc

Create the label list and the id2label and label2id mappings.

In [39]:
"""
dataset 7_1000_125_125  ,  48 labels
dataset 7_128_18_54     ,  42 labels
dataset 8910_1087_68_204, 206 labels
dataset 11_1000         ,   6 labels
"""

'\ndataset 7_1000_125_125  ,  48 labels\ndataset 7_128_18_54     ,  42 labels\ndataset 8910_1087_68_204, 206 labels\ndataset 11_1000         ,   6 labels\n'

In [40]:
labels = [label for label in datasetDict['train'].features.keys() if label not in ['id', 'text']]
labels.sort()
print(f"labels: {type(labels)} {len(labels)}\n{labels}")

labels: <class 'list'> 6
['135', '136', '137', '138', '139', '390']


In [41]:
id2label = {idx:label for idx, label in enumerate(labels)}
print(f"id2label: {type(id2label)} {len(id2label)}\n{id2label}")

id2label: <class 'dict'> 6
{0: '135', 1: '136', 2: '137', 3: '138', 4: '139', 5: '390'}


In [42]:
label2id = {label:idx for idx, label in enumerate(labels)}
print(f"label2id: {type(label2id)} {len(label2id)}\n{label2id}")

label2id: <class 'dict'> 6
{'135': 0, '136': 1, '137': 2, '138': 3, '139': 4, '390': 5}


Load tokenizer and model

In [43]:
model_name = "allenai/longformer-base-4096"

In [44]:
tokenizer = LongformerTokenizerFast.from_pretrained(model_name)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [45]:
model = LongformerForSequenceClassification.from_pretrained(
    model_name,
    num_labels        = len(labels),
#    num_hidden_layers = hidden_layers,
    problem_type      = 'multi_label_classification')

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
# Configure attention window size
model.config.attention_window = 512

In [47]:
optimizer = AdamW(model.parameters(), lr=learning_rate)



Tokenize ('input_ids' and 'attention_mask'), add 'global_attention_mask' (for Longformer), add 'labels'

In [48]:
def preprocess_data(examples, indices):
  # Step 1: Extract text and tokenize
  text = examples['text']             # Batch of texts
  encoding = tokenizer(
      text,                           # Tokenize text
      truncation     = True,
      padding        = 'max_length',
      max_length     = max_length,
      return_tensors = 'pt'           # Return PyTorch tensors
  )

  # Step 2: Create and add the global attention mask
  global_attention_mask             = torch.zeros_like(encoding['input_ids'])  # Initialize global attention mask with zeros (same shape as input_ids)
  global_attention_mask[:, 0]       = 1                                        # Set global attention on the first token ([CLS], token ID=0) in each sequence
  encoding['global_attention_mask'] = global_attention_mask                    # Add the global_attention_mask to the batch

  # Step 3: Create and populate the label matrix
  labels_matrix = torch.zeros((len(text), len(labels)), dtype=torch.float32)   # Create an empty label matrix
  #print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")
  #---------Populate label matrix
  for idx, label in enumerate(labels):
    #print(f"idx:{idx} label:{label}")
    if label in examples:
      labels_matrix[:, idx] = torch.tensor(
          [1.0 if val else 0.0 for val in examples[label]],
          dtype=torch.float32
          )
  print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")

  encoding['labels'] = labels_matrix                                           # Add labels to the encoding
  print(f"encoding['labels']: {type(encoding['labels'])} {encoding['labels'].shape}")

  # encoding: <class 'transformers.tokenization_utils_base.BatchEncoding'> dict_keys(['input_ids', 'attention_mask', 'global_attention_mask', 'labels'])
  #   'input_ids': tensor([[
  #   'attention_mask': tensor([[
  #   'global_attention_mask': tensor([[
  #   'labels': tensor([[
  #print(f"1 preprocess_data call: encoding: {type(encoding)} {encoding.keys()}")

  return encoding

Create the 3 encoded datasets, train, validation and test

In [49]:
encoded_dataset = datasetDict.map(
    preprocess_data,
    batched        = True,
    remove_columns = datasetDict['train'].column_names,
    with_indices   = True
)
#train_dataset      = encoded_dataset['train']
#validation_dataset = encoded_dataset['validation']
#test_dataset       = encoded_dataset['test']

Map:   0%|          | 0/19200 [00:00<?, ? examples/s]

labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([400, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([400, 6])


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([1000, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([1000, 6])
labels_matrix: <class 'torch.Tensor'> torch.Size([400, 6])
encoding['labels']: <class 'torch.Tensor'> torch.Size([400, 6])


train_labels_list_of_lists = train_dataset['labels'].tolist()
print("=============================================")
print(f"encoded_dataset: {type(encoded_dataset)} {encoded_dataset.shape}\n{encoded_dataset}")
print(f"train_dataset: {type(train_dataset)} {train_dataset.shape} {train_dataset.features}\n{train_dataset}")
print(f"validation_dataset: {type(validation_dataset)} {validation_dataset.shape} {validation_dataset.features}")
print(f"test_dataset: {type(test_dataset)} {test_dataset.shape} {test_dataset.features}")
print("---")
print(f"train_dataset['labels']: {type(train_dataset['labels'])} len={len(train_dataset['labels'])}\n{train_dataset['labels']}")
print("---")
print(f"train_dataset[0]['input_ids']: {type(train_dataset[0]['input_ids'])} {len(train_dataset[0]['input_ids'])}\n{train_dataset['input_ids'][0]}")
print(f"train_dataset[0]['attention_mask']: {type(train_dataset[0]['attention_mask'])} {len(train_dataset[0]['attention_mask'])}\n{train_dataset['attention_mask'][0]}")
print(f"train_dataset[0]['global_attention_mask']: {type(train_dataset[0]['global_attention_mask'])} {len(train_dataset[0]['global_attention_mask'])}\n{train_dataset['global_attention_mask'][0]}")
print(f"train_dataset[0]['labels']: {type(train_dataset[0]['labels'])} {len(train_dataset[0]['labels'])} {train_dataset[0]['labels']}")
print(f"train_dataset['labels'][0]: {type(train_dataset['labels'][0])} {len(train_dataset['labels'][0])}\n{train_dataset['labels'][0]}")

In [50]:
encoded_dataset.set_format('torch')
train_dataset      = encoded_dataset['train']
validation_dataset = encoded_dataset['validation']
test_dataset       = encoded_dataset['test']
print(f"train_dataset_tensor: {type(train_dataset)} {train_dataset.shape} {train_dataset.features}\n{train_dataset}")
print(f"train_dataset_tensor['input_ids']:             {type(train_dataset['input_ids'])}             len={len(train_dataset['input_ids'])}             shape={train_dataset['input_ids'].shape}            ") #\n{train_dataset['input_ids']}")
print(f"train_dataset_tensor['attention_mask']:        {type(train_dataset['attention_mask'])}        len={len(train_dataset['attention_mask'])}        shape={train_dataset['attention_mask'].shape}       ") #\n{train_dataset['attention_mask']}")
print(f"train_dataset_tensor['global_attention_mask']: {type(train_dataset['global_attention_mask'])} len={len(train_dataset['global_attention_mask'])} shape={train_dataset['global_attention_mask'].shape}") #\n{train_dataset['global_attention_mask']}")
print(f"train_dataset_tensor['labels']:                {type(train_dataset['labels'])}                len={len(train_dataset['labels'])}                shape={train_dataset['labels'].shape}               ") #\n{train_dataset['labels']}")

train_dataset_tensor: <class 'datasets.arrow_dataset.Dataset'> (19200, 4) {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'global_attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)}
Dataset({
    features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
    num_rows: 19200
})
train_dataset_tensor['input_ids']:             <class 'torch.Tensor'>             len=19200             shape=torch.Size([19200, 1024])            
train_dataset_tensor['attention_mask']:        <class 'torch.Tensor'>        len=19200        shape=torch.Size([19200, 1024])       
train_dataset_tensor['global_attention_mask']: <class 'torch.Tensor'> len=19200 shape=torch.Size([19200, 1024])
train_dataset_tensor['labels']:                <class 'torch.Tensor'>     

Truncated part

In [51]:
def get_truncated_part(text):
  tokens = tokenizer(
      text,
      truncation                = True,
      padding                   = 'max_length',
      max_length                = max_length,
      return_overflowing_tokens = True,
      return_tensors            = None
      )
  print(f"tokens.keys(): {tokens.keys()}")

  # Get the truncated tokens
  truncated_ids = tokens["input_ids"][0]
  print(f"truncated_ids: {type(truncated_ids)} {truncated_ids}")
  #overflow_ids  = tokens["overflow_to_sample_mapping"][0]
  #print(f"overflow_ids: {type(overflow_ids)} {overflow_ids}")

  # Decode the tokens back to text
  truncated_text = tokenizer.decode(truncated_ids, skip_special_tokens=True)
  #overflow_text  = tokenizer.decode(overflow_ids, skip_special_tokens=True)

  print(f"original_text :\n{text}")
  print(f"truncated_text:\n{truncated_text}")
  #print(f"overflow_text:\n{overflow_text}")

  original_tokens  = tokenizer.tokenize(text)
  truncated_tokens = tokenizer.tokenize(truncated_text)
  #overflow_tokens  = tokenizer.tokenize(overflow_text)

  print(f"original_tokens count : {len(original_tokens)}")
  print(f"truncated_tokens count: {len(truncated_tokens)}")
  #print(f"overflow_tokens count: {len(overflow_tokens)}")

In [52]:
example_text = datasetDict['train'][0]['text']
#get_truncated_part(example_text)

In [53]:
inputs = tokenizer(
    example_text,
    truncation     = True,
    padding        = 'max_length',
    max_length     = max_length,
    return_tensors = 'pt'
)

inputs: <class 'transformers.tokenization_utils_base.BatchEncoding'> dict_keys(['input_ids', 'attention_mask'])
  {'input_ids': tensor([[
  'attention_mask': tensor([[
print(f"inputs: {type(inputs)} {inputs.keys()}") #\n{inputs}")
print(f"inputs_ids: {type(inputs.input_ids)} {inputs.input_ids.shape}\n{inputs.input_ids}")
print(f"attention_mask: {type(inputs.attention_mask)} {inputs.attention_mask.shape}\n{inputs.attention_mask}")
print(f"labels: {inputs.labels.shape}")

4. Forward pass for multi-label classification

In [54]:
outputs = model(
    input_ids      = inputs.input_ids,
    attention_mask = inputs.attention_mask
    )

Initializing global attention on CLS token...


In [55]:
print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

outputs: <class 'transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput'> odict_keys(['logits'])
LongformerSequenceClassifierOutput(loss=None, logits=tensor([[ 0.1413,  0.0402, -0.0247,  0.1433, -0.0155, -0.0199]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, global_attentions=None)


In [56]:
# Logits (= raw model outputs)
logits = outputs.logits
print(f"logits: {type(logits)} {logits.shape}\n{logits}")

logits: <class 'torch.Tensor'> torch.Size([1, 6])
tensor([[ 0.1413,  0.0402, -0.0247,  0.1433, -0.0155, -0.0199]],
       grad_fn=<AddmmBackward0>)


In [57]:
# Convert logits to probabilities
sigmoid = torch.nn.Sigmoid()
probs   = sigmoid(logits)
print(f"probs: {type(probs)} {probs.shape}\n{probs}")

probs: <class 'torch.Tensor'> torch.Size([1, 6])
tensor([[0.5353, 0.5101, 0.4938, 0.5358, 0.4961, 0.4950]],
       grad_fn=<SigmoidBackward0>)


In [58]:
example = encoded_dataset['train'][0]

In [59]:
print(f"example: {type(example)} {example.keys()}\n{example}")
print()
#print(f"example['input_ids']: {type(example['input_ids'])} {len(example['input_ids'])}\n{example['input_ids']}")
#print(f"example['attention_mask']: {type(example['attention_mask'])} {len(example['attention_mask'])}\n{example['attention_mask']}")
#print(f"example['labels']:  {type(example['labels'])} {len(example['labels'])}\n{example['labels']}")

example: <class 'dict'> dict_keys(['input_ids', 'attention_mask', 'global_attention_mask', 'labels'])
{'input_ids': tensor([   0,  846, 6837,  ...,    1,    1,    1]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]), 'global_attention_mask': tensor([1, 0, 0,  ..., 0, 0, 0]), 'labels': tensor([0., 0., 1., 1., 1., 0.])}



In [60]:
tokenizer.decode(example['input_ids'])

"<s>Vivid Resourcing - Software Engineer React.JS Node.js Vue.js Nest.JS Vivid Resourcing I'm partnered with a startup based in Brussels who are looking for an experienced Software Engineer to strengthen it's development team. The client created an AI-enabled Intelligence Platform for business enterprises. This platform analyses significant competitors, industry trends, market dynamics, new technologies, and business ecosystem evolutions to ensure that companies remain constantly up date. You'll be responsible for managing the interchange of data between the server and the users. Your key tasks will be developing the server-side logic, defining and maintaining the core database, and guaranteeing front-end performance and responsiveness. You'll work closely with other teams such as Product Managers and Data Engineers. Your profile At least 3+ Years of experience with JavaScript. Extensive experience developing frontend applications using React and related libraries. Experience with Node

In [61]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['137', '138', '139']

Set PyTorch format to ensures correctness and compatibility with PyTorch pipelines

In [62]:
# The 3 Hugging Face Dataset are formatted as PyTorch Dataset
encoded_dataset.set_format('torch')

In [63]:
batch_size  = batch_size
metric_name = "f1"

In [64]:
training_args = TrainingArguments(
    output_dir                  = './training_results',  # where model predictions and checkpoints will be written during training
    overwrite_output_dir        = True,
    logging_dir                 = './logs',
    logging_steps               = 50,
    save_steps                  = 500,
    save_total_limit            = 2,
    eval_strategy               = 'epoch',
    save_strategy               = 'epoch',
    learning_rate               = learning_rate,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    num_train_epochs            = epochs,
    weight_decay                = 0.01,
    load_best_model_at_end      = True,
    metric_for_best_model       = metric_name,
    run_name                    = run_name,
    fp16                        = fp16,
    #dataloader_num_workers      = dataloader_num_workers,
    report_to                  = 'wandb'
    )

Metrics
  source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/

In [65]:
def multi_label_metrics(predictions, labels):
    average = 'micro'    # 'micro' or 'weighted'

    # first, apply sigmoid on predictions whose shape is (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs   = sigmoid(torch.Tensor(predictions))

    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1

    # finally, compute metrics
    y_true               = labels
    f1                   = f1_score               (y_true=y_true, y_pred=y_pred, average=average)    #, zero_division=1)
    precision            = precision_score        (y_true=y_true, y_pred=y_pred, average=average)    #, zero_division=1)
    recall               = recall_score           (y_true=y_true, y_pred=y_pred, average=average)    #, zero_division=1)
    roc_auc              = roc_auc_score          (y_true=y_true, y_score=probs, average=average)
    precision_recall_auc = average_precision_score(y_true=y_true, y_score=probs, average=average)
    accuracy             = accuracy_score         (y_true=y_true, y_pred=y_pred)

    # return as dictionary
    metrics = {
        'f1'                  : f1,
        'precision'           : precision,
        'recall'              : recall,
        'roc_auc'             : roc_auc,
        'precision_recall_auc': precision_recall_auc,
        'accuracy'            : accuracy
        }

    return metrics

In [66]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(
        predictions = preds,
        labels      = p.label_ids
        )
    return result

In [67]:
"""Let's verify a batch as well as a forward pass:"""

"Let's verify a batch as well as a forward pass:"

In [68]:
print(f"input_ids:              {type(encoded_dataset['train']['input_ids'][0])}\t{encoded_dataset['train']['input_ids'][0].shape}")
print(f"attention_mask:         {type(encoded_dataset['train']['attention_mask'][0])}\t{encoded_dataset['train']['attention_mask'][0].shape}")
print(f"global_attention_mask:  {type(encoded_dataset['train']['global_attention_mask'][0])}\t{encoded_dataset['train']['global_attention_mask'][0].shape}")
print(f"labels:                 {type(encoded_dataset['train'][0]['labels'])}\t{encoded_dataset['train'][0]['labels'].shape}")

input_ids:              <class 'torch.Tensor'>	torch.Size([1024])
attention_mask:         <class 'torch.Tensor'>	torch.Size([1024])
global_attention_mask:  <class 'torch.Tensor'>	torch.Size([1024])
labels:                 <class 'torch.Tensor'>	torch.Size([6])


Execute a forward pass for debugging or verification purposes (cf. BERT_3_1 in Notion BERT database)

In [69]:
outputs = model(
    input_ids      = encoded_dataset['train']['input_ids'][0].unsqueeze(0),
    attention_mask = encoded_dataset['train']['attention_mask'][0].unsqueeze(0),
    labels         = encoded_dataset['train'][0]['labels'].unsqueeze(0)
    )

In [70]:
print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

outputs: <class 'transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput'> odict_keys(['loss', 'logits'])
LongformerSequenceClassifierOutput(loss=tensor(0.6989, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.1413,  0.0402, -0.0247,  0.1433, -0.0155, -0.0199]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, global_attentions=None)


In [71]:
"""# Define the weighted loss function

class_weights = torch.tensor([7.68, 2.15, 0.61, 0.47, 0.68, 6.26], dtype=torch.float32).to(device)
loss_fn       = BCEWithLogitsLoss(pos_weight=class_weights)  # For multi-label classification (binary classification per label)

## Class supports, class weigths, weighted loss function

Reminder:
*   df_jobs      : <class 'pandas.core.frame.DataFrame'>
*   df_jobs['id']: <class 'pandas.core.series.Series'>

dataset = Dataset.from_pandas(df_jobs)
*   dataset      : <class 'datasets.arrow_dataset.Dataset'>
*   dataset['id']: <class 'list'>

*   dataset_dict_jobs : <class 'datasets.dataset_dict.DatasetDict'>
*   train_dataset     : <class 'datasets.arrow_dataset.Dataset'>
*   validation_dataset: <class 'datasets.arrow_dataset.Dataset'>
*   test_dataset      : <class 'datasets.arrow_dataset.Dataset'>


We calculate the class supports for the train, validation and test datasets; the class weights and the weighted loss function are used for training only; the class supports of validation_dataset and test_dataset are calculated for information only.

def get_train_class_weights(datasetDict, labels):
  print(f"datasetDict: {type(datasetDict)} shape={datasetDict.shape}\n{datasetDict}")
  print(f"labels: {type(labels)} len={len(labels)}\n{labels}")

  dataset_train      = datasetDict['train']
  dataset_validation = datasetDict['validation']
  dataset_test       = datasetDict['test']

  def calculate_class_supports(dataset, labels):
    class_supports = dataset.map(
        lambda example: {col: example[col] for col in labels},
        batched=True
    ).to_pandas()[labels].sum(axis=0)
    return class_supports

  class_supports = {}

  for split_name, split_dataset in datasetDict.items():
    class_supports[split_name] = calculate_class_supports(split_dataset, labels)

  for split_name, split_class_supports in class_supports.items():
    print(f"{split_name}: {type(split_class_supports)} len={len(split_class_supports)}\n{split_class_supports}")

  train_class_supports_list = class_supports['train'].tolist()
  print(f"train_class_supports_list: {type(train_class_supports_list)} len={len(train_class_supports_list)} {train_class_supports_list}")

  train_class_supports_tensor = torch.tensor(train_class_supports_list, dtype=torch.float32)
  print(f"train_class_supports_tensor: {type(train_class_supports_tensor)} len={len(train_class_supports_tensor)} {train_class_supports_tensor}")

  train_total_samples = dataset_train.num_rows
  print(f"train_total_samples: {train_total_samples}")

  number_of_classes = len(labels)
  print(f"number_of_classes: {number_of_classes}")

  train_class_weights = train_total_samples / (number_of_classes * train_class_supports_tensor)
  print(f"train_class_weights: {type(train_class_weights)} len={len(train_class_weights)} {train_class_weights}")

  train_class_weights_sum = train_class_weights.sum()
  print(f"train_class_weights_sum: {train_class_weights_sum}")

  normalized_train_class_weights = (train_class_weights / train_class_weights_sum) * number_of_classes
  print(f"normalized_train_class_weights: {type(normalized_train_class_weights)} len={len(normalized_train_class_weights)} {normalized_train_class_weights}")

  # Positives samples per label
  supports = train_class_supports_tensor
  print(f"supports: {type(supports)} {len(supports)} {supports}")

  # Negatives samples per label
  negatives = train_total_samples - supports
  print(f"negatives: {type(negatives)} {len(negatives)} {negatives}")

  # pos_weights = negative to positive ratios
  pos_weights = negatives/supports
  print(f"pos_weights: {type(pos_weights)} {len(pos_weights)} {pos_weights}")

  # Normalize using min-max scaling
  normalized_pos_weights_minmax = (pos_weights - pos_weights.min()) / (pos_weights.max() - pos_weights.min())
  print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

  # Normalize using z-score standardization
  normalized_pos_weights_zscore = (pos_weights - pos_weights.mean()) / pos_weights.std()
  print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

  # Normalize using min-max scaling
  normalized_pos_weights_minmax = (pos_weights - pos_weights.min()) / (pos_weights.max() - pos_weights.min())
  print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

  # Normalize using z-score standardization
  normalized_pos_weights_zscore = (pos_weights - pos_weights.mean()) / pos_weights.std()
  print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

  # Normalize using sum-to-one
  normalized_pos_weights_sum1 = pos_weights / pos_weights.sum()
  print(f"normalized_pos_weights_sum1: {type(normalized_pos_weights_sum1)} {len(normalized_pos_weights_sum1)} {normalized_pos_weights_sum1}")

  return normalized_pos_weights_minmax
  #return normalized_pos_weights_zscore
  #return normalized_pos_weights_sum1

pos_weights = get_train_class_weights(datasetDict, labels)

loss_fn = BCEWithLogitsLoss(pos_weight=pos_weights.to(device))  # For multi-label classification (binary classification per label)
print(f"loss_fn: {type(loss_fn)} {loss_fn}")
"""

'# Define the weighted loss function\n\nclass_weights = torch.tensor([7.68, 2.15, 0.61, 0.47, 0.68, 6.26], dtype=torch.float32).to(device)\nloss_fn       = BCEWithLogitsLoss(pos_weight=class_weights)  # For multi-label classification (binary classification per label)\n\n## Class supports, class weigths, weighted loss function\n\nReminder:\n*   df_jobs      : <class \'pandas.core.frame.DataFrame\'>\n*   df_jobs[\'id\']: <class \'pandas.core.series.Series\'>\n\ndataset = Dataset.from_pandas(df_jobs)\n*   dataset      : <class \'datasets.arrow_dataset.Dataset\'>\n*   dataset[\'id\']: <class \'list\'>\n\n*   dataset_dict_jobs : <class \'datasets.dataset_dict.DatasetDict\'>\n*   train_dataset     : <class \'datasets.arrow_dataset.Dataset\'>\n*   validation_dataset: <class \'datasets.arrow_dataset.Dataset\'>\n*   test_dataset      : <class \'datasets.arrow_dataset.Dataset\'>\n\n\nWe calculate the class supports for the train, validation and test datasets; the class weights and the weighted l

In [72]:
def get_class_weights(labels=encoded_dataset['train']['labels']):
  print(f"labels: {type(labels)} len={len(labels)} shape={labels.shape}\n{labels}")

  num_samples, num_labels = labels.shape
  print(f"num_samples: {type(num_samples)} {num_samples}")
  print(f"num_labels:  {type(num_labels)}  {num_labels}")

  class_counts = labels.sum(dim=0)
  print(f"class_counts: {type(class_counts)} len={len(class_counts)}\n{class_counts}")

  pos_weights = (num_samples-class_counts) / class_counts
  print(f"pos_weights: {type(pos_weights)} len={len(pos_weights)}\n{pos_weights}")

  normalized_pos_weights_minmax = (pos_weights - pos_weights.min()) / (pos_weights.max() - pos_weights.min())
  print(f"normalized_pos_weights_minmax: {type(normalized_pos_weights_minmax)} {len(normalized_pos_weights_minmax)} {normalized_pos_weights_minmax}")

  normalized_pos_weights_zscore = (pos_weights - pos_weights.mean()) / pos_weights.std()
  print(f"normalized_pos_weights_zscore: {type(normalized_pos_weights_zscore)} {len(normalized_pos_weights_zscore)} {normalized_pos_weights_zscore}")

  normalized_pos_weights_sum1 = pos_weights / pos_weights.sum()
  print(f"normalized_pos_weights_sum1: {type(normalized_pos_weights_sum1)} {len(normalized_pos_weights_sum1)} {normalized_pos_weights_sum1}")

  #return pos_weights
  #return normalized_pos_weights_minmax
  #return normalized_pos_weights_zscore
  return normalized_pos_weights_sum1

In [73]:
pos_weights = get_class_weights()

labels: <class 'torch.Tensor'> len=19200 shape=torch.Size([19200, 6])
tensor([[0., 0., 1., 1., 1., 0.],
        [0., 0., 0., 1., 1., 0.],
        [0., 1., 1., 1., 0., 0.],
        ...,
        [0., 0., 1., 1., 1., 0.],
        [0., 0., 1., 1., 1., 0.],
        [0., 0., 0., 1., 1., 0.]])
num_samples: <class 'int'> 19200
num_labels:  <class 'int'>  6
class_counts: <class 'torch.Tensor'> len=6
tensor([ 1153.,  4075., 13850., 17561., 13651.,  1560.])
pos_weights: <class 'torch.Tensor'> len=6
tensor([15.6522,  3.7117,  0.3863,  0.0933,  0.4065, 11.3077])
normalized_pos_weights_minmax: <class 'torch.Tensor'> 6 tensor([1.0000, 0.2326, 0.0188, 0.0000, 0.0201, 0.7208])
normalized_pos_weights_zscore: <class 'torch.Tensor'> 6 tensor([ 1.5633, -0.2328, -0.7331, -0.7771, -0.7300,  0.9098])
normalized_pos_weights_sum1: <class 'torch.Tensor'> 6 tensor([0.4960, 0.1176, 0.0122, 0.0030, 0.0129, 0.3583])


In [74]:
loss_fn = BCEWithLogitsLoss(pos_weight=pos_weights.to(device))

raise Exception("Stop here")

In [75]:
class FocalLoss(Module):
  """
  Focal Loss implementation
  """
  def __init__(self, alpha=1.0, gamma=2.0, logits=False, reduce=True):
    super(FocalLoss, self).__init__()
    self.alpha   = alpha
    self.gamma   = gamma
    self.logits  = logits  # This flag is to indicate whether input is logits or probability
    self.reduce  = reduce

  # inputs  = model's predictions: PyTorch tensor, shape=(batch_size, num_classes)
  # targets = ground truth labels: PyTorch tensor, shape=same as inputs shape
  def forward(self, inputs, targets):
    # Here, we check if input is probability or logits
    if self.logits:
      # Input is logits
      BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
    else:
      # Input is probability
      BCE_loss = F.binary_cross_entropy(inputs, targets, reduction='none')
    pt = torch.exp(-BCE_loss)
    F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

    if self.reduce:
      return torch.mean(F_loss)
    else:
      return F_loss

  def __repr__(self):
    return f"FocalLoss(alpha={self.alpha}, gamma={self.gamma}, logits={self.logits}, reduce={self.reduce})"

  def __str__(self):
    return f"FocalLoss(alpha={self.alpha}, gamma={self.gamma}, logits={self.logits}, reduce={self.reduce})"

  def __call__(self, inputs, targets):
    return self.forward(inputs, targets)

In [76]:
focal_loss_fn = FocalLoss(alpha=0.5, gamma=4.0, logits=True, reduce=True)
print(f"focal_loss_fn: {type(focal_loss_fn)} {focal_loss_fn}")

focal_loss_fn: <class '__main__.FocalLoss'> FocalLoss(alpha=0.5, gamma=4.0, logits=True, reduce=True)


CustomTrainer

In [77]:
class CustomTrainer(Trainer):

  def __init__(self, *args, loss_fn=None, **kwargs):
    super().__init__(*args, **kwargs)
    self.loss_fn = loss_fn

  """
  # No print in compute_loss because out of memory because prints are batch per batch
  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):

    #print(f"inputs passed to compute_loss: {inputs.keys()}")
    #input_ids             = inputs['input_ids']                        # shape: batch_size, sequence_length
    #attention_mask        = inputs['attention_mask']                   # shape: batch_size, sequence_length
    #global_attention_mask = inputs.get('global_attention_mask', None)  # shape: batch_size, sequence_length; optional as LongFormer specific
    labels                = inputs.pop('labels', None)                 # shape: batch_size, num_labels; needed for loss computation, not required by the model

    #outputs = model(**inputs, global_attention_mask=global_attention_mask)  # Forward pass
    # Forward pass
    #outputs = model(
    #    input_ids             = input_ids,
    #    attention_mask        = attention_mask,
    #    global_attention_mask = global_attention_mask,
    #    labels                = labels
    #)
    outputs = model(**inputs, labels=labels)
    #print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")
    logits = outputs.logits  # shape: (batch_size, num_labels)

    # If labels are provided, compute loss
    if labels is not None:
      # Use the custom loss function if provided
      if self.loss_fn is not None:
        loss = self.loss_fn(logits, labels)  # Compute weighted loss
      else:
        # Default loss: BCEWithLogitsLoss
        loss_fn = BCEWithLogitsLoss()
        loss    = loss_fn(logits, labels)    # Compute loss
      return (loss, outputs) if return_outputs else loss

    # If no labels, return outputs only, for evaluation or inference
    return outputs
    """
  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    labels  = inputs.pop('labels', None)
    outputs = model(**inputs, labels=labels)
    logits  = outputs.logits

    if labels is not None:
      if self.loss_fn is not None:
        loss = self.loss_fn(logits, labels)
      else:
        loss_fn = BCEWithLogitsLoss()
        loss    = loss_fn(logits, labels)
      return (loss, outputs) if return_outputs else loss

    return outputs

Create a Hugging Face's transformers trainer (which abstracts the training loop)

In [78]:
trainer = CustomTrainer(
#trainer = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = encoded_dataset["train"],
    eval_dataset    = encoded_dataset["validation"],
    compute_metrics = compute_metrics,                # Optional: custom metrics function
    loss_fn         = focal_loss_fn,
)

Train

In [79]:
trainer_train = trainer.train()
print(f"trainer_train: {type(trainer_train)} len={len(trainer_train)}\n{trainer_train}")

Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Roc Auc,Precision Recall Auc,Accuracy
1,0.0391,0.01015,0.676637,0.511461,0.999391,0.950153,0.93938,0.035833
2,0.0377,0.009433,0.704915,0.544707,0.998629,0.956405,0.946714,0.01875
3,0.0332,0.009714,0.727132,0.571753,0.998477,0.956992,0.946292,0.03
4,0.0315,0.009816,0.74412,0.593314,0.997715,0.958294,0.948606,0.043333
5,0.0283,0.010058,0.750416,0.601636,0.996953,0.957823,0.947802,0.05875


trainer_train: <class 'transformers.trainer_utils.TrainOutput'> len=3
TrainOutput(global_step=3000, training_loss=0.0358163849512736, metrics={'train_runtime': 4817.2294, 'train_samples_per_second': 19.928, 'train_steps_per_second': 0.623, 'total_flos': 6.3059569016832e+16, 'train_loss': 0.0358163849512736, 'epoch': 5.0})


In [80]:
file_path = "trainer_train.json"
with open(file_path, "w") as f:
  json.dump(trainer_train, f)
print(f"Train output successfully saved to {file_path}.")

Train output successfully saved to trainer_train.json.


In [81]:
print("Training successfully completed.")

Training successfully completed.


In [82]:
"""## Upload model, tokenizer, train results, evaluate results"""

'## Upload model, tokenizer, train results, evaluate results'

Upload the tokenizer and the model to the Hugging Face Hub

In [83]:
# Push the tokenizer and the model
tokenizer.push_to_hub(repo_id_model)
model.push_to_hub(repo_id_model)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


model.safetensors:   0%|          | 0.00/595M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/claudelepere/skill_classification/commit/9b98f25601062601ca0a5208d4ef3c50a749743b', commit_message='Upload LongformerForSequenceClassification', commit_description='', oid='9b98f25601062601ca0a5208d4ef3c50a749743b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/claudelepere/skill_classification', endpoint='https://huggingface.co', repo_type='model', repo_id='claudelepere/skill_classification'), pr_revision=None, pr_num=None)

Verify the Upload

In [84]:
# Load the tokenizer and the model from the Hugging Face Hub
tokenizer = LongformerTokenizerFast.from_pretrained(repo_id_model)
model     = LongformerForSequenceClassification.from_pretrained(repo_id_model)

# Test the tokenizer and the model
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt", truncation=True, padding=True)
outputs = model(**inputs)
print(outputs)

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/595M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Input ids are automatically padded to be a multiple of `config.attention_window`: 512


LongformerSequenceClassifierOutput(loss=None, logits=tensor([[-0.2066,  0.2900,  0.3301, -0.0197, -0.3633,  0.1552]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, global_attentions=None)


Upload Train results to HF repo_id_dataset

In [85]:
# Train
upload_file(
    path_or_fileobj = 'trainer_train.json',
    path_in_repo    = 'trainer_train.json',
    repo_id         = HF_name,
    repo_type       = 'dataset'
)

CommitInfo(commit_url='https://huggingface.co/datasets/claudelepere/skill_classification/commit/26d05ad682de5a92e1f861a42b32217e8a4036e0', commit_message='Upload trainer_train.json with huggingface_hub', commit_description='', oid='26d05ad682de5a92e1f861a42b32217e8a4036e0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/claudelepere/skill_classification', endpoint='https://huggingface.co', repo_type='dataset', repo_id='claudelepere/skill_classification'), pr_revision=None, pr_num=None)

In [86]:
"""To Get Results of Evaluation and Test"""

'To Get Results of Evaluation and Test'

In [87]:
def get_results(model, dataset, batch_size, threshold, phase):
  # Clear GPU cache
  torch.cuda.empty_cache()

  # Set the model to evaluation mode to disable dropout and other training-specific behaviors
  model.eval()

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  dataLoader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

  all_preds       = []
  all_probs       = []
  all_true_labels = []

  for batch in tqdm(dataLoader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
      outputs = model(**batch)
    logits = outputs.logits

    # Convert logits to probabilities and probabilities to predictions
    sigmoid = torch.nn.Sigmoid()
    probs   = sigmoid(logits).cpu().numpy()    # Convert to Numpy
    preds   = (probs > threshold).astype(int)  # Convert to binary Numpy array; convert the boolean result to int (0 or 1)

    # Accumulate probabilities, predictions and labels
    all_probs.append(probs)
    all_preds.append(preds)
    all_true_labels.append(batch['labels'].cpu().numpy())

  # Concatenate results from all batches
  all_probs       = np.concatenate(all_probs, axis=0)        # shape: [num_samples, num_labels]
  all_preds       = np.concatenate(all_preds, axis=0)        # shape: [num_samples, num_labels]
  all_true_labels = np.concatenate(all_true_labels, axis=0)  # shape: [num_samples, num_labels]

  print(f"all_probs:       {type(all_probs)} {all_probs.shape}")
  print(f"all_preds:       {type(all_preds)} {all_preds.shape}")
  print(f"all_true_labels: {type(all_true_labels)} {all_true_labels.shape}")
  results_df = pd.DataFrame({
      'predictions'  : [list(pred)  for pred  in all_preds],
      'probabilities': [list(prob)  for prob  in all_probs],
      'true_labels'  : [list(label) for label in all_true_labels]
  })
  results_file_path = f"{phase}_results.csv"
  results_df.to_csv(results_file_path, index=False)

  # Classification report for precision, recall, F1 score
  report = classification_report(
      y_true        = all_true_labels,
      y_pred        = all_preds,
      target_names  = labels,
      zero_division = 0,
      output_dict   = True
  )
  print(f"Classification Report:\n{report}")

  # ROC AUC for multi-label classification
  roc_auc = roc_auc_score(
      y_true  = all_true_labels,
      y_score = all_probs,
      average = 'micro'
  )
  print(f"ROC AUC: {roc_auc}")

  metrics = {
      'classification_report': report,
      'roc_auc'              : roc_auc
  }
  metrics_file_path = f"{phase}_metrics.json"
  with open(metrics_file_path, "w") as f:
    json.dump(metrics, f, indent=4)

  print(f"{phase} Results Saved to {results_file_path}")
  print(f"{phase} Metrics Saved to {metrics_file_path}")

In [88]:
def get_results_with_threshold_tuning(model, dataset, batch_size, average, tune_thresholds=False, phase='eval'):
  """
  Evaluates a model on a given dataset and optionally tunes thresholds for multi-label classification.

  Args:
    model          : The trained model to evaluate.
    dataset        : The dataset to evaluate (validation or test).
    batch_size     : Batch size for DataLoader.
    tune_thresholds: Whether to tune thresholds per label (default = False).
    average        : The averaging method for metrics ('micro', 'macro', 'weighted', etc.).
    phase          : The phase (eval or test) for saving results (default = 'eval').

  Returns:
    A dictionary with metrics and optionally tuned thresholds.
  """

  # Clear GPU cache
  torch.cuda.empty_cache()

  # Set the model to eval mode to disable dropout and other training-specific behaviors
  model.eval

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  # Create DataLoader
  dataLoader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

  all_preds       = []
  all_probs       = []
  all_true_labels = []

  for batch in tqdm(dataLoader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
      outputs = model(**batch)
    logits = outputs.logits

  # Convert logits to probabilities
  sigmoid = torch.nn.Sigmoid()
  probs   = sigmoid(logits).cpu().numpy()    # Convert to Numpy
  all_probs.append(probs)
  all_true_labels.append(batch['labels'].cpu().numpy())

  # Concatenate results from all batches
  all_probs       = np.concatenate(all_probs, axis=0)        # shape: [num_samples, num_labels]
  all_true_labels = np.concatenate(all_true_labels, axis=0)  # shape: [num_samples, num_labels]

  # Initialize thresholds (default = 0.5)
  thresholds = np.full(all_probs.shape[1], 0.5)  # shape: [num_samples]

  # Thresholds tuning (if enabled)
  if tune_thresholds:
    # Tune thresholds for each label
    print(f"Tuning thresholds for {phase} phase")
    for label_idx in range(all_probs.shape[1]):                           # Iterate over labels
      precision, recall, thresholds_label = precision_recall_curve(all_true_labels[:, label_idx], all_probs[:, label_idx])
      f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)  # Avoid division by zero
      best_threshold_idx = np.argmax(f1_scores)
      thresholds[label_idx] = thresholds_label[best_threshold_idx]        # Set the best threshold for this label

  # Apply thresholds to probabilities to generate predictions
  all_preds = (all_probs > thresholds).astype(int)                        # Convert to binary Numpy array; convert the boolean result to int (0 or 1

  # Save predictions, probabilities, and true labels to a DataFrame
  results_df = pd.DataFrame({
      'predictions'  : [list(pred)  for pred  in all_preds],
      'probabilities': [list(prob)  for prob  in all_probs],
      'true_labels'  : [list(label) for label in all_true_labels]
  })
  results_file_path = f"{phase}_results.csv"
  results_df.to_csv(results_file_path, index=False)

  # Compute metrics
  print(f"Computing metrics for {phase} phase")
  classification_report_dict = classification_report(
      y_true        = all_true_labels,
      y_pred        = all_preds,
      target_names  = labels,
      zero_division = 0,
      output_dict   = True
  )

  # Compute roc_auc
  roc_auc = roc_auc_score(
      y_true  = all_true_labels,
      y_score = all_probs,
      average = average
  )

  # Computer precision_recall_auc
  precision_recall_auc = average_precision_score(all_true_labels, all_probs, average=average)

  #precision, recall, _ = precision_recall_curve(all_true_labels, all_probs)
  #precision_recall_auc = auc(recall, precision)

  metrics = {
      'classification_report': classification_report_dict,
      'roc_auc'              : roc_auc,
      'precision_recall_auc' : precision_recall_auc,
      'thresholds'           : thresholds.tolist() if tune_thresholds else "Default (0.5)",  # Convert numpy array to list
  }

  metrics_file_path = f"{phase}_metrics.json"
  with open(metrics_file_path, "w") as f:
    json.dump(metrics, f, indent=4)

  print(f"{phase} Results Saved to {results_file_path}")
  print(f"{phase} Metrics Saved to {metrics_file_path}")

  return metrics


In [89]:
"""## Evaluate

After training, we evaluate our model on the validation set.
"""

'## Evaluate\n\nAfter training, we evaluate our model on the validation set.\n'

First evaluate results

In [90]:
phase_evaluate = 'evaluate_model_eval'

In [91]:
get_results(
    model      = model,
    dataset    = validation_dataset,
    batch_size = batch_size,
    threshold  = threshold,
    phase      = phase_evaluate
)

  0%|          | 0/300 [00:00<?, ?it/s]

all_probs:       <class 'numpy.ndarray'> (2400, 6)
all_preds:       <class 'numpy.ndarray'> (2400, 6)
all_true_labels: <class 'numpy.ndarray'> (2400, 6)
Classification Report:
{'135': {'precision': 0.21198830409356725, 'recall': 0.9415584415584416, 'f1-score': 0.3460620525059666, 'support': 154.0}, '136': {'precision': 0.252757793764988, 'recall': 0.9981060606060606, 'f1-score': 0.40336777650210487, 'support': 528.0}, '137': {'precision': 0.7216666666666667, 'recall': 1.0, 'f1-score': 0.8383349467570184, 'support': 1732.0}, '138': {'precision': 0.9125, 'recall': 1.0, 'f1-score': 0.954248366013072, 'support': 2190.0}, '139': {'precision': 0.7323825503355704, 'recall': 1.0, 'f1-score': 0.8455205811138015, 'support': 1746.0}, '390': {'precision': 0.2210184182015168, 'recall': 0.9532710280373832, 'f1-score': 0.35883905013192613, 'support': 214.0}, 'micro avg': {'precision': 0.6016917984553145, 'recall': 0.9969530773918343, 'f1-score': 0.7504587155963303, 'support': 6564.0}, 'macro avg': {'

In [92]:
print("First evaluation successfully completed.")

First evaluation successfully completed.


Second evaluate results

In [93]:
trainer_evaluate = trainer.evaluate()
print(f"trainer_evaluate: {type(trainer_evaluate)} len={len(trainer_evaluate)}\n{trainer_evaluate}")

trainer_evaluate: <class 'dict'> len=11
{'eval_loss': 0.01005785632878542, 'eval_f1': 0.7504156871739006, 'eval_precision': 0.6016364806472373, 'eval_recall': 0.9969530773918343, 'eval_roc_auc': 0.9578234131816808, 'eval_precision_recall_auc': 0.9478020756679744, 'eval_accuracy': 0.05875, 'eval_runtime': 35.7648, 'eval_samples_per_second': 67.105, 'eval_steps_per_second': 8.388, 'epoch': 5.0}


In [94]:
file_path = "trainer_evaluate.json"
with open(file_path, "w") as f:
  json.dump(trainer_evaluate, f)
print(f"Evaluate output successfully saved to {file_path}.")

Evaluate output successfully saved to trainer_evaluate.json.


In [95]:
print("Second evaluation successfully completed.")

Second evaluation successfully completed.


Upload Evaluate Results to HF repo_id_dataset

In [96]:
upload_file(
    path_or_fileobj = f"{phase_evaluate}_results.csv",
    path_in_repo    = f"{phase_evaluate}_results.csv",
    repo_id         = HF_name,
    repo_type       = 'dataset'
)
upload_file(
    path_or_fileobj = f"{phase_evaluate}_metrics.json",
    path_in_repo    = f"{phase_evaluate}_metrics.json",
    repo_id         = HF_name,
    repo_type       = 'dataset'
)
upload_file(
    path_or_fileobj = 'trainer_evaluate.json',
    path_in_repo    = 'trainer_evaluate.json',
    repo_id         = HF_name,
    repo_type       = 'dataset'
)

CommitInfo(commit_url='https://huggingface.co/datasets/claudelepere/skill_classification/commit/a19738aa3293a7c301c80605eb9a591330cbe27c', commit_message='Upload trainer_evaluate.json with huggingface_hub', commit_description='', oid='a19738aa3293a7c301c80605eb9a591330cbe27c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/claudelepere/skill_classification', endpoint='https://huggingface.co', repo_type='dataset', repo_id='claudelepere/skill_classification'), pr_revision=None, pr_num=None)

In [97]:
"""## Test"""

'## Test'

First test results

In [98]:
phase_test = 'test_model_eval'

In [99]:
get_results(
    model      = model,
    dataset    = test_dataset,
    batch_size = batch_size,
    threshold  = threshold,
    phase      = phase_test
)

  0%|          | 0/300 [00:00<?, ?it/s]

all_probs:       <class 'numpy.ndarray'> (2400, 6)
all_preds:       <class 'numpy.ndarray'> (2400, 6)
all_true_labels: <class 'numpy.ndarray'> (2400, 6)
Classification Report:
{'135': {'precision': 0.19129082426127528, 'recall': 0.924812030075188, 'f1-score': 0.3170103092783505, 'support': 133.0}, '136': {'precision': 0.2542290961817303, 'recall': 0.994328922495274, 'f1-score': 0.4049268668206313, 'support': 529.0}, '137': {'precision': 0.7129166666666666, 'recall': 1.0, 'f1-score': 0.8324008756993432, 'support': 1711.0}, '138': {'precision': 0.9137140475197999, 'recall': 1.0, 'f1-score': 0.9549117839250708, 'support': 2192.0}, '139': {'precision': 0.7242833052276559, 'recall': 1.0, 'f1-score': 0.8400977995110025, 'support': 1718.0}, '390': {'precision': 0.19319429198682767, 'recall': 0.9361702127659575, 'f1-score': 0.3202911737943585, 'support': 188.0}, 'micro avg': {'precision': 0.597183620529924, 'recall': 0.9961366094884871, 'f1-score': 0.7467130031856357, 'support': 6471.0}, 'macr

In [100]:
print("First test successfully completed.")

First test successfully completed.


Second test results

In [101]:
trainer_predict = trainer.predict(test_dataset)
print(f"trainer_predict: {type(trainer_predict)} len={len(trainer_predict)}\n{trainer_predict}")
print(f"trainer_predict.predictions: {type(trainer_predict.predictions)} shape={trainer_predict.predictions.shape}")  # Model logits
print(f"trainer_predict.label_ids: {type(trainer_predict.label_ids)} shape={trainer_predict.label_ids.shape}")        # Ground truth labels
print(f"trainer_predict.metrics: {type(trainer_predict.metrics)} len={len(trainer_predict.metrics)}")

trainer_predict: <class 'transformers.trainer_utils.PredictionOutput'> len=3
PredictionOutput(predictions=array([[-0.33740234,  0.29736328,  1.2216797 ,  0.8041992 ,  0.02587891,
        -0.20812988],
       [-2.3828125 , -1.5146484 , -0.23669434,  1.6767578 ,  1.0546875 ,
        -2.0703125 ],
       [-2.0683594 , -1.0917969 ,  1.3691406 ,  1.3251953 ,  0.48706055,
        -1.9228516 ],
       ...,
       [-1.5439453 , -0.40966797,  0.39257812,  1.0429688 ,  0.51464844,
        -1.0742188 ],
       [-0.9140625 , -0.2175293 ,  0.50390625,  0.61035156, -0.10705566,
        -0.9604492 ],
       [-1.9589844 , -0.9663086 , -0.03497314,  1.2304688 ,  0.5136719 ,
        -1.7626953 ]], dtype=float32), label_ids=array([[0., 1., 1., 0., 0., 0.],
       [0., 0., 0., 1., 1., 0.],
       [0., 0., 1., 1., 1., 0.],
       ...,
       [0., 0., 0., 1., 1., 0.],
       [1., 1., 1., 1., 1., 1.],
       [0., 0., 1., 1., 1., 0.]], dtype=float32), metrics={'test_loss': 0.009995140135288239, 'test_f1': 0.7

In [102]:
trainer_predict_json_serializable = {
    'predictions': trainer_predict.predictions.tolist(),  # Convert Numpy array to list
    'label_ids'  : trainer_predict.label_ids.tolist(),    # Convert Numpy array to list
    'metrics'    : trainer_predict.metrics                # Dictionary is already serializable
}

In [103]:
file_path = "trainer_predict.json"
with open(file_path, "w") as f:
  json.dump(trainer_predict_json_serializable, f)
print(f"Test output successfully saved to {file_path}.")

Test output successfully saved to trainer_predict.json.


In [104]:
print("Second test successfully completed.")

Second test successfully completed.


Upload Test Results to HF repo_id_dataset

In [105]:
upload_file(
    path_or_fileobj = f"{phase_test}_results.csv",
    path_in_repo    = f"{phase_test}_results.csv",
    repo_id         = HF_name,
    repo_type       = 'dataset'
)
upload_file(
    path_or_fileobj = f"{phase_test}_metrics.json",
    path_in_repo    = f"{phase_test}_metrics.json",
    repo_id         = HF_name,
    repo_type       = 'dataset'
)
upload_file(
    path_or_fileobj = 'trainer_predict.json',
    path_in_repo    = 'trainer_predict.json',
    repo_id         = HF_name,
    repo_type       = 'dataset'
)

CommitInfo(commit_url='https://huggingface.co/datasets/claudelepere/skill_classification/commit/d4353e3b2565c7c46a55a10c03950b97e00f4b24', commit_message='Upload trainer_predict.json with huggingface_hub', commit_description='', oid='d4353e3b2565c7c46a55a10c03950b97e00f4b24', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/claudelepere/skill_classification', endpoint='https://huggingface.co', repo_type='dataset', repo_id='claudelepere/skill_classification'), pr_revision=None, pr_num=None)

In [106]:
print("It's the end")

It's the end
