<a href="https://colab.research.google.com/github/claudelepere/ML_GitHub/blob/main/Longformer_11_1200_32_768_weighted_loss_function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q accelerate
!pip install -q huggingface_hub
!pip install -q scikit-learn
!pip install -q transformers datasets  # transformers and datasets are Hugging Face libraries
!pip install -q wandb

import json
import numpy as np
import os
import sys
import time
import torch
import wandb

from datasets              import DatasetDict
from google.colab          import auth, drive, files, userdata
from huggingface_hub       import create_repo, login, upload_file
from huggingface_hub.utils import RepositoryNotFoundError
from sklearn.metrics       import accuracy_score, average_precision_score, classification_report, f1_score, precision_score, precision_recall_fscore_support, recall_score, roc_auc_score
from torch.utils.data      import DataLoader
from tqdm.auto             import tqdm
from transformers          import AdamW, EvalPrediction, LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments
from torch.nn              import BCEWithLogitsLoss


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m17.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fo

In [2]:
"""
# Check the Python version
print(sys.version)
print()

# Get the installed packages (you can see that conda is not installed (do not install it))
!pip list
print()

# Check system information
!cat /etc/os-release
!uname -m
print()

# Check the GPU details (only if the runtime type is T4 GPU)
#!nvidia-smi
#print()

# Check RAM
!free -h
print()

# Check disk space
!df -h
print()

# Get environment variables
for key, value in os.environ.items():
    print(f"{key}: {value}")
"""
!python -V

print(f"currentdir: {os.getcwd()}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

datasetDict_zip_file_name = "dataset_11_1200.zip"
datasetDict_dir_name      = os.path.splitext(datasetDict_zip_file_name)[0]
print(f"datasetDict_zip_file_name: {datasetDict_zip_file_name}")
print(f"datasetDict_dir_name     : {datasetDict_dir_name}")
print()

# OOM: reduce batch size
#      small sizes (1 to 32):            PROs: better generalization in some cases
#                                        CONs: may produce noisier gradients
#      large sizes (128, 256, or higer): PROs: gradients are smoother, leading to more stable training
#                                        CONs: poorer generalization (overfitting) in some cases
#      intermediate sizes (32, 64):      combines the benefits of small and large sizes
batch_size = 8

# OOM: enable gradient accumulation to compensate for smaller batch sizes by accumulating gradients over several steps
#      effective batch size = per-device batch size x gradient accumulation steps;
#      in each iteration, the model computes the gradients, these gradients are immediately used to update the model parameters
gradient_accumulation_steps = 4  #<<<<<<<<<<<<<<<<<<< gradient_accumulation_steps may not be None => comment it in TrainingArguments

# OOM: use PYTORCH_CUDA_ALLOC_CONF to handle memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# OOM: check for zombie processes
if torch.cuda.is_available():
  !nvidia-smi
  torch.cuda.memory_summary()
!ps aux | grep python
#!kill -9 <PID>
#!nvidia-smi     # Checked if killed

# OOM: use fp16 (half precision) mixed precision training
#      reduces memory requirements by up to 50%
fp16 = True

# OOM: limit the number of GPU workers: 0 (default) or 1 in Colab
#dataloader_num_workers = 1

# OOM: reduce model size or input tokens
#      1) LongformerTokenizer.from_pretrained('allenai/longformer-base/large-4096'): large/base: 435M/149M parameters
#      2) max_length: 4096 max for Longformer; 1 word can give several tokens, stop words are NOT discarded!
#         word_text_length_counts_sorted: jobs count                 : 50000
#                                         jobs count under  512 words: 44794  89.59%
#                                         jobs count under  640 words: 47894  95.79%
#                                         jobs count under  768 words: 49123  98.25%
#                                         jobs count under  896 words: 49691  99.38%
#                                         jobs count under 1024 words: 49917  99.83%
#                                         jobs count under 2048 words: 50000 100.00%
#                                         jobs count under 4096 words: 50000 100.00%
max_length = 768

# OOM: free up GPU memory
torch.cuda.empty_cache()

# OOM: monitor GPU memory usage
#!nividia-smi

# 1 epoch is a complete pass through the entire training dataset;
# with n datapoints and batch size = b, n/b iterations to complete 1 epoch;
# 1 iteration is a single update of the model's parameters
epochs = 5

# A common rule is to scale the learning rate proportionaly with the effective batch size
# note: get_linear_schedule_with_warmup <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
learning_rate = 2e-5  # 1e-5 x 32/8

# Reduce the number of transformers layers
#hidden_layers = 12    # 12 (default) or 6

# Threshold: 0.5 (default)
threshold = 0.2

if fp16:
  _fp = "fp16"
else:
  _fp = "fp32"

if 'gradient_accumulation_steps' not in globals():
  run_name = f"Longformer-multilabel-{datasetDict_dir_name}-length{max_length}-batch{batch_size}-epochs{epochs}-lr{learning_rate}-{_fp}-threshold{threshold}"
else:
  run_name = f"Longformer-multilabel-{datasetDict_dir_name}-length{max_length}-batch{batch_size}x{gradient_accumulation_steps}-epochs{epochs}-lr{learning_rate}-{_fp}-threshold{threshold}"
print(f"run_name                 : {run_name}")

Python 3.10.12
currentdir: /content
device: cpu
datasetDict_zip_file_name: dataset_11_1200.zip
datasetDict_dir_name     : dataset_11_1200

root          69 11.1  0.0      0     0 ?        Z    16:57   0:16 [python3] <defunct>
root          70  0.5  0.3  66788 52328 ?        S    16:57   0:00 python3 /usr/local/bin/colab-file
root          87  3.4  0.8 363272 117972 ?       Sl   16:57   0:05 /usr/bin/python3 /usr/local/bin/j
root         333 26.2  8.4 5467952 1125252 ?     Ssl  16:58   0:23 /usr/bin/python3 -m colab_kernel_
root         368  1.0  0.1 543988 18136 ?        Sl   16:58   0:00 /usr/bin/python3 /usr/local/lib/p
root         787  0.0  0.0   7376  3400 ?        S    17:00   0:00 /bin/bash -c ps aux | grep python
root         789  0.0  0.0   6484  2256 ?        S    17:00   0:00 grep python
run_name                 : Longformer-multilabel-dataset_11_1200-length768-batch8x4-epochs5-lr2e-05-fp16-threshold0.2


In [3]:
def upload_unzip_dataset(file_name=datasetDict_zip_file_name):
  # Check if the file exists
  if not os.path.exists(file_name):
    print(f"'{file_name}' not found in /content. Uploading...")
    uploaded_files = files.upload()                              # Prompt file upload dialog
    if file_name not in uploaded_files:
      raise FileNotFoundError(f"'{file_name}' was not uploaded. Please try again.")
    print(f"'{file_name}' successfully uploaded to /content")
    uploaded_file_name = list(uploaded_files.keys())[0]          # Get the name of the uploaded file

    !unzip {uploaded_file_name}

    unzipped_dir_name = os.path.splitext(uploaded_file_name)[0]
    assert unzipped_dir_name==datasetDict_dir_name, "unzipped_dir_name != datasetDict_dir_name"
  else:
    print(f"'{datasetDict_dir_name}' already exists in /content.")


In [4]:
upload_unzip_dataset(datasetDict_zip_file_name)

'dataset_11_1200.zip' not found in /content. Uploading...


Saving dataset_11_1200.zip to dataset_11_1200.zip
'dataset_11_1200.zip' successfully uploaded to /content
Archive:  dataset_11_1200.zip
  inflating: dataset_11_1200/dataset_dict.json  
  inflating: dataset_11_1200/test/data-00000-of-00001.arrow  
  inflating: dataset_11_1200/test/dataset_info.json  
  inflating: dataset_11_1200/test/state.json  
  inflating: dataset_11_1200/train/data-00000-of-00001.arrow  
  inflating: dataset_11_1200/train/dataset_info.json  
  inflating: dataset_11_1200/train/state.json  
  inflating: dataset_11_1200/validation/data-00000-of-00001.arrow  
  inflating: dataset_11_1200/validation/dataset_info.json  
  inflating: dataset_11_1200/validation/state.json  


In [5]:
# Hugging Face Authenticate

os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")    # Store the key in os.environ
hf_token               = os.environ.get('HF_TOKEN')
login(token=hf_token)

# Verify
!huggingface-cli whoami

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


claudelepere


In [6]:
# Create the skill_classification repo on the Hugging Face Hub

HF_name         = "claudelepere/skill_classification"
repo_id_model   = HF_name
repo_id_dataset = HF_name

repo_model_url = create_repo(
    repo_id   = repo_id_model,
    repo_type = "model",
    private   = True,
    exist_ok  = True
)
print(f"Repo model url: {repo_model_url} created successfully as a private repo.")

repo_dataset_url = create_repo(
    repo_id   = repo_id_dataset,
    repo_type = "dataset",
    private   = True,
    exist_ok  = True
)
print(f"Repo datasets url: {repo_dataset_url} created successfully as a private repo.")

repo_id_dataset = f"datasets/{HF_name}"

print(f"repo_id_model: {repo_id_model}")
print(f"repo_id_dataset: {repo_id_dataset}")

Repo model url: https://huggingface.co/claudelepere/skill_classification created successfully as a private repo.
Repo datasets url: https://huggingface.co/datasets/claudelepere/skill_classification created successfully as a private repo.
repo_id_model: claudelepere/skill_classification
repo_id_dataset: datasets/claudelepere/skill_classification


In [7]:
# W&B initialization

os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")        # Store the key in os.environ
wandb_api_key               = os.environ.get('WANDB_API_KEY')
wandb.login(key=wandb_api_key)

try:
  wandb.init(
      project = "skill_classification",
      name    = run_name,
      entity  = "claudelepere-c-cile-cy",
      config  = {
          "learning_rate": learning_rate,
          "epochs"       : 5,
          "batch_size"   : batch_size
      }
  )
except wandb.errors.CommError as err:
  print(f"CommError: {err}")
except Exception as exc:
  print(f"Exception: {exc}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mclaudelepere[0m ([33mclaudelepere-c-cile-cy[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [8]:
# Create the dataset: 3 Hugging Face Dataset in a Hugging Face DatasetDict

datasetDict = DatasetDict.load_from_disk(datasetDict_dir_name)

print(f"datasetDict: {type(datasetDict)} {datasetDict.shape}\n{datasetDict}")
print(f"datasetDict.keys(): {datasetDict.keys()}")
print(f"datasetDict['train']: {type(datasetDict['train'])} {datasetDict['train'].shape}")
print(f"datasetDict['validation']: {type(datasetDict['validation'])} {datasetDict['validation'].shape}")
print(f"datasetDict['test']: {type(datasetDict['test'])} {datasetDict['test'].shape}")

datasetDict: <class 'datasets.dataset_dict.DatasetDict'> {'train': (960, 8), 'validation': (120, 8), 'test': (120, 8)}
DatasetDict({
    train: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 960
    })
    validation: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 120
    })
    test: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 120
    })
})
datasetDict.keys(): dict_keys(['train', 'validation', 'test'])
datasetDict['train']: <class 'datasets.arrow_dataset.Dataset'> (960, 8)
datasetDict['validation']: <class 'datasets.arrow_dataset.Dataset'> (120, 8)
datasetDict['test']: <class 'datasets.arrow_dataset.Dataset'> (120, 8)


In [9]:
example = datasetDict['train'][0]
print(f"datasetDict['train'][0]: {type(example)} {example.keys()}\n{example}")

datasetDict['train'][0]: <class 'dict'> dict_keys(['id', 'text', '390', '135', '136', '137', '138', '139'])
{'id': 331119, 'text': 'Atcon Global - Scrum Master Scrum,Agile Atcon Global Context of the mission/ objective(s) of the job What we are looking for: Transparency and honesty on the application. If the candidate has not yet embraced the role of a Scrum Master in different companies for at least 3 years, please write a very good motivation on why Scrum Master is her/his new career move. We are looking for long-term collaboration with people human centric who will focus on the people in the team to support. Agile hard skills are necessary however, Agile soft skills are mandatory! Client Group started to implement Lean six years ago with Application Development and Maintenance (Business Analyst and IT) and some projects in the business (outside IT) with the Scrum framework. Now, new Agile initiatives are popping up a bit everywhere in the company. From prototyping in our incubator t

In [10]:
# Create the label list and the id2label and label2id mappings.

"""
dataset 7_1000_125_125  ,  48 labels
dataset 7_128_18_54     ,  42 labels
dataset 8910_1087_68_204, 206 labels
dataset 11_1000         ,   6 labels
"""

labels = [label for label in datasetDict['train'].features.keys() if label not in ['id', 'text']]
labels.sort()
print(f"labels: {type(labels)} {len(labels)}\n{labels}")

id2label = {idx:label for idx, label in enumerate(labels)}
print(f"id2label: {type(id2label)} {len(id2label)}\n{id2label}")

label2id = {label:idx for idx, label in enumerate(labels)}
print(f"label2id: {type(label2id)} {len(label2id)}\n{label2id}")

labels: <class 'list'> 6
['135', '136', '137', '138', '139', '390']
id2label: <class 'dict'> 6
{0: '135', 1: '136', 2: '137', 3: '138', 4: '139', 5: '390'}
label2id: <class 'dict'> 6
{'135': 0, '136': 1, '137': 2, '138': 3, '139': 4, '390': 5}


In [11]:
# Load tokenizer and model

model_name = "allenai/longformer-base-4096"

tokenizer = LongformerTokenizerFast.from_pretrained(model_name)

model = LongformerForSequenceClassification.from_pretrained(
    model_name,
    num_labels        = len(labels),
#    num_hidden_layers = hidden_layers,
    problem_type      = 'multi_label_classification')

# Configure attention window size
model.config.attention_window = 512

optimizer = AdamW(model.parameters(), lr=learning_rate)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Tokenize ('input_ids' and 'attention_mask'), add 'global_attention_mask' (for Longformer), add 'labels'

def preprocess_data(examples, indices):
  # Step 1: Extract text and tokenize
  text = examples['text']             # Batch of texts
  encoding = tokenizer(
      text,                           # Tokenize text
      truncation     = True,
      padding        = 'max_length',
      max_length     = max_length,
      return_tensors = 'pt'           # Return PyTorch tensors
  )

  # Step 2: Create and add the global attention mask
  global_attention_mask             = torch.zeros_like(encoding['input_ids'])  # Initialize global attention mask with zeros (same shape as input_ids)
  global_attention_mask[:, 0]       = 1                                        # Set global attention on the first token ([CLS], token ID=0) in each sequence
  encoding['global_attention_mask'] = global_attention_mask                    # Add the global_attention_mask to the batch

  # Step 3: Create and populate the label matrix
  labels_matrix = torch.zeros((len(text), len(labels)), dtype=torch.float32)   # Create an empty label matrix
  #print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")
  #---------Populate label matrix
  for idx, label in enumerate(labels):
    #print(f"idx:{idx} label:{label}")
    if label in examples:
      labels_matrix[:, idx] = torch.tensor(
          [1.0 if val else 0.0 for val in examples[label]],
          dtype=torch.float32
          )
  #print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")

  encoding['labels'] = labels_matrix                                           # Add labels to the encoding
  #print(f"encoding['labels']: {encoding['labels']}")

  # encoding: <class 'transformers.tokenization_utils_base.BatchEncoding'> dict_keys(['input_ids', 'attention_mask', 'global_attention_mask', 'labels'])
  #   'input_ids': tensor([[
  #   'attention_mask': tensor([[
  #   'global_attention_mask': tensor([[
  #   'labels': tensor([[
  print(f"encoding: {type(encoding)} {encoding.keys()}\n{encoding}")

  return encoding

In [13]:
# Create the 3 encoded datasets, train, validation and test

encoded_dataset = datasetDict.map(
    preprocess_data,
    batched        = True,
    remove_columns = datasetDict['train'].column_names,
    with_indices   = True
    )
train_dataset      = encoded_dataset['train']
validation_dataset = encoded_dataset['validation']
test_dataset       = encoded_dataset['test']

print(f"encoded_dataset: {type(encoded_dataset)} {encoded_dataset.shape}\n{encoded_dataset}")
print(f"train_dataset: {type(train_dataset)} {train_dataset.shape} {train_dataset.features}")
print(f"validation_dataset: {type(validation_dataset)} {validation_dataset.shape} {validation_dataset.features}")
print(f"test_dataset: {type(test_dataset)} {test_dataset.shape} {test_dataset.features}")
print("---")
print(f"test_dataset['labels']: {type(test_dataset['labels'])} {len(test_dataset['labels'])}\n{test_dataset['labels']}")
print("---")
print(f"train_dataset[0]['input_ids']: {type(train_dataset[0]['input_ids'])} {len(train_dataset[0]['input_ids'])}\n{train_dataset['input_ids'][0]}")
print(f"train_dataset[0]['attention_mask']: {type(train_dataset[0]['attention_mask'])} {len(train_dataset[0]['attention_mask'])}\n{train_dataset['attention_mask'][0]}")
print(f"train_dataset[0]['global_attention_mask']: {type(train_dataset[0]['global_attention_mask'])} {len(train_dataset[0]['global_attention_mask'])}\n{train_dataset['global_attention_mask'][0]}")
print(f"train_dataset[0]['labels']: {type(train_dataset[0]['labels'])} {len(train_dataset[0]['labels'])}\n{train_dataset[0]['labels']}")
print(f"train_dataset['labels'][0]: {type(train_dataset['labels'][0])} {len(train_dataset['labels'][0])}\n{train_dataset['labels'][0]}")


Map:   0%|          | 0/960 [00:00<?, ? examples/s]

encoding: <class 'transformers.tokenization_utils_base.BatchEncoding'> dict_keys(['input_ids', 'attention_mask', 'global_attention_mask', 'labels'])
{'input_ids': tensor([[    0,  3750,  3865,  ...,     1,     1,     1],
        [    0,  1000,  2253,  ...,     1,     1,     1],
        [    0, 38789, 21094,  ...,     1,     1,     1],
        ...,
        [    0, 32379,  7941,  ..., 23113,  3677,     2],
        [    0,   104,  1517,  ...,     1,     1,     1],
        [    0,   534,  1342,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'global_attention_mask': tensor([[1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        ...,
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0,

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

encoding: <class 'transformers.tokenization_utils_base.BatchEncoding'> dict_keys(['input_ids', 'attention_mask', 'global_attention_mask', 'labels'])
{'input_ids': tensor([[    0,  1121, 38823,  ...,     1,     1,     1],
        [    0, 17488,  9932,  ...,     1,     1,     1],
        [    0,  1121,   594,  ...,     1,     1,     1],
        ...,
        [    0, 43703,  1671,  ...,     7,   109,     2],
        [    0,   118, 36025,  ...,     1,     1,     1],
        [    0, 26145,  1290,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'global_attention_mask': tensor([[1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        ...,
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0,

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

encoding: <class 'transformers.tokenization_utils_base.BatchEncoding'> dict_keys(['input_ids', 'attention_mask', 'global_attention_mask', 'labels'])
{'input_ids': tensor([[    0,   104,  1517,  ..., 30884,     7,     2],
        [    0,  6209, 14286,  ...,    84,  2731,     2],
        [    0, 36812,  4189,  ...,     1,     1,     1],
        ...,
        [    0,   725,  7073,  ...,     1,     1,     1],
        [    0, 11094,  7424,  ...,     1,     1,     1],
        [    0, 36812,  4189,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'global_attention_mask': tensor([[1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        ...,
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0,

In [14]:
# Truncated part

def get_truncated_part(text):
  tokens = tokenizer(
      text,
      truncation                = True,
      padding                   = 'max_length',
      max_length                = max_length,
      return_overflowing_tokens = True,
      return_tensors            = None
      )
  print(f"tokens.keys(): {tokens.keys()}")

  # Get the truncated tokens
  truncated_ids = tokens["input_ids"][0]
  print(f"truncated_ids: {type(truncated_ids)} {truncated_ids}")
  #overflow_ids  = tokens["overflow_to_sample_mapping"][0]
  #print(f"overflow_ids: {type(overflow_ids)} {overflow_ids}")

  # Decode the tokens back to text
  truncated_text = tokenizer.decode(truncated_ids, skip_special_tokens=True)
  #overflow_text  = tokenizer.decode(overflow_ids, skip_special_tokens=True)

  print(f"original_text :\n{text}")
  print(f"truncated_text:\n{truncated_text}")
  #print(f"overflow_text:\n{overflow_text}")

  original_tokens  = tokenizer.tokenize(text)
  truncated_tokens = tokenizer.tokenize(truncated_text)
  #overflow_tokens  = tokenizer.tokenize(overflow_text)

  print(f"original_tokens count : {len(original_tokens)}")
  print(f"truncated_tokens count: {len(truncated_tokens)}")
  #print(f"overflow_tokens count: {len(overflow_tokens)}")


In [21]:
example_text = datasetDict['train'][0]['text']
get_truncated_part(example_text)

inputs = tokenizer(
    example_text,
    truncation     = True,
    padding        = 'max_length',
    max_length     = max_length,
    return_tensors = 'pt'
)

# inputs: <class 'transformers.tokenization_utils_base.BatchEncoding'> dict_keys(['input_ids', 'attention_mask'])
#   {'input_ids': tensor([[
#   'attention_mask': tensor([[
print(f"inputs: {type(inputs)} {inputs.keys()}") #\n{inputs}")
print(f"inputs_ids: {type(inputs.input_ids)} {inputs.input_ids.shape}\n{inputs.input_ids}")
print(f"attention_mask: {type(inputs.attention_mask)} {inputs.attention_mask.shape}\n{inputs.attention_mask}")
#print(f"token_type_ids: {inputs.token_type_ids.shape}")
#print(f"labels: {inputs.labels.shape}")
raise Exception("stop here")

tokens.keys(): dict_keys(['input_ids', 'attention_mask', 'overflow_to_sample_mapping'])
truncated_ids: <class 'list'> [0, 3750, 3865, 1849, 111, 2741, 10904, 6935, 2741, 10904, 6, 19897, 1848, 497, 3865, 1849, 43885, 9, 5, 2511, 73, 4554, 1640, 29, 43, 9, 5, 633, 653, 52, 32, 546, 13, 35, 24244, 8, 19439, 15, 5, 2502, 4, 318, 5, 1984, 34, 45, 648, 11589, 5, 774, 9, 10, 2741, 10904, 6935, 11, 430, 451, 13, 23, 513, 155, 107, 6, 2540, 3116, 10, 182, 205, 10563, 15, 596, 2741, 10904, 6935, 16, 69, 73, 12724, 92, 756, 517, 4, 166, 32, 546, 13, 251, 12, 1279, 4918, 19, 82, 1050, 715, 4063, 54, 40, 1056, 15, 5, 82, 11, 5, 165, 7, 323, 4, 3303, 1848, 543, 2417, 32, 2139, 959, 6, 3303, 1848, 3793, 2417, 32, 8549, 328, 33536, 826, 554, 7, 5731, 37882, 411, 107, 536, 19, 11199, 2717, 8, 29738, 36, 18562, 9821, 8, 3779, 43, 8, 103, 1377, 11, 5, 265, 36, 35301, 3779, 43, 19, 5, 2741, 10904, 7208, 4, 978, 6, 92, 3303, 1848, 5287, 32, 20220, 62, 10, 828, 6128, 11, 5, 138, 4, 1740, 40004, 154, 11, 84

AttributeError: 

In [None]:
# 4. Forward pass for multi-label classification

outputs = model(
    input_ids      = inputs.input_ids,
    attention_mask = inputs.attention_mask
    )

print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

# Logits (= raw model outputs)
logits = outputs.logits
print(f"logits: {type(logits)} {logits.shape}\n{logits}")

# Convert logits to probabilities
sigmoid = torch.nn.Sigmoid()
probs   = sigmoid(logits)
print(f"probs: {type(probs)} {probs.shape}\n{probs}")

In [None]:
example = encoded_dataset['train'][0]

#print(f"example: {type(example)} {example.keys()}\n{example}")
#print()
#print(f"example['input_ids']: {type(example['input_ids'])} {len(example['input_ids'])}\n{example['input_ids']}")
##print(f"example['token_type_ids']: {type(example['token_type_ids'])} {len(example['token_type_ids'])}\n{example['token_type_ids']}")
#print(f"example['attention_mask']: {type(example['attention_mask'])} {len(example['attention_mask'])}\n{example['attention_mask']}")
#print(f"example['labels']:  {type(example['labels'])} {len(example['labels'])}\n{example['labels']}")

In [None]:
tokenizer.decode(example['input_ids'])

In [None]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

In [None]:
# Set PyTorch format to ensures correctness and compatibility with PyTorch pipelines

# The 3 Hugging Face Dataset are formatted as PyTorch Dataset
encoded_dataset.set_format('torch')

In [None]:
batch_size  = batch_size
metric_name = "f1"

In [None]:
args = TrainingArguments(
    output_dir                  = './training_results',  # where model predictions and checkpoints will be written during training
    overwrite_output_dir        = True,
    logging_dir                 = './logs',
    logging_steps               = 50,
    save_steps                  = 500,
    save_total_limit            = 2,
    eval_strategy               = 'epoch',
    save_strategy               = 'epoch',
    learning_rate               = learning_rate,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    num_train_epochs            = epochs,
    weight_decay                = 0.01,
    load_best_model_at_end      = True,
    metric_for_best_model       = metric_name,
    run_name                    = run_name,
    fp16                        = fp16,
    #dataloader_num_workers      = dataloader_num_workers,
    report_to                  = 'wandb'
    )

In [None]:
# Metrics
#   source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/

def multi_label_metrics(predictions, labels):
    average = 'micro'    # 'micro' or 'weighted'

    # first, apply sigmoid on predictions whose shape is (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs   = sigmoid(torch.Tensor(predictions))

    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1

    # finally, compute metrics
    y_true               = labels
    f1                   = f1_score               (y_true=y_true, y_pred=y_pred, average=average)    #, zero_division=1)
    precision            = precision_score        (y_true=y_true, y_pred=y_pred, average=average)    #, zero_division=1)
    recall               = recall_score           (y_true=y_true, y_pred=y_pred, average=average)    #, zero_division=1)
    roc_auc              = roc_auc_score          (y_true=y_true, y_score=probs, average=average)
    precision_recall_auc = average_precision_score(y_true=y_true, y_score=probs, average=average)
    accuracy             = accuracy_score         (y_true=y_true, y_pred=y_pred)

    # return as dictionary
    metrics = {
        'f1'                  : f1,
        'precision'           : precision,
        'recall'              : recall,
        'roc_auc'             : roc_auc,
        'precision_recall_auc': precision_recall_auc,
        'accuracy'            : accuracy
        }

    return metrics

In [None]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(
        predictions = preds,
        labels      = p.label_ids
        )
    return result

Let's verify a batch as well as a forward pass:

In [None]:
print(f"input_ids:        {type(encoded_dataset['train']['input_ids'][0])}\t{encoded_dataset['train']['input_ids'][0].shape}")
#print(f"token_type_ids': {type(encoded_dataset['train']['token_type_ids'][0])}\t{encoded_dataset['train']['token_type_ids'][0].shape}")
print(f"attention_mask:  {type(encoded_dataset['train']['attention_mask'][0])}\t{encoded_dataset['train']['attention_mask'][0].shape}")
print(f"labels:          {type(encoded_dataset['train'][0]['labels'])}\t{encoded_dataset['train'][0]['labels'].shape}")

In [None]:
# Execute a forward pass for debugging or verification purposes (cf. BERT_3_1 in Notion BERT database)

outputs = model(
    input_ids      = encoded_dataset['train']['input_ids'][0].unsqueeze(0),
    attention_mask = encoded_dataset['train']['attention_mask'][0].unsqueeze(0),
    labels         = encoded_dataset['train'][0]['labels'].unsqueeze(0)
    )

print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

In [None]:
# Define the weighted loss function

class_weights = torch.tensor([7.68, 2.15, 0.61, 0.47, 0.68, 6.26], dtype=torch.float32).to(device)
loss_fn       = BCEWithLogitsLoss(pos_weight=class_weights)

In [None]:
# Custom Trainer

class CustomTrainer(Trainer):

  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    print(f"inputs passed to compute_loss: {inputs.keys()}")
    labels  = inputs.get('labels')     # Extract labels
    outputs = model(**inputs)          # Forward pass
    logits  = outputs.logits           # Get the logits
    loss    = loss_fn(logits, labels)  # Compute weighted loss

    return (loss, outputs) if return_outputs else loss

In [None]:
# Create a Hugging Face's transformers trainer (which abstracts the training loop)

trainer = CustomTrainer(
    model,
    args,
    train_dataset   = encoded_dataset["train"],
    eval_dataset    = encoded_dataset["validation"],
    tokenizer       = tokenizer,
    compute_metrics = compute_metrics
    )


In [None]:
# Train, save the results as a JSON file

train_output  = trainer.train()

train_results = {
    'global_step':   train_output.global_step,    # total steps completed during training
    'training_loss': train_output.training_loss,  # average loss during training
    'metrics':       train_output.metrics         # dictionary of metrics
}

# Save train results
with open("train_results.json", "w") as f:
  json.dump(train_results, f, indent=4)
print(f"train_results: {type(train_results)} {len(train_results)}\n{train_results}")

In [None]:
print("Training successfully completed.")

## Evaluate

After training, we evaluate our model on the validation set.

In [None]:
def get_results(model, dataset, batch_size, threshold):
  # Clear GPU cache
  torch.cuda.empty_cache()

  # Set the model to evaluation mode to disable dropout and other training-specific behaviors
  model.eval()

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  test_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

  all_preds       = []
  all_probs       = []
  all_true_labels = []

  for batch in tqdm(test_loader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
      outputs = model(**batch)
    logits = outputs.logits

    # Convert logits to probabilities and probabilities to predictions
    sigmoid = torch.nn.Sigmoid()
    probs   = sigmoid(logits).cpu().numpy()    # Convert to Numpy
    preds   = (probs > threshold).astype(int)  # Convert to binary Numpy array

    # Accumulate probabilities, predictions and labels
    all_probs.append(probs)
    all_preds.append(preds)
    all_true_labels.append(batch['labels'].cpu().numpy())

  # Concatenate results from all batches
  all_probs       = np.concatenate(all_probs, axis=0)        # shape: [num_samples, num_labels]
  all_preds       = np.concatenate(all_preds, axis=0)        # shape: [num_samples, num_labels]
  all_true_labels = np.concatenate(all_true_labels, axis=0)  # shape: [num_samples, num_labels]

  print(f"all_probs:       {type(all_probs)} {all_probs.shape}")
  print(f"all_preds:       {type(all_preds)} {all_preds.shape}")
  print(f"all_true_labels: {type(all_true_labels)} {all_true_labels.shape}")

  # Classification report for precision, recall, F1 score
  print(classification_report(
      y_true        = all_true_labels,
      y_pred        = all_preds,
      target_names  = labels,
      zero_division = 0
      ))

  # ROC AUC for multi-label classification
  roc_auc = roc_auc_score(
      y_true  = all_true_labels,
      y_score = all_probs,
      average = 'micro'
      )
  print(f"ROC AUC: {roc_auc}")

In [None]:
# First evaluate results NO SAVE

get_results(model=model, dataset=validation_dataset, batch_size=batch_size, threshold=threshold)

In [None]:
print("First evaluation successfully completed.")

In [None]:
# Second evaluate results; save to /content

eval_output = trainer.evaluate()

# Save evaluate results
with open("eval_results.json", "w") as f:
  json.dump(eval_output, f, indent=4)

In [None]:
print("Second evaluation successfully completed.")

## Upload model, tokenizer, train results, evaluate results

In [None]:
# Save model to /content

model_path = "model"
trainer.save_model(model_path)

In [None]:
# Upload model and tokenizer to the HF repo_id_model

tokenizer = LongformerTokenizerFast.from_pretrained(model_path)
model     = LongformerForSequenceClassification.from_pretrained(model_path)

tokenizer.push_to_hub(repo_id_model)
model.push_to_hub(repo_id_model)


In [None]:
# Upload train_results.json and eval_results.json to the HF repo_id_dataset BETTER to upload to wanddb repo?

upload_file(
    path_or_fileobj = "train_results.json",
    path_in_repo    = "train_results.json",
    repo_id         = HF_name,
    repo_type       = "dataset"
    )

upload_file(
    path_or_fileobj = "eval_results.json",
    path_in_repo    = "eval_results.json",
    repo_id         = HF_name,
    repo_type       = "dataset"
    )

## Test

In [None]:
# Test: first results NO SAVE

get_results(model=model, dataset=test_dataset, batch_size=batch_size, threshold=threshold)

In [None]:
print("First test successfully completed.")

In [None]:
# Test: second results NO SAVE

predictions = trainer.predict(test_dataset)

#print(f"predictions.predictions: {type(predictions.predictions)} {predictions.predictions.shape}\n{predictions.predictions}")  # Model logits
#print(f"predictions.label_ids: {type(predictions.label_ids)} {predictions.label_ids.shape}\n{predictions.label_ids}")          # Ground truth labels
print(f"predictions.metrics: {type(predictions.metrics)} {len(predictions.metrics)}\n{predictions.metrics}")                  # Metrics


In [None]:
print("Second test successfully completed.")

### Or otherwise

In [None]:
# Test: third results NO SAVE

predictions = trainer.predict(test_dataset)
#print(predictions.predictions)  # Model logits
#print(predictions.label_ids)    # Ground truth labels
print(predictions.metrics)      # Metrics

In [None]:
print("Third test successfully completed.")