<a href="https://colab.research.google.com/github/claudelepere/ML_GitHub/blob/main/Longformer_11_10000_768.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q accelerate
!pip install -q huggingface_hub
!pip install -q scikit-learn
!pip install -q transformers datasets
!pip install -q wandb

import json
import numpy as np
import os
import sys
import time
import torch
import wandb

from datasets              import DatasetDict
from google.colab          import auth, drive, files, userdata
from huggingface_hub       import create_repo, login, upload_file
from huggingface_hub.utils import RepositoryNotFoundError
from sklearn.metrics       import accuracy_score, average_precision_score, classification_report, f1_score, precision_score, precision_recall_fscore_support, recall_score, roc_auc_score
from torch.utils.data      import DataLoader
from tqdm.auto             import tqdm
from transformers          import AdamW, EvalPrediction, LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments
from torch.nn              import BCEWithLogitsLoss


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
"""
# Check the Python version
print(sys.version)
print()

# Get the installed packages (you can see that conda is not installed (do not install it))
!pip list
print()

# Check system information
!cat /etc/os-release
!uname -m
print()

# Check the GPU details (only if the runtime type is T4 GPU)
#!nvidia-smi
#print()

# Check RAM
!free -h
print()

# Check disk space
!df -h
print()

# Get environment variables
for key, value in os.environ.items():
    print(f"{key}: {value}")
"""
!python -V

print(f"currentdir: {os.getcwd()}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

datasetDict_zip_file_name = "dataset_11_10000.zip"
datasetDict_dir_name      = os.path.splitext(datasetDict_zip_file_name)[0]
print(f"datasetDict_zip_file_name: {datasetDict_zip_file_name}")
print(f"datasetDict_dir_name     : {datasetDict_dir_name}")
print()

# OOM: reduce batch size
#      small sizes (1 to 32):            PROs: better generalization in some cases
#                                        CONs: may produce noisier gradients
#      large sizes (128, 256, or higer): PROs: gradients are smoother, leading to more stable training
#                                        CONs: poorer generalization (overfitting) in some cases
#      intermediate sizes (32, 64):      combines the benefits of small and large sizes
batch_size = 8

# OOM: enable gradient accumulation to compensate for smaller batch sizes by accumulating gradients over several steps
#      effective batch size = per-device batch size x gradient accumulation steps;
#      in each iteration, the model computes the gradients, these gradients are immediately used to update the model parameters
gradient_accumulation_steps = 4

# OOM: use PYTORCH_CUDA_ALLOC_CONF to handle memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# OOM: check for zombie processes
if torch.cuda.is_available():
  !nvidia-smi
  torch.cuda.memory_summary()
!ps aux | grep python
#!kill -9 <PID>
#!nvidia-smi     # Checked if killed

# OOM: use fp16 (half precision) mixed precision training
#      reduces memory requirements by up to 50%
fp16 = True

# OOM: limit the number of GPU workers: 0 (default) or 1 in Colab
#dataloader_num_workers = 1

# OOM: reduce model size or input tokens
#      1) LongformerTokenizer.from_pretrained('allenai/longformer-base/large-4096'): large/base: 435M/149M parameters
#      2) max_length: 4096 max for Longformer; 1 word can give several tokens, stop words are NOT discarded!
#         word_text_length_counts_sorted: jobs count                 : 50000
#                                         jobs count under  512 words: 44794  89.59%
#                                         jobs count under  640 words: 47894  95.79%
#                                         jobs count under  768 words: 49123  98.25%
#                                         jobs count under  896 words: 49691  99.38%
#                                         jobs count under 1024 words: 49917  99.83%
#                                         jobs count under 2048 words: 50000 100.00%
#                                         jobs count under 4096 words: 50000 100.00%
max_length = 768

# OOM: free up GPU memory
torch.cuda.empty_cache()

# OOM: monitor GPU memory usage
#!nividia-smi

# 1 epoch is a complete pass through the entire training dataset;
# with n datapoints and batch size = b, n/b iterations to complete 1 epoch;
# 1 iteration is a single update of the model's parameters
epochs = 5

# A common rule is to scale the learning rate proportionaly with the effective batch size
# note: get_linear_schedule_with_warmup <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
learning_rate = 2e-5  # 1e-5 x 32/8

# Reduce the number of transformers layers
#hidden_layers = 12    # 12 (default) or 6

# Threshold: 0.5 (default)
threshold = 0.2

if fp16:
  _fp = "fp16"
else:
  _fp = "fp32"
run_name = f"Longformer-multilabel-{datasetDict_dir_name}-length{max_length}-batch{batch_size}x{gradient_accumulation_steps}-epochs{epochs}-lr{learning_rate}-{_fp}-threshold{threshold}"
print(f"run_name                 : {run_name}")

Python 3.10.12
currentdir: /content
device: cuda
datasetDict_zip_file_name: dataset_11_10000.zip
datasetDict_dir_name     : dataset_11_10000

Thu Dec 26 09:36:14 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+---------------

In [3]:
def upload_unzip_dataset(file_name=datasetDict_zip_file_name):
  # Check if the file exists
  if not os.path.exists(file_name):
    print(f"'{file_name}' not found in /content. Uploading...")
    uploaded_files = files.upload()                              # Prompt file upload dialog
    if file_name not in uploaded_files:
      raise FileNotFoundError(f"'{file_name}' was not uploaded. Please try again.")
    print(f"'{file_name}' successfully uploaded to /content")
    uploaded_file_name = list(uploaded_files.keys())[0]          # Get the name of the uploaded file

    !unzip {uploaded_file_name}

    unzipped_dir_name = os.path.splitext(uploaded_file_name)[0]
    assert unzipped_dir_name==datasetDict_dir_name, "unzipped_dir_name != datasetDict_dir_name"
  else:
    print(f"'{datasetDict_dir_name}' already exists in /content.")


In [4]:
upload_unzip_dataset(datasetDict_zip_file_name)

'dataset_11_10000.zip' not found in /content. Uploading...


Saving dataset_11_10000.zip to dataset_11_10000.zip
'dataset_11_10000.zip' successfully uploaded to /content
Archive:  dataset_11_10000.zip
  inflating: dataset_11_10000/dataset_dict.json  
  inflating: dataset_11_10000/test/data-00000-of-00001.arrow  
  inflating: dataset_11_10000/test/dataset_info.json  
  inflating: dataset_11_10000/test/state.json  
  inflating: dataset_11_10000/train/data-00000-of-00001.arrow  
  inflating: dataset_11_10000/train/dataset_info.json  
  inflating: dataset_11_10000/train/state.json  
  inflating: dataset_11_10000/validation/data-00000-of-00001.arrow  
  inflating: dataset_11_10000/validation/dataset_info.json  
  inflating: dataset_11_10000/validation/state.json  


In [5]:
# Hugging Face Authenticate

os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")    # Store the key in os.environ
hf_token               = os.environ.get('HF_TOKEN')
login(token=hf_token)

# Verify
!huggingface-cli whoami

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


claudelepere


In [6]:
# Create the skill_classification repo on the Hugging Face Hub

HF_name         = "claudelepere/skill_classification"
repo_id_model   = HF_name
repo_id_dataset = HF_name

repo_model_url = create_repo(
    repo_id   = repo_id_model,
    repo_type = "model",
    private   = True,
    exist_ok  = True
)
print(f"Repo model url: {repo_model_url} created successfully as a private repo.")

repo_dataset_url = create_repo(
    repo_id   = repo_id_dataset,
    repo_type = "dataset",
    private   = True,
    exist_ok  = True
)
print(f"Repo datasets url: {repo_dataset_url} created successfully as a private repo.")

repo_id_dataset = f"datasets/{HF_name}"

print(f"repo_id_model: {repo_id_model}")
print(f"repo_id_dataset: {repo_id_dataset}")

Repo model url: https://huggingface.co/claudelepere/skill_classification created successfully as a private repo.
Repo datasets url: https://huggingface.co/datasets/claudelepere/skill_classification created successfully as a private repo.
repo_id_model: claudelepere/skill_classification
repo_id_dataset: datasets/claudelepere/skill_classification


In [7]:
# W&B initialization

os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")        # Store the key in os.environ
wandb_api_key               = os.environ.get('WANDB_API_KEY')
wandb.login(key=wandb_api_key)

try:
  wandb.init(
      project = "skill_classification",
      name    = run_name,
      entity  = "claudelepere-c-cile-cy",
      config  = {
          "learning_rate": learning_rate,
          "epochs"       : 5,
          "batch_size"   : batch_size
      }
  )
except wandb.errors.CommError as err:
  print(f"CommError: {err}")
except Exception as exc:
  print(f"Exception: {exc}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mclaudelepere[0m ([33mclaudelepere-c-cile-cy[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [8]:
# Create the dataset: 3 Hugging Face Dataset in a Hugging Face DatasetDict

datasetDict = DatasetDict.load_from_disk(datasetDict_dir_name)

print(f"datasetDict: {type(datasetDict)} {datasetDict.shape}\n{datasetDict}")


datasetDict: <class 'datasets.dataset_dict.DatasetDict'> {'train': (7000, 8), 'validation': (1500, 8), 'test': (1500, 8)}
DatasetDict({
    train: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
        num_rows: 1500
    })
})


In [9]:
example = datasetDict['train'][0]
print(f"example: {type(example)} {example.keys()}\n{example}")

example: <class 'dict'> dict_keys(['id', 'text', '390', '135', '136', '137', '138', '139'])
{'id': 155200, 'text': 'Vivid Resourcing - Service Engineer Service, Engineer, Mechanical Vivid Resourcing Main Tasks Provide high level support to a varied range of external customers & internal stakeholders to ensure a high performance of machinery and a strong level of satisfaction from end users and contributors Take a thorough approach in seeking, and diagnosing, bugs or problems relating to either the electronic Hardware or software related problems Document results of analysis, pass to colleagues in R&D to help form the basis of modifications and amendments to product ranges Skills Degree in a related field i.e. mechanical engineering, electronics, automation etc. Strong analytical mindset and excellent problem-solving abilities Interest in mechnical engineering and effective automation Passion for learning new technologies and skills, especially relating to software & programming Fluent 

In [10]:
# Create the label list and the id2label and label2id mappings.

"""
dataset 7_1000_125_125  ,  48 labels
dataset 7_128_18_54     ,  42 labels
dataset 8910_1087_68_204, 206 labels
dataset 11_1000         ,   6 labels
"""

labels = [label for label in datasetDict['train'].features.keys() if label not in ['id', 'text']]
labels.sort()
print(f"labels: {type(labels)} {len(labels)}\n{labels}")

id2label = {idx:label for idx, label in enumerate(labels)}
print(f"id2label: {type(id2label)} {len(id2label)}\n{id2label}")

label2id = {label:idx for idx, label in enumerate(labels)}
print(f"label2id: {type(label2id)} {len(label2id)}\n{label2id}")

labels: <class 'list'> 6
['135', '136', '137', '138', '139', '390']
id2label: <class 'dict'> 6
{0: '135', 1: '136', 2: '137', 3: '138', 4: '139', 5: '390'}
label2id: <class 'dict'> 6
{'135': 0, '136': 1, '137': 2, '138': 3, '139': 4, '390': 5}


In [11]:
# Load tokenizer and model

model_name = "allenai/longformer-base-4096"

tokenizer = LongformerTokenizerFast.from_pretrained(model_name)

model = LongformerForSequenceClassification.from_pretrained(
    model_name,
    num_labels        = len(labels),
#    num_hidden_layers = hidden_layers,
    problem_type      = 'multi_label_classification')

# Configure attention window size
model.config.attention_window = 512

optimizer = AdamW(model.parameters(), lr=learning_rate)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Tokenize ('input_ids' and 'attention_mask'), add 'global_attention_mask' (for Longformer), add 'labels'

def preprocess_data(examples, indices):
  # Step 1: Extract text and tokenize
  text = examples['text']             # Batch of texts
  encoding = tokenizer(
      text,                           # Tokenize text
      truncation     = True,
      padding        = 'max_length',
      max_length     = max_length,
      return_tensors = 'pt'           # Return PyTorch tensors
  )

  # Step 2: Create and add the global attention mask
  global_attention_mask             = torch.zeros_like(encoding['input_ids'])  # Initialize global attention mask with zeros (same shape as input_ids)
  global_attention_mask[:, 0]       = 1                                        # Set global attention on the first token ([CLS], token ID=0) in each sequence
  encoding['global_attention_mask'] = global_attention_mask                    # Add the global_attention_mask to the batch

  # Step 3: Create and populate the label matrix

  labels_matrix = torch.zeros((len(text), len(labels)), dtype=torch.float32)   # Create an empty label matrix
  #print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")

  # Populate label matrix
  for idx, label in enumerate(labels):
    #print(f"idx:{idx} label:{label}")
    if label in examples:
      labels_matrix[:, idx] = torch.tensor(
          [1.0 if val else 0.0 for val in examples[label]],
          dtype=torch.float32
          )
  #print(f"labels_matrix: {type(labels_matrix)} {labels_matrix.shape}")

  encoding['labels'] = labels_matrix                                           # Add labels to the encoding
  #print(f"encoding['labels']: {encoding['labels']}")

  return encoding

In [13]:
# Create the 3 encoded datasets, train, validation and test

encoded_dataset = datasetDict.map(
    preprocess_data,
    batched        = True,
    remove_columns = datasetDict['train'].column_names,
    with_indices   = True
    )
train_dataset      = encoded_dataset['train']
validation_dataset = encoded_dataset['validation']
test_dataset       = encoded_dataset['test']
print(f"encoded_dataset: {type(encoded_dataset)} {encoded_dataset.shape}\n{encoded_dataset}")
print(f"train_dataset: {type(train_dataset)} {train_dataset.shape}")
print(f"validation_dataset: {type(validation_dataset)} {validation_dataset.shape}")
print(f"test_dataset['labels']: {type(test_dataset['labels'])} {len(test_dataset['labels'])}\n{test_dataset['labels']}")

print(f"train_dataset[0]['input_ids']: {type(train_dataset[0]['input_ids'])} {len(train_dataset[0]['input_ids'])}\n{train_dataset['input_ids'][0]}")
print(f"train_dataset[0]['attention_mask']: {type(train_dataset[0]['attention_mask'])} {len(train_dataset[0]['attention_mask'])}\n{train_dataset['attention_mask'][0]}")
print(f"train_dataset[0]['global_attention_mask']: {type(train_dataset[0]['global_attention_mask'])} {len(train_dataset[0]['global_attention_mask'])}\n{train_dataset['global_attention_mask'][0]}")

print(f"train_dataset[0]['labels']: {type(train_dataset[0]['labels'])} {len(train_dataset[0]['labels'])}\n{train_dataset[0]['labels']}")
print(f"train_dataset['labels'][0]: {type(train_dataset['labels'][0])} {len(train_dataset['labels'][0])}\n{train_dataset['labels'][0]}")


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

encoded_dataset: <class 'datasets.dataset_dict.DatasetDict'> {'train': (7000, 4), 'validation': (1500, 4), 'test': (1500, 4)}
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
        num_rows: 1500
    })
})
train_dataset: <class 'datasets.arrow_dataset.Dataset'> (7000, 4)
validation_dataset: <class 'datasets.arrow_dataset.Dataset'> (1500, 4)
test_dataset['labels']: <class 'list'> 1500
[[0.0, 0.0, 1.0, 1.0, 0.0, 0.0], [0.0, 1.0, 1.0, 1.0, 1.0, 1.0], [0.0, 0.0, 0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 1.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 1.0, 0.0, 0.0], [0.0

In [14]:
# Truncated part

def get_truncated_part(text):
  tokens = tokenizer(
      text,
      truncation                = True,
      padding                   = 'max_length',
      max_length                = max_length,
      return_overflowing_tokens = True,
      return_tensors            = None
      )
  print(f"tokens.keys(): {tokens.keys()}")

  # Get the truncated tokens
  truncated_ids = tokens["input_ids"][0]
  print(f"truncated_ids: {type(truncated_ids)} {truncated_ids}")
  #overflow_ids  = tokens["overflow_to_sample_mapping"][0]
  #print(f"overflow_ids: {type(overflow_ids)} {overflow_ids}")

  # Decode the tokens back to text
  truncated_text = tokenizer.decode(truncated_ids, skip_special_tokens=True)
  #overflow_text  = tokenizer.decode(overflow_ids, skip_special_tokens=True)

  print(f"original_text :\n{text}")
  print(f"truncated_text:\n{truncated_text}")
  #print(f"overflow_text:\n{overflow_text}")

  original_tokens  = tokenizer.tokenize(text)
  truncated_tokens = tokenizer.tokenize(truncated_text)
  #overflow_tokens  = tokenizer.tokenize(overflow_text)

  print(f"original_tokens count : {len(original_tokens)}")
  print(f"truncated_tokens count: {len(truncated_tokens)}")
  #print(f"overflow_tokens count: {len(overflow_tokens)}")


In [15]:
example_text = datasetDict['train'][0]['text']
get_truncated_part(example_text)

inputs = tokenizer(
    example_text,
    truncation     = True,
    padding        = 'max_length',
    max_length     = max_length,
    return_tensors = 'pt'
)

#print(f"inputs: {type(inputs)} {inputs.keys()}\n{inputs}")
#print(f"inputs_ids: {type(inputs.input_ids)} {inputs.input_ids.shape}\n{inputs.input_ids}")
#print(f"attention_mask: {type(inputs.attention_mask)} {inputs.attention_mask.shape}\n{inputs.attention_mask}")
#print(f"token_type_ids: {inputs.token_type_ids.shape}")
#print(f"labels: {inputs.labels.shape}")


tokens.keys(): dict_keys(['input_ids', 'attention_mask', 'overflow_to_sample_mapping'])
truncated_ids: <class 'list'> [0, 846, 6837, 4787, 27824, 111, 1841, 24379, 1841, 6, 24379, 6, 35644, 468, 6837, 4787, 27824, 4326, 255, 40981, 36836, 239, 672, 323, 7, 10, 14903, 1186, 9, 6731, 916, 359, 3425, 7193, 7, 1306, 10, 239, 819, 9, 13922, 8, 10, 670, 672, 9, 11658, 31, 253, 1434, 8, 17233, 4624, 10, 10675, 1548, 11, 1818, 6, 8, 25378, 10174, 6, 19230, 50, 1272, 8941, 7, 1169, 5, 5175, 29575, 50, 2257, 1330, 1272, 27246, 775, 9, 1966, 6, 1323, 7, 4025, 11, 248, 947, 495, 7, 244, 1026, 5, 1453, 9, 24785, 8, 13037, 7, 1152, 16296, 25855, 27631, 11, 10, 1330, 882, 939, 4, 242, 4, 12418, 4675, 6, 8917, 6, 11767, 4753, 4, 8776, 23554, 11841, 8, 4206, 936, 12, 29, 21241, 11360, 11748, 11, 162, 13212, 3569, 4675, 8, 2375, 11767, 35477, 13, 2239, 92, 4233, 8, 2417, 6, 941, 8941, 7, 2257, 359, 8326, 25404, 1342, 2370, 5385, 20, 24027, 8776, 3737, 217, 10, 16392, 5391, 1199, 11, 516, 19, 110, 676, 3

In [16]:
# 4. Forward pass for multi-label classification

outputs = model(
    input_ids      = inputs.input_ids,
    attention_mask = inputs.attention_mask
    )

print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

# Logits (= raw model outputs)
logits = outputs.logits
print(f"logits: {type(logits)} {logits.shape}\n{logits}")

# Convert logits to probabilities
sigmoid = torch.nn.Sigmoid()
probs   = sigmoid(logits)
print(f"probs: {type(probs)} {probs.shape}\n{probs}")

Initializing global attention on CLS token...
Input ids are automatically padded to be a multiple of `config.attention_window`: 512


outputs: <class 'transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput'> odict_keys(['logits'])
LongformerSequenceClassifierOutput(loss=None, logits=tensor([[-0.1358, -0.0018, -0.0797, -0.0787, -0.0246,  0.1020]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, global_attentions=None)
logits: <class 'torch.Tensor'> torch.Size([1, 6])
tensor([[-0.1358, -0.0018, -0.0797, -0.0787, -0.0246,  0.1020]],
       grad_fn=<AddmmBackward0>)
probs: <class 'torch.Tensor'> torch.Size([1, 6])
tensor([[0.4661, 0.4995, 0.4801, 0.4803, 0.4939, 0.5255]],
       grad_fn=<SigmoidBackward0>)


In [17]:
example = encoded_dataset['train'][0]

#print(f"example: {type(example)} {example.keys()}\n{example}")
#print()
#print(f"example['input_ids']: {type(example['input_ids'])} {len(example['input_ids'])}\n{example['input_ids']}")
##print(f"example['token_type_ids']: {type(example['token_type_ids'])} {len(example['token_type_ids'])}\n{example['token_type_ids']}")
#print(f"example['attention_mask']: {type(example['attention_mask'])} {len(example['attention_mask'])}\n{example['attention_mask']}")
#print(f"example['labels']:  {type(example['labels'])} {len(example['labels'])}\n{example['labels']}")

In [18]:
tokenizer.decode(example['input_ids'])

'<s>Vivid Resourcing - Service Engineer Service, Engineer, Mechanical Vivid Resourcing Main Tasks Provide high level support to a varied range of external customers & internal stakeholders to ensure a high performance of machinery and a strong level of satisfaction from end users and contributors Take a thorough approach in seeking, and diagnosing, bugs or problems relating to either the electronic Hardware or software related problems Document results of analysis, pass to colleagues in R&D to help form the basis of modifications and amendments to product ranges Skills Degree in a related field i.e. mechanical engineering, electronics, automation etc. Strong analytical mindset and excellent problem-solving abilities Interest in mechnical engineering and effective automation Passion for learning new technologies and skills, especially relating to software & programming Fluent English speaker The Offer Strong package including a yearly salary paid in line with your experience & suitabili

In [19]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['137', '138', '139']

In [20]:
# Set PyTorch format to ensures correctness and compatibility with PyTorch pipelines

# The 3 Hugging Face Dataset are formatted as PyTorch Dataset
encoded_dataset.set_format('torch')

In [21]:
batch_size  = batch_size
metric_name = "f1"

In [22]:
args = TrainingArguments(
    output_dir                  = './training_results',  # where model predictions and checkpoints will be written during training
    overwrite_output_dir        = True,
    logging_dir                 = './logs',
    logging_steps               = 50,
    save_steps                  = 500,
    save_total_limit            = 2,
    eval_strategy               = 'epoch',
    save_strategy               = 'epoch',
    learning_rate               = learning_rate,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    num_train_epochs            = epochs,
    weight_decay                = 0.01,
    load_best_model_at_end      = True,
    metric_for_best_model       = metric_name,
    run_name                    = run_name,
    fp16                        = fp16,
    #dataloader_num_workers      = dataloader_num_workers,
    report_to                  = 'wandb'
    )

In [23]:
# Metrics
#   source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/

def multi_label_metrics(predictions, labels):
    average = 'micro'    # 'micro' or 'weighted'

    # first, apply sigmoid on predictions whose shape is (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs   = sigmoid(torch.Tensor(predictions))

    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1

    # finally, compute metrics
    y_true               = labels
    f1                   = f1_score               (y_true=y_true, y_pred=y_pred, average=average)    #, zero_division=1)
    precision            = precision_score        (y_true=y_true, y_pred=y_pred, average=average)    #, zero_division=1)
    recall               = recall_score           (y_true=y_true, y_pred=y_pred, average=average)    #, zero_division=1)
    roc_auc              = roc_auc_score          (y_true=y_true, y_score=probs, average=average)
    precision_recall_auc = average_precision_score(y_true=y_true, y_score=probs, average=average)
    accuracy             = accuracy_score         (y_true=y_true, y_pred=y_pred)

    # return as dictionary
    metrics = {
        'f1'                  : f1,
        'precision'           : precision,
        'recall'              : recall,
        'roc_auc'             : roc_auc,
        'precision_recall_auc': precision_recall_auc,
        'accuracy'            : accuracy
        }

    return metrics

In [24]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(
        predictions = preds,
        labels      = p.label_ids
        )
    return result

Let's verify a batch as well as a forward pass:

In [25]:
print(f"inputids:        {type(encoded_dataset['train']['input_ids'][0])}\t{encoded_dataset['train']['input_ids'][0].shape}")
#print(f"token_type_ids': {type(encoded_dataset['train']['token_type_ids'][0])}\t{encoded_dataset['train']['token_type_ids'][0].shape}")
print(f"attention_mask:  {type(encoded_dataset['train']['attention_mask'][0])}\t{encoded_dataset['train']['attention_mask'][0].shape}")
print(f"labels:          {type(encoded_dataset['train'][0]['labels'])}\t{encoded_dataset['train'][0]['labels'].shape}")

inputids:        <class 'torch.Tensor'>	torch.Size([768])
attention_mask:  <class 'torch.Tensor'>	torch.Size([768])
labels:          <class 'torch.Tensor'>	torch.Size([6])


In [26]:
# Execute a forward pass for debugging or verification purposes (cf. BERT_3_1 in Notion BERT database)

outputs = model(
    input_ids      = encoded_dataset['train']['input_ids'][0].unsqueeze(0),
    attention_mask = encoded_dataset['train']['attention_mask'][0].unsqueeze(0),
    labels         = encoded_dataset['train'][0]['labels'].unsqueeze(0)
    )

print(f"outputs: {type(outputs)} {outputs.keys()}\n{outputs}")

outputs: <class 'transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput'> odict_keys(['loss', 'logits'])
LongformerSequenceClassifierOutput(loss=tensor(0.7063, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.1358, -0.0018, -0.0797, -0.0787, -0.0246,  0.1020]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, global_attentions=None)


In [27]:
# Create the trainer

trainer = Trainer(
    model,
    args,
    train_dataset = encoded_dataset["train"],
    eval_dataset  = encoded_dataset["validation"],
    tokenizer     = tokenizer,
    compute_metrics=compute_metrics
    )


  trainer = Trainer(


In [None]:
# Train, save the results as a JSON file

train_output  = trainer.train()

train_results = {
    'global_step':   train_output.global_step,    # total steps completed during training
    'training_loss': train_output.training_loss,  # average loss during training
    'metrics':       train_output.metrics         # dictionary of metrics
}

# Save train results
with open("train_results.json", "w") as f:
  json.dump(train_results, f, indent=4)
print(f"train_results: {type(train_results)} {len(train_results)}\n{train_results}")

Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Roc Auc,Precision Recall Auc,Accuracy
1,1.4089,0.331898,0.820768,0.717993,0.957881,0.931521,0.898614,0.264667
2,1.197,0.285784,0.84551,0.760157,0.952455,0.94932,0.927611,0.364
3,1.0159,0.276514,0.865135,0.80203,0.939018,0.953166,0.934338,0.452


In [None]:
print("Training successfully completed.")

## Evaluate

After training, we evaluate our model on the validation set.

In [None]:
def get_results(model, dataset, batch_size, threshold):
  # Clear GPU cache
  torch.cuda.empty_cache()

  # Set the model to evaluation mode to disable dropout and other training-specific behaviors
  model.eval()

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  test_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

  all_preds       = []
  all_probs       = []
  all_true_labels = []

  for batch in tqdm(test_loader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
      outputs = model(**batch)
    logits = outputs.logits

    # Convert logits to probabilities and probabilities to predictions
    sigmoid = torch.nn.Sigmoid()
    probs   = sigmoid(logits).cpu().numpy()    # Convert to Numpy
    preds   = (probs > threshold).astype(int)  # Convert to binary Numpy array

    # Accumulate probabilities, predictions and labels
    all_probs.append(probs)
    all_preds.append(preds)
    all_true_labels.append(batch['labels'].cpu().numpy())

  # Concatenate results from all batches
  all_probs       = np.concatenate(all_probs, axis=0)        # shape: [num_samples, num_labels]
  all_preds       = np.concatenate(all_preds, axis=0)        # shape: [num_samples, num_labels]
  all_true_labels = np.concatenate(all_true_labels, axis=0)  # shape: [num_samples, num_labels]

  print(f"all_probs:       {type(all_probs)} {all_probs.shape}")
  print(f"all_preds:       {type(all_preds)} {all_preds.shape}")
  print(f"all_true_labels: {type(all_true_labels)} {all_true_labels.shape}")

  # Classification report for precision, recall, F1 score
  print(classification_report(
      y_true        = all_true_labels,
      y_pred        = all_preds,
      target_names  = labels,
      zero_division = 0
      ))

  # ROC AUC for multi-label classification
  roc_auc = roc_auc_score(
      y_true  = all_true_labels,
      y_score = all_probs,
      average = 'micro'
      )
  print(f"ROC AUC: {roc_auc}")

In [None]:
# First evaluate results NO SAVE

get_results(model=model, dataset=validation_dataset, batch_size=batch_size, threshold=threshold)

In [None]:
print("First evaluation successfully completed.")

In [None]:
# Second evaluate results; save to /content

eval_output = trainer.evaluate()

# Save evaluate results
with open("eval_results.json", "w") as f:
  json.dump(eval_output, f, indent=4)

In [None]:
print("Second evaluation successfully completed.")

## Upload model, tokenizer, train results, evaluate results

In [None]:
# Save model to /content

model_path = "model"
trainer.save_model(model_path)

In [None]:
# Upload model and tokenizer to the HF repo_id_model

tokenizer = LongformerTokenizerFast.from_pretrained(model_path)
model     = LongformerForSequenceClassification.from_pretrained(model_path)

tokenizer.push_to_hub(repo_id_model)
model.push_to_hub(repo_id_model)


In [None]:
# Upload train_results.json and eval_results.json to the HF repo_id_dataset BETTER to upload to wanddb repo?

upload_file(
    path_or_fileobj = "train_results.json",
    path_in_repo    = "train_results.json",
    repo_id         = HF_name,
    repo_type       = "dataset"
    )

upload_file(
    path_or_fileobj = "eval_results.json",
    path_in_repo    = "eval_results.json",
    repo_id         = HF_name,
    repo_type       = "dataset"
    )

## Test

In [None]:
# Test: first results NO SAVE

get_results(model=model, dataset=test_dataset, batch_size=batch_size, threshold=threshold)

In [None]:
print("First test successfully completed.")

In [None]:
# Test: second results NO SAVE

predictions = trainer.predict(test_dataset)

#print(f"predictions.predictions: {type(predictions.predictions)} {predictions.predictions.shape}\n{predictions.predictions}")  # Model logits
#print(f"predictions.label_ids: {type(predictions.label_ids)} {predictions.label_ids.shape}\n{predictions.label_ids}")          # Ground truth labels
print(f"predictions.metrics: {type(predictions.metrics)} {len(predictions.metrics)}\n{predictions.metrics}")                  # Metrics


In [None]:
print("Second test successfully completed.")

### Or otherwise

In [None]:
# Test: third results NO SAVE

predictions = trainer.predict(test_dataset)
#print(predictions.predictions)  # Model logits
#print(predictions.label_ids)    # Ground truth labels
print(predictions.metrics)      # Metrics

In [None]:
print("Third test successfully completed.")