<a href="https://colab.research.google.com/github/claudelepere/ML_GitHub/blob/main/NEW_Prediction_EN_11_24000__24100.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Error: gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.
Ignore this error as gcsfs is not used.

In [17]:
!pip install -q transformers datasets

import datetime
import json
import numpy as np
import os
import pandas as pd
import shutil
import torch

from contextlib       import suppress
from datasets         import DatasetDict
from google.colab     import files, userdata
from huggingface_hub  import create_repo, hf_hub_download, login, whoami
from sklearn.metrics  import classification_report
from transformers     import LongformerTokenizerFast, LongformerForSequenceClassification
from torch.utils.data import DataLoader

## Hugging Face Hub (HF Hub) authenticate

In [18]:
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")    # Store the key in os.environ
hf_token               = os.environ.get('HF_TOKEN')

login(token=hf_token)

# Check
user = whoami(token=hf_token)
assert user['name'] == 'claudelepere', f"{user['name']} is not claudelepere"

print(f"user: {user}")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


user: {'type': 'user', 'id': '66ec3d5f61228b02f8780beb', 'name': 'claudelepere', 'fullname': 'Claude Lepère', 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/66ec3d5f61228b02f8780beb/gvnf9pvm2KvE90ETMUQo3.jpeg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'jobs_token', 'role': 'fineGrained', 'createdAt': '2025-01-04T17:44:35.493Z', 'fineGrained': {'canReadGatedRepos': False, 'global': [], 'scoped': [{'entity': {'_id': '66ec3d5f61228b02f8780beb', 'type': 'user', 'name': 'claudelepere'}, 'permissions': ['repo.content.read', 'repo.write']}]}}}}


## HF Hub repo

In [19]:
repo_id = "claudelepere/jobs_EN_11_24000_032217"

## Checks

In [20]:
!python -V

print(f"currentdir: {os.getcwd()}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

Python 3.11.11
currentdir: /content
device: cuda


In [21]:
skills           = 11
all_rows_low     = 24000
all_rows_high    = 24100
num_datapoints   = all_rows_high - all_rows_low
max_length       = 1024
batch_size       = 4*8
threshold_tuning = True

datasetDict_zip_file_name = f"dataset_EN_{skills}_{all_rows_low}_{all_rows_high}.zip"
datasetDict_dir_name      = os.path.splitext(datasetDict_zip_file_name)[0]

print(f"datasetDict_zip_file_name: {datasetDict_zip_file_name}")
print(f"datasetDict_dir_name     : {datasetDict_dir_name}")
print()


datasetDict_zip_file_name: dataset_EN_11_24000_24100.zip
datasetDict_dir_name     : dataset_EN_11_24000_24100



## Out Of Memory (OOM)

### OOM: check for and kill zombie processes

In [22]:
!ps aux | grep python
!kill -9 <PID>
if torch.cuda.is_available():
    !nvidia-smi
    print(torch.cuda.memory_summary())

root          86  3.6  0.0      0     0 ?        Z    18:41   0:13 [python3] <defunct>
root          87  0.1  0.0  77272 57324 ?        S    18:41   0:00 python3 /usr/local/bin/colab-file
root         132  1.1  0.1 883228 130604 ?       Sl   18:41   0:04 /usr/bin/python3 /usr/local/bin/j
root         443  5.8  1.5 12360836 1321604 ?    Ssl  18:42   0:17 /usr/bin/python3 -m colab_kernel_
root         482  0.3  0.0 544720 20204 ?        Sl   18:42   0:01 /usr/bin/python3 /usr/local/lib/p
root        2016  0.0  0.0   7376  3460 ?        S    18:47   0:00 /bin/bash -c ps aux | grep python
root        2018  0.0  0.0   6484  2244 ?        S    18:47   0:00 grep python
/bin/bash: -c: line 1: syntax error near unexpected token `newline'
/bin/bash: -c: line 1: `kill -9 <PID>'
Sat Mar 22 18:47:21 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----

### OOM: free up GPU memory

In [23]:
torch.cuda.empty_cache()

## Upload and unzip job dataset

In [24]:
def upload_unzip_dataset(filename):
    """Upload and unzip the dataset to /content, ensuring correct placement."""

    # Get the expected directory name (same as the zip filename without extension)
    expected_dir = os.path.splitext(filename)[0]

    # Check if the file and the directory exist in /content and delete them
    with suppress(FileNotFoundError):
        if os.path.isdir(expected_dir):
            shutil.rmtree(expected_dir)               # Remove directory if it exists
        if os.path.isfile(filename):
            os.remove(filename)                       # Remove file if it exists

    print(f"Removed '{expected_dir}' and '{filename}' if they were present in /content.")

    # Upload the zip file
    uploaded_files = files.upload()                  # Prompt file upload dialog

    if filename not in uploaded_files:
        raise FileNotFoundError(f"'{filename}' was not uploaded.")

    print(f"'{filename}' successfully uploaded to /content.")

    # Unzip the file to /content
    shutil.unpack_archive(filename, "/content")

    print(f"Unzipped to '/content/{expected_dir}'.")

# Usage
upload_unzip_dataset(datasetDict_zip_file_name)

Removed 'dataset_EN_11_24000_24100' and 'dataset_EN_11_24000_24100.zip' if they were present in /content.


Saving dataset_EN_11_24000_24100.zip to dataset_EN_11_24000_24100.zip
'dataset_EN_11_24000_24100.zip' successfully uploaded to /content.
Unzipped to '/content/dataset_EN_11_24000_24100'.


## Create datasetDict (HF DatasetDict) = 1 HF Dataset, inference
The sequence of datapoints of dataset=datasetDict['inference'] is the same as the sequence of SELECT, id, ... WHERE in_LanguageId=1 ORDER BY id DESC in MySQL.

In [25]:
datasetDict = DatasetDict.load_from_disk(datasetDict_dir_name)
dataset     = datasetDict['inference']

print(f"dataset: {type(dataset)} shape={dataset.shape}\n{dataset}")     # <class 'datasets.arrow_dataset.Dataset'> shape=(100, 8)
print(f"dataset.features: {type(dataset.features)} shape={dataset.features}\n{dataset.features}")

# Convert the dataset to a pandas dataframe
df_original = pd.DataFrame(dataset)
print(f"df_original: {type(df_original)} shape={df_original.shape}\n{df_original}")                         # <class 'pandas.core.frame.DataFrame'> shape=(100, 8)


dataset: <class 'datasets.arrow_dataset.Dataset'> shape=(100, 8)
Dataset({
    features: ['id', 'text', '390', '135', '136', '137', '138', '139'],
    num_rows: 100
})
dataset.features: <class 'datasets.features.features.Features'> shape={'id': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None), '390': Value(dtype='bool', id=None), '135': Value(dtype='bool', id=None), '136': Value(dtype='bool', id=None), '137': Value(dtype='bool', id=None), '138': Value(dtype='bool', id=None), '139': Value(dtype='bool', id=None)}
{'id': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None), '390': Value(dtype='bool', id=None), '135': Value(dtype='bool', id=None), '136': Value(dtype='bool', id=None), '137': Value(dtype='bool', id=None), '138': Value(dtype='bool', id=None), '139': Value(dtype='bool', id=None)}
df_original: <class 'pandas.core.frame.DataFrame'> shape=(100, 8)
       id                                               text    390    135  \
0   99388  Consort

## Create labels (list), id2label (dict) and label2id (dict).
**The sequence of the labels list is the same as in dataset.
And the sequences of the optimized thresholds, true labels and predictions are preserved.**

* dataset 7_1000_125_125  ,  48 labels
* dataset 7_128_18_54     ,  42 labels
* dataset 8910_1087_68_204, 206 labels
* dataset 11_1000         ,   6 labels

In [26]:
labels = [label for label in dataset.features.keys() if label not in ['id', 'text']]
#labels.sort()

print(f"labels: {type(labels)} {len(labels)}\n{labels}")

num_labels = len(labels)

labels: <class 'list'> 6
['390', '135', '136', '137', '138', '139']


In [27]:
id2label = {idx: label for idx, label in enumerate(labels)}
print(f"id2label: {type(id2label)} {len(id2label)}\n{id2label}")

id2label: <class 'dict'> 6
{0: '390', 1: '135', 2: '136', 3: '137', 4: '138', 5: '139'}


In [28]:
label2id = {label: idx for idx, label in enumerate(labels)}
print(f"label2id: {type(label2id)} {len(label2id)}\n{label2id}")

label2id: <class 'dict'> 6
{'390': 0, '135': 1, '136': 2, '137': 3, '138': 4, '139': 5}


## Download the tokenizer and the model

In [29]:
print("Tokenizer")
tokenizer = LongformerTokenizerFast.from_pretrained(repo_id, timeout=60)  # Increased timeout to 60 seconds)

print("Model")
model = LongformerForSequenceClassification.from_pretrained(repo_id)
model.eval()  # Ensures no gradient computation

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Tokenizer


tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Model


config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/595M [00:00<?, ?B/s]

LongformerForSequenceClassification(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
          

## Downloads the optimized thresholds

In [30]:
file_path = hf_hub_download(repo_id=repo_id, filename="optimized_thresholds.json", cache_dir="./my_HF_downloads")
print(f"file_path: {file_path}")

# Load the JSON file
with open(file_path, 'r') as json_file:
    optimized_thresholds_json = json.load(json_file)

#print(f"optimized_thresholds_json: {type(optimized_thresholds_json)} {len(optimized_thresholds_json)}")
#print(json.dumps(optimized_thresholds_json, indent=2))

optimized_thresholds = np.array(list(optimized_thresholds_json.values()))
print(f"optimized_thresholds: {type(optimized_thresholds)} shape={optimized_thresholds.shape}\n{optimized_thresholds}")


optimized_thresholds.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

file_path: ./my_HF_downloads/models--claudelepere--jobs_EN_11_24000_032217/snapshots/64a3c22458545750f89d4fff93797a44e76a7694/optimized_thresholds.json
optimized_thresholds: <class 'numpy.ndarray'> shape=(6,)
[0.45 0.5  0.45 0.5  0.45 0.45]


In [31]:
# Step 1: Tokenize dataset
def tokenize_function(examples):
  return tokenizer(examples["text"], padding='max_length', max_length=max_length, truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
# <class 'datasets.arrow_dataset.Dataset'> shape=(100, 10)
print(f"tokenized_dataset: {type(tokenized_dataset)} shape={tokenized_dataset.shape}\n{tokenized_dataset}")

tokenized_df = tokenized_dataset.to_pandas()
print(f"tokenized_df: {type(tokenized_df)} shape={tokenized_df.shape}\n{tokenized_df}")

# Step 2: Convert to PyTorch DataLoader
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"] + labels)
dataloader = DataLoader(tokenized_dataset, batch_size=batch_size)

# Step 3: Run Prediction
all_preds = []

with torch.no_grad():                       # No gradients needed for prediction
    for batch in dataloader:
        # Move batch to GPU if available
        inputs = {k: batch[k].to(device) for k in ["input_ids", "attention_mask"]}

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits  # Model outputs raw logits

        # Apply sigmoid to convert logits into probabilities
        probs = torch.sigmoid(logits)

        # Move probs to CPU and convert to NumPy
        all_preds.extend(probs.cpu().numpy())

all_preds_arr = np.array(all_preds)
print(f"all_preds    : {type(all_preds)}     len={len(all_preds)}\n{all_preds}")
print(f"all_preds_arr: {type(all_preds_arr)} shape={all_preds_arr.shape}\n{all_preds_arr}")

# Step 4: Convert Probabilities to Binary Predictions
if threshold_tuning:
    thresholds = optimized_thresholds
else:
    thresholds = np.full(num_labels, 0.5)
print(f"thresholds: {type(thresholds)} shape={thresholds.shape}\n{thresholds}")                  # <class 'numpy.ndarray'> shape=(6,)

binary_preds = (np.array(all_preds) > thresholds).astype(int)
print(f"binary_preds: {type(binary_preds)} shape={binary_preds.shape}\n{binary_preds}")          # <class 'numpy.ndarray'> shape=(100, 6)

# Step 5: Compare with True Labels
true_labels = tokenized_df[labels]
print(f"true_labels: {type(true_labels)} shape={true_labels.shape}\n{true_labels}")              # <class 'pandas.core.frame.DataFrame'> shape=(100, 6)

# Convert true_labels DataFrame to a NumPy array of 0 and 1
true_labels_np  = true_labels.astype(int).to_numpy()
print(f"true_labels_np: {type(true_labels_np)} shape={true_labels_np.shape}\n{true_labels_np}")  # <class 'numpy.ndarray'> shape=(100, 6)

# Convert id2label dict to a list of label names
label_names = list(id2label.values())
print(f"label_names: {type(label_names)} len={len(label_names)}\n{label_names}")                 # <class 'list'> len=6 ['390', '135', '136', '137', '138', '139']
report = classification_report(true_labels_np, binary_preds, target_names=label_names, zero_division=0)

print(f"classification report:{type(report)} len={len(report)}\n{report}")

relevant_columns          = ['id', 'text', '390', '135', '136', '137', '138', '139']
df_compare                = df_original[relevant_columns].copy()
df_compare['true_labels'] = true_labels.values.tolist()
df_compare['preds']       = binary_preds.tolist()
df_compare['compare']     = (df_compare['true_labels']==df_compare['preds']).replace({True:'OK', False:'KO'})
print(f"df_compare: {type(df_compare)} shape={df_compare.shape}\n{df_compare}")                  # <class 'pandas.core.frame.DataFrame'> shape=(100, 10)
print()

def print_row(row, file):
    """Prints the row information to the console and writes it to a file."""
    output  = f"id: {row['id']}\n"
    output += f"text: {row['text']}\n"
    output += "labels     : [" + ", ".join(f"{label:>5}" for label in labels) + "]\n"
    output += "true_labels: [" + ", ".join(f"{true_label:>5}" for true_label in row['true_labels']) + "]\n"
    output += "preds      : [" + ", ".join(f"{pred:>5}" for pred in row['preds']) + "]\n"
    output += f"compare    : {row['compare']}\n"
    output += "\n"

    print(output, end="")  # Print to console without extra newline
    file.write(output)     # Write to file


# Open the file in write mode
with open("output.txt", "w") as f:
    _ = df_compare.apply(print_row, axis=1, args=(f,))  # Pass file object to print_row

files.download("/content/output.txt")

# Count total 'OK' and 'KO' values
total_ok = (df_compare['compare'] == 'OK').sum()  # Total matches
total_ko = (df_compare['compare'] == 'KO').sum()  # Total mismatches
print(f"Total 'OK': {total_ok}")
print(f"Total 'KO': {total_ko}")
print()

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Initializing global attention on CLS token...


tokenized_dataset: <class 'datasets.arrow_dataset.Dataset'> shape=(100, 10)
Dataset({
    features: ['id', 'text', '390', '135', '136', '137', '138', '139', 'input_ids', 'attention_mask'],
    num_rows: 100
})
tokenized_df: <class 'pandas.core.frame.DataFrame'> shape=(100, 10)
       id                                               text    390    135  \
0   99388  Consort NT - Second Line Engineer (Windows Ser...  False  False   
1   99387  MyNextCompany - IT Infrastructure Project Mana...  False  False   
2   99386  Consort NT - Support Engineer (Windows Server,...  False  False   
3   99384  Visser & Van Baars - Market- & Business Intell...  False  False   
4   99383  Computer Recruitment Services - Change Manager...  False  False   
..    ...                                                ...    ...    ...   
95  99155  Koalect - Front-end Developer   HTML5, Javascr...  False  False   
96  99152  MyNextCompany - Junior Digital Project Officer...   True   True   
97  99151  Widetech 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Total 'OK': 55
Total 'KO': 45





```
	      true_labels		preds		is right
________________________________________________
99386: 137		0		1		true_labels
99379: 137		1		0		preds
99377: 136		0		1		true_labels
99375: 137		0		1		true_labels
99352: 137		1		0		true_labels
99351: 390		1		0		true_labels
99350: 136		1		0		preds
99344: 135		1?		0		preds
       139		1?		0		idem
99342: 136		0		1		true_labels
99330: 136		0		1		true_labels
99326: 136		0		1		preds
99325: 139		1		0		preds
99322: 390		0		1		preds
       135		0		1		idem
       138		1		0		idem
       139		1		0		idem
99316: 390		0		1		true_labels
99308: 136		1		0		preds		problem: nothing specified explicitly, true_labels and preds conclude many different experiences. Which skill to check when not explicitly requested
99302: 137		0		1		preds		mix Dutch and English
99291: 390		0?		1
       139		1?		0
99285: 139		0		1		true_labels	French
99284: 137		0		1		preds
99283: 137		1		0		preds
99280: 137		1		0		preds
99262: 137		0		1		true_labels
99246: 137		0		1		true_labels
99241: 139		0		1		preds
99238: 137		1		0		preds
99231: 136		0		1		preds?          no explicit requirement
99227: 136		0		1		true_labels     French
99225: 390		1		0                       no explicit requirement (Altran)
       136		1		0		idem
       138		0		1		idem
       139		0		1		idem
99223: 390		1		0					no explicit requirement (Altran)
       136		1		0		idem
       139		0		1		idem
99222: 390		1		0					no explicit requirement (Altran)
       138		0		1		idem
       139		0		1		idem
99221: 390		1		0					no explicit requirement (Altran)
       135		1		0		idem
       136		1		0		idem
       138		0		1		idem
       139		0		1		idem
99214: 137		0		1					no explicit requirement
99202: 390		1		0					no explicit requirement
       137		0		1		idem
       138		0		1		idem
       139		0		1		idem
99199: 137		1		0		true_labels
99185: 139		0		1					no explicit requirement; Dutch
99184: 139		1		0		true_labels > 2 years => 137, 138, 139
99183: 139		0		1					no explicit requirement
99179: 137		0		1					with or without experience
99178: 137		0		1					Experience in ...; Years of experience
99167: 137		0		1					no explicit requirement; Dutch
99166: 137		0		1					no explicit requirement
99165: 138		0		1					no explicit requirement
99161: 136		0		1					no explicit requirement
99157: 137		1		0					no explicit requirement
99155: 138		0		1		preds?      at least 1 year of experience
99152: 137		1		0		preds?      a first professional experience
99151: 138		1		0					no explicit requirement


```



In [32]:
print("It's the end")

It's the end


In [33]:
raise Exception("I stop here")

Exception: I stop here