# Jobify — Fine-Tuning JobBERT for Skill Extraction (NER)

This notebook fine-tunes **jjzha/jobbert-base-cased** on **SkillSpan** for BIO skill tagging.

Steps:
1. Install dependencies
2. Define data loader (SkillSpan splits)
3. Define tokenizer+label alignment (BIO + masking)
4. Build tf.data datasets
5. Train JobBERT token classifier
6. Evaluate using seqeval F1
7. Save the trained model

## 2. Install Requirements

In [None]:
!pip -q install transformers tensorflow datasets seqeval

## 3. Imports

In [None]:
import os
import numpy as np
import tensorflow as tf

from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, TFAutoModelForTokenClassification, pipeline
from seqeval.metrics import classification_report, f1_score

## 4. Data Loader

In [None]:
# load skillspan dataset with official splits (train / validation / test)
def load_skillspan_data():
    """Load SkillSpan from the Hugging Face Hub, keeping its splits separate.

    Returns:
        The object returned by ``load_dataset("jjzha/skillspan")``, indexable
        by split name (``'train'``, ``'validation'``, ``'test'``).
    """
    # The splits are deliberately NOT concatenated: the rest of the notebook
    # looks them up by name (data['train'], data['validation']), and merging
    # them would mix validation/test examples into the training data.
    return load_dataset("jjzha/skillspan")

In [None]:
# Fetch SkillSpan, then pull out the word tokens and skill tags
# for the training and validation splits
data = load_skillspan_data()

train_X, train_Y = data['train']['tokens'], data['train']['tags_skill']
val_X, val_Y = data['validation']['tokens'], data['validation']['tags_skill']

## 5. Labels Mapping

In [None]:
# BIO tag set for skill spans; a label's position in the list is its integer id
label_list = ["O", "B-SKILL", "I-SKILL"]

# Forward (name -> id) and reverse (id -> name) lookup tables
label2id = {name: index for index, name in enumerate(label_list)}
id2label = dict(enumerate(label_list))

## 6. Load Base-Cased Model

In [None]:
# Load the JobBERT tokenizer and a token-classification model from the same
# checkpoint; the model is configured with this notebook's label set.
tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert-base-cased")
model = TFAutoModelForTokenClassification.from_pretrained(
    "jjzha/jobbert-base-cased",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label_list),
)

## 7. Encoder

In [None]:
# Tokenize one pre-split sentence and align its word-level tags to subword tokens.
# Positions that must not count (special tokens and non-first subwords) receive
# label 0 together with mask 0; the first subword of each word gets mask 1.

def tokenize_and_align_labels(tokenizer, tokens, tags):
    """Encode a list of words and attach per-subword labels plus a 0/1 mask.

    Args:
        tokenizer: Callable tokenizer; its result must support item assignment
            and expose ``word_ids()``.
        tokens: The words of one sentence.
        tags: One tag per word, indexed by word position.
            NOTE(review): values are copied through unchanged — assumes they
            are already integer label ids; confirm against the dataset schema.

    Returns:
        The tokenizer output with two extra entries: ``'labels'`` and
        ``'label_mask'``, each holding one value per subword token.
    """
    encoding = tokenizer(tokens, truncation=True, is_split_into_words=True, return_attention_mask=True, max_length=256)

    aligned_labels, loss_mask = [], []
    previous = None

    # word_ids() maps every produced token to the index of the word it came
    # from, or None for tokens that belong to no word.
    for current in encoding.word_ids():
        # Only the first subword of a word carries that word's tag
        starts_word = current is not None and current != previous

        aligned_labels.append(tags[current] if starts_word else 0)
        loss_mask.append(1 if starts_word else 0)

        previous = current

    encoding['labels'] = aligned_labels
    encoding['label_mask'] = loss_mask

    return encoding

## 8. Encode Data

In [None]:
# Tokenize every sentence of both splits and align its tags to the subword tokens
train_encoded = [
    tokenize_and_align_labels(tokenizer, words, word_tags)
    for words, word_tags in zip(train_X, train_Y)
]

val_encoded = [
    tokenize_and_align_labels(tokenizer, words, word_tags)
    for words, word_tags in zip(val_X, val_Y)
]

## 9. Dataset Builder

In [None]:
def make_dataset(encodings, batch_size=16):
    """Build a padded, batched ``tf.data.Dataset`` from encoded examples.

    Args:
        encodings: Iterable of mappings with the keys ``"input_ids"``,
            ``"attention_mask"``, ``"labels"`` and ``"label_mask"``, each a
            sequence with one value per token. Sequences may differ in length
            from one example to the next.
        batch_size: Number of examples per batch.

    Returns:
        A dataset yielding ``(features, labels, sample_weights)`` where
        ``features`` is a dict with ``"input_ids"`` and ``"attention_mask"``.
        Each batch is padded with zeros to its longest sequence; padded
        positions get weight 0.0.
    """
    # Materialise once so the generator below can be restarted every epoch
    encodings = list(encodings)

    def _examples():
        # Yield one un-padded example at a time
        for enc in encodings:
            yield (
                {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"]},
                enc["labels"],
                enc["label_mask"],
            )

    # from_tensor_slices cannot stack sequences of different lengths, so the
    # examples are streamed from a generator with an explicit signature. The
    # mask is declared float32 so that it matches the 0.0 padding value.
    int_sequence = tf.TensorSpec(shape=(None,), dtype=tf.int32)
    ds = tf.data.Dataset.from_generator(
        _examples,
        output_signature=(
            {"input_ids": int_sequence, "attention_mask": int_sequence},
            int_sequence,
            tf.TensorSpec(shape=(None,), dtype=tf.float32),
        ),
    )

    # Generator datasets have unknown size; declare it so consumers can
    # determine the number of batches.
    ds = ds.apply(tf.data.experimental.assert_cardinality(len(encodings)))

    return ds.padded_batch(
        batch_size,
        padded_shapes=({"input_ids": [None], "attention_mask": [None]}, [None], [None]),
        padding_values=({"input_ids": 0, "attention_mask": 0}, 0, 0.0)
    ).prefetch(tf.data.AUTOTUNE)

In [None]:
# Turn the encoded examples into padded, batched datasets
# (make_dataset's default batch size is used for both)
train_data = make_dataset(train_encoded)
val_data = make_dataset(val_encoded)

## 10. Callbacks

In [None]:
# callbacks

# Build the path with os.path.join so the separators match the host OS
# (hard-coded backslashes only form a directory path on Windows).
checkpoint_path = os.path.join("ML", "src", "models", "trained_models", "checkpoints", "jobify_jobbert_v1.keras")

# Save the full model whenever the validation loss reaches a new minimum
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                    monitor='val_loss',
                                                    save_best_only=True,
                                                    save_weights_only=False,
                                                    mode='min',
                                                    verbose=1)

# Stop after 5 epochs without val_loss improvement and restore the best weights
stop_training = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1, restore_best_weights=True)

# Multiply the learning rate by 0.2 after 2 epochs without val_loss improvement
reduce_lrate = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1, mode='min')

## 11. Compiling and Training

In [None]:
# compile and train the model
print("STARTING COMPILING PROCESS:")
# The model outputs per-token logits over the label set and the targets are
# integer label ids, so accuracy must be the sparse-categorical variant.
# It is passed as a *weighted* metric so the per-token weights supplied by the
# dataset exclude masked positions, exactly as they do for the loss.
# Binary Precision/Recall are not valid on multi-class logits; entity-level
# precision/recall/F1 are reported with seqeval in the evaluation section.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-05, epsilon=1e-08, beta_1=0.9, beta_2=0.999),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              weighted_metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")])
print("DONE COMPILING ✅")

# Resolve the history path portably and create its directory BEFORE training,
# so a missing folder cannot make the save fail after a long run.
history_path = os.path.join("ML", "src", "models", "training_history", "training_history.npz")
os.makedirs(os.path.dirname(history_path), exist_ok=True)

print("STARTING TRAINING PROCESS: ")
history = model.fit(train_data, validation_data=val_data, epochs=30, verbose=1, callbacks=[checkpoint, reduce_lrate, stop_training])
print("DONE TRAINING ✅")
# Store each per-epoch series from the training history as a named array
np.savez(history_path, **history.history)
print("HISTORY SAVED ✅")

## 12. Evaluation (F1-SCORE)

In [None]:
# convert model outputs and true labels into seqeval-compatible format
def get_seqeval_input(model, dataset, id2label):
    """Collect predicted and reference tag-name sequences, one list per sentence.

    Args:
        model: Callable invoked as ``model(features, training=False)``; the
            result must expose ``.logits`` with one score per label for every
            token of every sequence in the batch.
        dataset: Iterable of batches ``(features, labels, weights)``; positions
            whose weight is 0 are ignored.
        id2label: Mapping from integer label id to tag name.

    Returns:
        ``(all_preds, all_labels)``: two lists with exactly one entry per
        input sequence, each entry being the list of tag names at the
        non-masked positions.
    """
    all_preds = []
    all_labels = []

    for batch in dataset:
        features, true_labels, weights = batch[0], batch[1], batch[2]

        logits = model(features, training=False).logits

        # Most likely label id per token
        pred_ids = np.argmax(np.asarray(logits), axis=-1)
        label_ids = np.asarray(true_labels)
        # True where the token counts towards evaluation
        keep = np.asarray(weights) != 0

        for row_preds, row_labels, row_keep in zip(pred_ids, label_ids, keep):
            # Each sequence is recorded exactly once, so the support counts
            # in the report match the real number of entities.
            all_preds.append([id2label[int(p)] for p in row_preds[row_keep]])
            all_labels.append([id2label[int(t)] for t in row_labels[row_keep]])

    return all_preds, all_labels



# Print a seqeval report for the model on a dataset and return its F1
def evaluate_ner(model, dataset, id2label):
    """Evaluate the token classifier with seqeval.

    Args:
        model: Model handed to ``get_seqeval_input``.
        dataset: Batches handed to ``get_seqeval_input``.
        id2label: Mapping from label id to tag name.

    Returns:
        The value of seqeval's ``f1_score`` for the predictions.
    """
    predictions, references = get_seqeval_input(model, dataset, id2label)

    report = classification_report(references, predictions)
    print(report)

    return f1_score(references, predictions)

In [None]:
# Score the model on the validation batches; evaluate_ner also prints
# the seqeval classification report before returning the F1 value
f1 = evaluate_ner(model, val_data, id2label)
print("Validation seqeval F1:", f1)

## 12. Save The Model

In [None]:
# Directory that receives the fine-tuned model and tokenizer files.
# It must be set before this cell is run.
SAVE_DIR = ""

# Fail with a clear message instead of handing an empty path to save_pretrained
if not SAVE_DIR:
    raise ValueError("SAVE_DIR is empty: set it to the directory where the model should be saved.")

model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print("Saved ✅ to:", SAVE_DIR)

## 13. Quick Test

In [None]:
# Directory the fine-tuned model and tokenizer are loaded from.
# It must be set before this cell is run.
SAVE_DIR = ""

# Fail with a clear message instead of handing an empty path to from_pretrained
if not SAVE_DIR:
    raise ValueError("SAVE_DIR is empty: set it to the directory containing the saved model.")

jobify_tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)
jobify_model = TFAutoModelForTokenClassification.from_pretrained(SAVE_DIR)

print("Model loaded ✅")


In [None]:
# Wrap the reloaded model and tokenizer in a token-classification pipeline
test = pipeline(
    "token-classification",
    model=jobify_model,
    tokenizer=jobify_tokenizer,
    aggregation_strategy="simple"
)

# Run the pipeline on a sample sentence; the call is the cell's final
# expression, so its result is what the cell evaluates to
text = "Experienced with Python, SQL, Docker, FastAPI, and React."
test(text)