# BERT-BiLSTM-CRF for Resume NER (merged pipeline)

Uses merged data (existing 220 + Dotin 545). Run prepare_data.py first to create merged_resume_ner.json.

**Target:** 80%+ entity-level F1 (or 80%+ token accuracy). Training uses early stopping, differentiated LRs (BERT 1e-5, head 5e-5), LR warmup + decay, and dropout 0.4.

Run cells in order. JSON file is loaded from the same folder as this notebook.

In [2]:
# 1) Load data — prefers merged_resume_ner.json (run prepare_data.py first)
import json
import os

# Resolve the data file: try the merged file next to the notebook, then the
# original single-source file, then a Google Drive location.
DATA_PATH = "merged_resume_ner.json"
if not os.path.exists(DATA_PATH):
    DATA_PATH = "entity_recognition_in_resumes.json"
if not os.path.exists(DATA_PATH):
    DATA_PATH = "/content/drive/My Drive/DATASETS/merged_resume_ner.json"
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError("JSON not found. Run: python prepare_data.py --existing ../entity_recognition_in_resumes.json --dotin /path/to/dotin --output merged_resume_ner.json")

# The file is read as JSON Lines: one JSON object per non-blank line.
data = []
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            data.append(json.loads(line))
print(f"Loaded {len(data)} resumes")

# Maps raw annotation labels to the six entity types used by the model.
# Already-normalised names map to themselves, so re-running this cell on the
# same `data` is harmless. Anything not listed becomes "O".
LABEL_MAPPING = {
    "Name": "NAME", "Email Address": "EMAIL", "Skills": "SKILL", "Designation": "OCCUPATION",
    "Degree": "EDUCATION", "College Name": "EDUCATION", "Graduation Year": "EDUCATION",
    "Companies worked at": "EXPERIENCE", "Years of Experience": "EXPERIENCE", "Location": "O", "UNKNOWN": "O",
    "NAME": "NAME", "EMAIL": "EMAIL", "SKILL": "SKILL", "OCCUPATION": "OCCUPATION", "EDUCATION": "EDUCATION", "EXPERIENCE": "EXPERIENCE", "O": "O",
}
for item in data:
    # `or []` also covers an explicit null annotation / label, which
    # `.get(key, [])` would pass through as None and crash the loop.
    for ann in item.get("annotation") or []:
        ann["label"] = [LABEL_MAPPING.get(raw, "O") for raw in ann.get("label") or []]

Loaded 580 resumes


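**Colab only – use if upload widget didn't work:** Run the cell below to mount Google Drive. Put `entity_recognition_in_resumes.json` in your Drive (e.g. in My Drive), then in **cell 1** set `DATA_PATH = '/content/drive/MyDrive/entity_recognition_in_resumes.json'` (or the path where you put it) and re-run cell 1. Note that cell 1 already looks for `/content/drive/My Drive/DATASETS/merged_resume_ner.json` on its own (with the folder spelled `My Drive`, not `MyDrive`), so a merged file stored there is picked up without editing `DATA_PATH`.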

In [3]:
# Colab: Mount Google Drive (run this, then set DATA_PATH in cell 1 to your file path in Drive)
# An ImportError from the google.colab import is treated as "not running in Colab";
# any other failure (e.g. from drive.mount itself) is not caught and will surface.
try:
    from google.colab import drive
    # Mount point is /content/drive; cell 1 probes a path under it.
    drive.mount("/content/drive")
    print("Drive mounted. Put entity_recognition_in_resumes.json in My Drive, then set DATA_PATH in cell 1 and re-run it.")
except ImportError:
    print("Not in Colab – skip this cell.")

Not in Colab – skip this cell.


In [4]:
# 2) Build train/val/test from JSON with fixed create_bio_tags (no B-O / I-O)
import re
import random

def tokenize_with_positions(text):
    """Split *text* on whitespace and keep each token's character span.

    Returns a list of ``(token, start, end)`` tuples where ``text[start:end]``
    is the token (``end`` is exclusive).
    """
    spans = []
    for match in re.finditer(r"\S+", text):
        begin, stop = match.span()
        spans.append((match.group(), begin, stop))
    return spans

def create_bio_tags_fixed(tokens, annotations):
    """Project character-span annotations onto whitespace tokens as BIO tags.

    tokens:      list of ``(token, start, end)`` tuples (``end`` exclusive).
    annotations: list of dicts with a ``"label"`` list (only the first entry
                 is used) and a ``"points"`` list of ``{"start", "end"}`` dicts.

    Annotations with no label, or whose first label is "O", are skipped, so
    no "B-O" / "I-O" tag is ever produced. Later annotations overwrite earlier
    ones on overlapping tokens. Returns one tag per token.
    """
    tags = ["O" for _ in tokens]
    for annotation in annotations:
        label_list = annotation.get("label")
        if not label_list:
            continue
        kind = label_list[0]
        if kind == "O":
            continue
        for span in annotation.get("points", []):
            lo, hi = span["start"], span["end"]
            # A token belongs to the span when the two intervals overlap.
            covered = [
                k for k, (_, tok_lo, tok_hi) in enumerate(tokens)
                if tok_hi > lo and tok_lo < hi
            ]
            for rank, k in enumerate(covered):
                tags[k] = f"B-{kind}" if rank == 0 else f"I-{kind}"
    return tags

# Turn every usable resume into a (tokens, BIO tags) pair. Records with no
# text, no annotations, or no tokens are dropped.
all_sents, all_labels = [], []
for record in data:
    text, spans = record.get("content", ""), record.get("annotation", [])
    if not (text and spans):
        continue
    positioned = tokenize_with_positions(text)
    if not positioned:
        continue
    all_sents.append([tok for tok, _, _ in positioned])
    all_labels.append(create_bio_tags_fixed(positioned, spans))

# Seeded shuffle, then an 80% / 10% / remainder split into train / val / test.
n = len(all_sents)
random.seed(42)
idx = list(range(n))
random.shuffle(idx)
n_train, n_val = int(0.8 * n), int(0.1 * n)
train_idx = idx[:n_train]
val_idx = idx[n_train:n_train + n_val]
test_idx = idx[n_train + n_val:]

train_sents, train_labels = [all_sents[i] for i in train_idx], [all_labels[i] for i in train_idx]
val_sents, val_labels = [all_sents[i] for i in val_idx], [all_labels[i] for i in val_idx]
test_sents, test_labels = [all_sents[i] for i in test_idx], [all_labels[i] for i in test_idx]
print(f"Train {len(train_sents)} Val {len(val_sents)} Test {len(test_sents)}")

Train 464 Val 58 Test 58


In [5]:
# 3) Install deps (run once; skip if already installed)
!pip install -q torch transformers pytorch-crf seqeval

In [6]:
# 4) BERT tokenizer + label alignment and dataset
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

# Tag inventory: "O" plus a B-/I- pair for each of the six entity types.
# A tag's position in this list is its integer id.
TAGS = [
    "O",
    "B-NAME", "I-NAME",
    "B-EMAIL", "I-EMAIL",
    "B-SKILL", "I-SKILL",
    "B-OCCUPATION", "I-OCCUPATION",
    "B-EXPERIENCE", "I-EXPERIENCE",
    "B-EDUCATION", "I-EDUCATION",
]
NUM_LABELS = len(TAGS)
ID2LABEL = dict(enumerate(TAGS))
LABEL2ID = {tag: tag_id for tag_id, tag in ID2LABEL.items()}

def align_to_bert(words, word_labels, tokenizer, max_len=512):
    """Encode one word sequence for BERT and align word labels to sub-tokens.

    Each word is split with ``tokenizer.tokenize`` (falling back to the unknown
    token when that yields nothing). Only the first sub-token of a word carries
    the word's label id; [CLS], [SEP] and continuation sub-tokens get -100.
    Labels missing from LABEL2ID fall back to id 0.

    Sequences longer than *max_len* are cut to ``max_len - 1`` positions plus a
    closing separator id.

    Returns ``(input_ids, attention_mask, aligned_label_ids)`` as three lists
    of equal length.
    """
    pieces = ["[CLS]"]
    word_starts = []
    for word in words:
        word_starts.append(len(pieces))
        pieces += tokenizer.tokenize(word) or [tokenizer.unk_token]
    pieces.append("[SEP]")

    ids = tokenizer.convert_tokens_to_ids(pieces)
    total = len(ids)
    mask = [1] * total
    aligned = [-100] * total
    for start, tag in zip(word_starts, word_labels):
        if start >= total:
            continue
        aligned[start] = LABEL2ID.get(tag, 0)

    if total > max_len:
        keep = max_len - 1
        ids = ids[:keep] + [tokenizer.sep_token_id]
        mask = mask[:keep] + [1]
        aligned = aligned[:keep] + [-100]
    return ids, mask, aligned

class BertNERDataset(Dataset):
    """In-memory dataset of BERT-encoded resumes.

    Every (words, labels) pair is encoded once, at construction time, with
    ``align_to_bert``. Pairs whose word and label counts differ are dropped
    without warning.
    """

    def __init__(self, sents, labels, tokenizer, max_len=512):
        self.samples = []
        for words, tags in zip(sents, labels):
            if len(words) != len(tags):
                continue
            self.samples.append(align_to_bert(words, tags, tokenizer, max_len))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, i):
        # Each sample is the (ids, mask, aligned_labels) triple from align_to_bert.
        return self.samples[i]

# Load the pretrained "bert-base-uncased" tokenizer and encode the train and
# validation splits. Encoding happens eagerly inside BertNERDataset.__init__,
# using its default max_len of 512.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_ds = BertNERDataset(train_sents, train_labels, tokenizer)
val_ds   = BertNERDataset(val_sents, val_labels, tokenizer)

def collate(batch):
    """Pad a list of (ids, mask, labels) samples to a common length.

    Ids and attention mask are right-padded with 0, labels with -100.
    Returns three ``torch.long`` tensors of shape (batch, longest_sample).
    """
    width = max(len(sample[0]) for sample in batch)

    def _padded(field, fill):
        rows = [sample[field] + [fill] * (width - len(sample[field])) for sample in batch]
        return torch.tensor(rows, dtype=torch.long)

    return _padded(0, 0), _padded(1, 0), _padded(2, -100)
# Batches of 8, padded per batch by collate. Only the training loader shuffles,
# so validation batches come in a fixed order.
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=collate)
val_loader   = DataLoader(val_ds, batch_size=8, collate_fn=collate)
print("Datasets ready")

  from .autonotebook import tqdm as notebook_tqdm


Datasets ready


In [7]:
# 5) BERT-BiLSTM-CRF model
import torch.nn as nn
from transformers import BertModel
from torchcrf import CRF

class BertBiLSTMCRF(nn.Module):
    """BERT encoder -> single-layer BiLSTM -> linear emission layer -> CRF.

    The attribute names (bert, lstm, drop, fc, crf) become the state_dict key
    prefixes, so renaming them invalidates previously saved weights.
    """

    def __init__(self, bert_name="bert-base-uncased", hidden_dim=256, num_labels=NUM_LABELS, dropout=0.3):
        """Build the layers.

        bert_name:  pretrained BERT checkpoint passed to BertModel.from_pretrained.
        hidden_dim: total BiLSTM output width (hidden_dim // 2 per direction).
        num_labels: number of tags scored by the linear layer and the CRF.
        dropout:    probability for the single Dropout module, applied twice in forward.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_name)
        # Each direction gets hidden_dim//2 units, so the concatenated output is hidden_dim wide.
        self.lstm = nn.LSTM(self.bert.config.hidden_size, hidden_dim//2, num_layers=1, bidirectional=True, batch_first=True)
        self.drop = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return a training loss when *labels* is given, else decoded tag ids.

        With labels: the negated result of ``self.crf(..., reduction="mean")``
        (presumably the mean negative log-likelihood; confirm against the
        pytorch-crf docs).
        Without labels: the result of ``self.crf.decode`` under the attention
        mask (the evaluation code indexes it as one list of tag ids per
        sequence).
        """
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        out, _ = self.lstm(self.drop(out))
        emissions = self.fc(self.drop(out))
        mask_b = attention_mask.bool()  # boolean avoids torch.where uint8 deprecation in pytorch-crf
        if labels is not None:
            # CRF expects indices in [0, num_labels-1], so every -100 is replaced with 0.
            # NOTE(review): the mask given to the CRF is the attention mask, not `labels != -100`.
            # Padding is masked out, but attended positions labelled -100 ([CLS], [SEP] and
            # non-first sub-tokens, as produced by align_to_bert) reach the CRF as label 0 ("O").
            # Confirm this is intended.
            labels = labels.clone().masked_fill(labels == -100, 0)
            return -self.crf(emissions, labels, mask=mask_b, reduction="mean")
        return self.crf.decode(emissions, mask=mask_b)

# Device preference: CUDA, then Apple Silicon MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
# Dropout 0.4 instead of the class default, for extra regularisation.
model = BertBiLSTMCRF(dropout=0.4).to(device)

# Four parameter groups, in this order: BERT with decay, BERT without decay,
# head with decay, head without decay. BERT trains at 1e-5, the
# BiLSTM/linear/CRF head at 5e-5. Parameters whose name contains one of the
# `no_decay` markers get weight_decay 0.0; all others get 0.01.
no_decay = ["bias", "LayerNorm.weight"]
bert_params = list(model.bert.named_parameters())
head_params = [(name, param) for name, param in model.named_parameters() if not name.startswith("bert.")]

optimizer_grouped = []
for named_params, group_lr in ((bert_params, 1e-5), (head_params, 5e-5)):
    decayed = [p for n, p in named_params if not any(nd in n for nd in no_decay)]
    exempt = [p for n, p in named_params if any(nd in n for nd in no_decay)]
    optimizer_grouped.append({"params": decayed, "lr": group_lr, "weight_decay": 0.01})
    optimizer_grouped.append({"params": exempt, "lr": group_lr, "weight_decay": 0.0})

optimizer = torch.optim.AdamW(optimizer_grouped)
print(f"Model on {device}")

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 546.85it/s, Materializing param=pooler.dense.weight]                               
BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model on cpu


In [None]:
# 6) Train with early stopping + val F1 (aim for 80%+ F1)
from seqeval.metrics import f1_score

def run_validation(model, val_loader, device, id2label, num_labels):
    """Decode *val_loader* with *model* and score it with seqeval.

    Only positions whose gold label is not -100 are compared; per align_to_bert
    these are the first sub-tokens of words. A decoded id that is missing or
    >= num_labels counts as "O".

    Returns ``(f1, true_all, pred_all)``: the entity-level F1 (0.0 when nothing
    was collected) and the gold / predicted tag sequences, one list per sample.
    Leaves the model in eval mode.
    """
    model.eval()
    true_all, pred_all = [], []
    with torch.no_grad():
        for inp, mask, labels in val_loader:
            inp, mask = inp.to(device), mask.to(device)
            decoded = model(inp, mask)
            for row in range(inp.size(0)):
                attended = mask[row].cpu().tolist()
                gold_ids = labels[row].cpu().tolist()
                path = decoded[row]
                gold_tags, pred_tags = [], []
                for step, (keep, gold) in enumerate(zip(attended, gold_ids)):
                    if keep == 0:
                        # Padding starts here; nothing after it is real input.
                        break
                    usable = step < len(path) and path[step] < num_labels
                    tag = id2label[path[step]] if usable else "O"
                    if gold == -100:
                        continue
                    gold_tags.append(id2label[gold])
                    pred_tags.append(tag)
                if gold_tags and pred_tags:
                    true_all.append(gold_tags)
                    pred_all.append(pred_tags)
    if not true_all:
        return 0.0, true_all, pred_all
    return f1_score(true_all, pred_all, zero_division=0), true_all, pred_all

EPOCHS = 30
PATIENCE = 5  # stop if val F1 doesn't improve for this many epochs
best_f1 = 0.0
best_state = None
epochs_no_improve = 0

# LR schedule, stepped once per epoch: hold every group's LR at 10% of its base
# value for the warmup epochs, then decay linearly from 100% to 20% over the rest.
from torch.optim.lr_scheduler import LinearLR, SequentialLR, ConstantLR
warmup_epochs = max(1, EPOCHS // 10)
scheduler = SequentialLR(optimizer, [
    ConstantLR(optimizer, factor=0.1, total_iters=warmup_epochs),
    LinearLR(optimizer, start_factor=1.0, end_factor=0.2, total_iters=EPOCHS - warmup_epochs),
], milestones=[warmup_epochs])

for epoch in range(EPOCHS):
    model.train()
    total = 0.0
    for inp, mask, lab in train_loader:
        inp, mask, lab = inp.to(device), mask.to(device), lab.to(device)
        optimizer.zero_grad()
        loss = model(inp, mask, lab)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total += loss.item()
    # Without this call the schedule above never advances and the LR stays at
    # the warmup value for the whole run.
    scheduler.step()

    # Validate every epoch. The returned tag lists are discarded so the
    # notebook-level true_all / pred_all are not overwritten by a non-best epoch.
    epoch_f1 = run_validation(model, val_loader, device, ID2LABEL, NUM_LABELS)[0]
    print(f"Epoch {epoch+1}/{EPOCHS} Loss: {total/len(train_loader):.4f} Val F1: {epoch_f1:.4f}")

    if epoch_f1 > best_f1:
        best_f1 = epoch_f1
        # Snapshot on CPU so the copy does not occupy accelerator memory.
        best_state = {name: t.detach().cpu().clone() for name, t in model.state_dict().items()}
        epochs_no_improve = 0
    elif epoch >= warmup_epochs:
        # Warmup epochs run at a reduced LR and are not counted against patience.
        epochs_no_improve += 1
        if epochs_no_improve >= PATIENCE:
            print(f"Early stopping after epoch {epoch+1}: no val F1 improvement for {PATIENCE} epochs")
            break

# Leave `model` holding the best-scoring weights for the evaluation and save cells.
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"Restored best weights (val F1 {best_f1:.4f})")

Epoch 1/15 Loss: 403.4545


KeyboardInterrupt: 

---
### 8) Save model
Run after training and evaluation. Saves to current folder; in Colab set `SAVE_DIR` to a path in your Drive if you mounted it.

In [None]:
# Save model, tokenizer, and label config (run after training + evaluation)
import json
import os

# In Colab with Drive mounted, use e.g. "/content/drive/MyDrive/resume_ner"
# Otherwise saves in current directory
# Target directory: $RESUME_NER_SAVE_DIR if set, otherwise the current directory.
SAVE_DIR = os.environ.get("RESUME_NER_SAVE_DIR", ".")
os.makedirs(SAVE_DIR, exist_ok=True)
weights_path = os.path.join(SAVE_DIR, "bert_bilstm_crf_state.pt")
config_path = os.path.join(SAVE_DIR, "ner_config.json")

# 1) Model weights (state_dict only; the class definition is needed to reload)
torch.save(model.state_dict(), weights_path)

# 2) Label inventory and BERT checkpoint name, read back by the load cell
config = dict(tags=TAGS, bert_name="bert-base-uncased", num_labels=NUM_LABELS)
with open(config_path, "w", encoding="utf-8") as config_file:
    json.dump(config, config_file, indent=2)

# 3) Tokenizer files, so loading does not need to fetch the vocabulary again
tokenizer.save_pretrained(SAVE_DIR)

print("Saved:", SAVE_DIR)
print("  - bert_bilstm_crf_state.pt")
print("  - ner_config.json")
print("  - tokenizer files (vocab.txt, config.json from tokenizer)")

### 9) Run summary (for your report)
Run after cell 7 (Evaluate). Prints a short summary you can copy into your FYP write-up.

In [None]:
# Run summary (computes val metrics if true_all/pred_all not already from Evaluate cell)
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

# Reuse true_all / pred_all when an earlier cell already produced them;
# otherwise decode the validation set here.
try:
    _ = (true_all, pred_all)
except NameError:
    model.eval()
    true_all, pred_all = [], []
    with torch.no_grad():
        for inp, mask, labels in val_loader:
            inp, mask = inp.to(device), mask.to(device)
            preds = model(inp, mask)
            for b in range(inp.size(0)):
                attended = mask[b].cpu().tolist()
                gold_ids = labels[b].cpu().tolist()
                path = preds[b]
                gold_tags, pred_tags = [], []
                for step, (keep, gold) in enumerate(zip(attended, gold_ids)):
                    if keep == 0:
                        break
                    usable = step < len(path) and path[step] < NUM_LABELS
                    tag = ID2LABEL[path[step]] if usable else "O"
                    if gold == -100:
                        # Not a scored position (special token or continuation sub-token).
                        continue
                    gold_tags.append(ID2LABEL[gold])
                    pred_tags.append(tag)
                if gold_tags and pred_tags:
                    true_all.append(gold_tags)
                    pred_all.append(pred_tags)

f1, prec, rec = (
    f1_score(true_all, pred_all, zero_division=0),
    precision_score(true_all, pred_all, zero_division=0),
    recall_score(true_all, pred_all, zero_division=0),
)

print("--- Run summary (copy for report) ---")
print(f"Epochs: {EPOCHS}  |  Train size: {len(train_sents)}  |  Val size: {len(val_sents)}")
print(f"F1 (entity-level): {f1:.4f}")
print(f"Precision: {prec:.4f}  |  Recall: {rec:.4f}")
print("---")
print(classification_report(true_all, pred_all, zero_division=0))
print("--- End summary ---")

### 10) Load saved model (optional)
Use when you want to load a previously saved model without re-training. Run cells 1–5 first (data + tokenizer + device), then this cell. You can skip training (cell 6) and go straight to evaluation (cell 7).

In [None]:
# Load from a saved directory (set LOAD_DIR to where you saved, e.g. "." or "/content/drive/MyDrive/resume_ner")
# Relies on names from earlier cells: os and json (imported in cell 1), torch,
# device and the BertBiLSTMCRF class.
LOAD_DIR = os.environ.get("RESUME_NER_LOAD_DIR", ".")
with open(os.path.join(LOAD_DIR, "ner_config.json"), "r", encoding="utf-8") as f:
    load_config = json.load(f)

# Rebuild label mappings
# These overwrite the notebook-level TAGS / LABEL2ID / ID2LABEL / NUM_LABELS with the saved values.
TAGS = load_config["tags"]
LABEL2ID = {t: i for i, t in enumerate(TAGS)}
ID2LABEL = {i: t for i, t in enumerate(TAGS)}
NUM_LABELS = load_config["num_labels"]

# Recreate model and load weights
# NOTE: the model is built with the class-default dropout (0.3), not the 0.4 used by the
# training cell. It is put in eval mode right below, so this should only matter if the
# loaded model is trained further.
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained(LOAD_DIR)
model = BertBiLSTMCRF(bert_name=load_config["bert_name"], num_labels=NUM_LABELS).to(device)
# map_location places the saved tensors on the current device, whatever device they were saved from.
model.load_state_dict(torch.load(os.path.join(LOAD_DIR, "bert_bilstm_crf_state.pt"), map_location=device))
model.eval()

print("Model loaded from", LOAD_DIR)

In [None]:
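# Hybrid: NAME/EMAIL from rules (high recall), SKILL/EXPERIENCE/EDUCATION/OCCUPATION from model. parse_resume_hybrid is defined further down, in the cell that starts with `import re` and `def parse_resume`; run that cell before the sample-resume cells.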

### Parse a new resume (inference, hybrid)
Run after training (or after loading a saved model). Put resume text in `RESUME_TEXT` and run. NAME and EMAIL use rule-based extraction; SKILL, EXPERIENCE, EDUCATION, OCCUPATION use the model.

In [None]:
# Test with another sample resume.
# Requires parse_resume_hybrid, which is defined in a later cell of this notebook
# (the one that defines parse_resume); run that cell first, plus a trained or loaded model.
TEST_RESUME_TEXT = """
Sarah Chen
sarah.chen@example.com
(555) 123-4567

Senior Data Scientist with 8 years of experience in machine learning and NLP.

Skills: Python, TensorFlow, PyTorch, SQL, AWS, Spark, Natural Language Processing.

Education: PhD in Computer Science, MIT 2016. MSc Statistics, Stanford University 2012. BSc Mathematics, University of California Berkeley 2010.

Experience: Lead Data Scientist at Google 2019 Present. Data Scientist at Amazon 2016 2019. Research Intern at Microsoft 2015.

Worked at Meta and Netflix. Years of Experience: 8.
"""

# Returns the whitespace-split words, one predicted tag per word, and a dict of entity type -> phrases.
words2, tags2, entities2 = parse_resume_hybrid(TEST_RESUME_TEXT.strip(), tokenizer, model, device, ID2LABEL)
print("Entities extracted (hybrid):")
for k, v in entities2.items():
    print(f"  {k}: {v}")
print("\nWord-level tags (first 40):", list(zip(words2[:40], tags2[:40])))

In [None]:
# Unstructured resume (real-CV style: messy formatting, abbreviations, mixed sections)
# Requires parse_resume_hybrid, which is defined in a later cell of this notebook
# (the one that defines parse_resume); run that cell first, plus a trained or loaded model.
UNSTRUCTURED_RESUME_TEXT = """
RAJ PATEL   raj.patel@gmail.com   07123456789   London UK

PROFILE
Full-stack dev, 6+ yrs exp. Built APIs & web apps. Quick learner.

SKILLS  Python  Django  React  Node.js  PostgreSQL  Docker  AWS  Git  REST APIs

EDUCATION
BSc CS Univ of Birmingham 2018   A-Levels King Edward VI 2015

WORK
• 2021–now  Senior Developer @ FinTech Solutions Ltd  – led 2 devs, shipped payment API
• 2019–21   Developer  Acme Corp  – React frontends, bug fixes, 3 yrs of experience there
• 2018–19   Junior dev  StartupXYZ  – PHP, MySQL, learned agile

Other:  Freelance 2017–18.  References on request.  LinkedIn: linkedin.com/in/rajpatel
"""

# NOTE: the first line contains "@", so extract_name_heuristic skips it and moves on to the following lines.
words_u, tags_u, entities_u = parse_resume_hybrid(UNSTRUCTURED_RESUME_TEXT.strip(), tokenizer, model, device, ID2LABEL)
print("Entities extracted (unstructured CV):")
for k, v in entities_u.items():
    print(f"  {k}: {v}")
print("\nWord-level tags (first 50):", list(zip(words_u[:50], tags_u[:50])))

In [None]:
import re

def parse_resume(text, tokenizer, model, device, id2label, max_len=512):
    """Tokenize resume text, run NER, return (words, tags) and entity dict.

    text:      raw resume text; it is split on whitespace into words.
    tokenizer: object providing tokenize, convert_tokens_to_ids, unk_token
               and sep_token_id.
    model:     callable as ``model(input_ids, attention_mask)`` returning one
               list of tag ids per sequence; it is switched to eval mode.
    device:    device the input tensors are moved to.
    id2label:  mapping from tag id to tag string; unknown ids become "O".
    max_len:   maximum number of sub-token positions including [CLS]/[SEP].
               Words that do not fit are dropped from the result.

    Returns ``(words, pred_tags, entities)`` where ``pred_tags[i]`` is the tag
    predicted at the first sub-token of ``words[i]`` and ``entities`` maps an
    entity type to the list of phrases found (a B- tag followed by matching
    I- tags). Empty or whitespace-only text yields ``([], [], {})``.
    """
    words = re.findall(r"\S+", text)
    if not words:
        return [], [], {}

    first_idx, toks = [], ["[CLS]"]
    for w in words:
        sub = tokenizer.tokenize(w) or [tokenizer.unk_token]
        first_idx.append(len(toks))
        toks.extend(sub)
    toks.append("[SEP]")
    ids = tokenizer.convert_tokens_to_ids(toks)

    if len(ids) > max_len:
        ids = ids[: max_len - 1] + [tokenizer.sep_token_id]
        # The last kept position (max_len - 1) now holds the forced separator,
        # so a word starting there has lost its own sub-token and must be
        # dropped rather than tagged with the separator's prediction.
        first_idx = [i for i in first_idx if i < max_len - 1]
        words = words[: len(first_idx)]

    mask = [1] * len(ids)
    inp = torch.tensor([ids], dtype=torch.long).to(device)
    mask_t = torch.tensor([mask], dtype=torch.long).to(device)
    model.eval()
    with torch.no_grad():
        preds = model(inp, mask_t)
    pred_tags = [id2label.get(preds[0][i], "O") for i in first_idx]

    # Build entity dict: TYPE -> list of phrases. An I- tag with no preceding
    # B- tag of the same type does not start a phrase.
    entities = {}
    i = 0
    while i < len(words):
        tag = pred_tags[i] if i < len(pred_tags) else "O"
        if not tag.startswith("B-"):
            i += 1
            continue
        entity_type = tag[2:]
        phrase = [words[i]]
        i += 1
        while i < len(words) and i < len(pred_tags) and pred_tags[i] == f"I-{entity_type}":
            phrase.append(words[i])
            i += 1
        entities.setdefault(entity_type, []).append(" ".join(phrase))
    return words, pred_tags, entities

# Hybrid: rules for NAME/EMAIL (high recall), model for SKILL/EXPERIENCE/EDUCATION/OCCUPATION
EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", re.IGNORECASE)


def extract_email_rules(text):
    """Return every e-mail address matched by EMAIL_RE in *text*.

    Addresses appear in order of first occurrence; exact duplicates are dropped.
    """
    unique = []
    for address in EMAIL_RE.findall(text):
        if address not in unique:
            unique.append(address)
    return unique
def extract_name_heuristic(text):
    """Guess the candidate's name from the first lines of a resume.

    Looks at the first four non-blank lines and returns the first one that:
      * contains no "@", "http" or "www." (contact / link lines),
      * has 1 to 4 whitespace-separated parts,
      * has at least one part starting with a letter, and every such part
        starts with an uppercase letter,
      * is shorter than 80 characters and does not end with ".".

    Returns a one-element list with that line (parts joined by single spaces),
    or an empty list when no line qualifies.
    """
    lines = [ln.strip() for ln in text.strip().split("\n") if ln.strip()]
    for line in lines[:4]:
        lowered = line.lower()
        if "@" in line or "http" in lowered or "www." in lowered:
            continue
        parts = line.split()
        if not 1 <= len(parts) <= 4:
            continue
        alpha_parts = [p for p in parts if p[0].isalpha()]
        # Require at least one alphabetic part. Otherwise a line made only of
        # digits/punctuation (e.g. a phone number) passes the uppercase check
        # vacuously and is returned as the name.
        if not alpha_parts or not all(p[0].isupper() for p in alpha_parts):
            continue
        candidate = " ".join(parts)
        if len(candidate) < 80 and not candidate.endswith("."):
            return [candidate]
    return []
def parse_resume_hybrid(text, tokenizer, model, device, id2label, max_len=512):
    """Run the NER model, then override NAME and EMAIL with rule-based results.

    The model's NAME / EMAIL entries are replaced only when the matching rule
    found something; all other entity types come from the model unchanged.
    Returns the same ``(words, pred_tags, entities)`` triple as parse_resume.
    """
    cleaned = text.strip()
    rule_names = extract_name_heuristic(cleaned)
    rule_emails = extract_email_rules(cleaned)
    words, pred_tags, entities = parse_resume(cleaned, tokenizer, model, device, id2label, max_len)
    for entity_type, found in (("NAME", rule_names), ("EMAIL", rule_emails)):
        if found:
            entities[entity_type] = found
    return words, pred_tags, entities

# --- Example: set your resume text and run ---
# Needs a trained or loaded `model`, plus `tokenizer`, `device` and ID2LABEL from earlier cells.
RESUME_TEXT = """
John Doe
john.doe@email.com
Software Engineer with 5 years of experience.
Skills: Python, Java, Machine Learning.
Education: BSc Computer Science, University of Colombo 2020.
Worked at Tech Corp and Data Inc.
"""

# `tags` holds one predicted tag per whitespace-separated word in `words`.
words, tags, entities = parse_resume_hybrid(RESUME_TEXT.strip(), tokenizer, model, device, ID2LABEL)
print("Entities (hybrid: NAME/EMAIL from rules, rest from model):")
for k, v in entities.items():
    print(f"  {k}: {v}")
print("\nWord-level tags (first 30):", list(zip(words[:30], tags[:30])))

### 11) Optional: Train longer with LR scheduler
Run this instead of cell 6 if you want a fixed-length run (`EPOCHS_EXTRA`, 25 by default; cell 6 sets `EPOCHS = 30`) with plain linear learning-rate decay. Then run cell 7 to evaluate and cell 8 to save.

In [None]:
# Optional: more epochs + linear LR decay (run instead of cell 6)
# Reuses the existing `model`, `optimizer` and `train_loader`. No validation or
# early stopping happens here; the loop always runs EPOCHS_EXTRA epochs.
EPOCHS_EXTRA = 25
# Scales every parameter group's LR linearly from 100% down to 10% over EPOCHS_EXTRA scheduler steps.
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.1, total_iters=EPOCHS_EXTRA)

for epoch in range(EPOCHS_EXTRA):
    model.train()
    total = 0
    for inp, mask, lab in train_loader:
        inp, mask, lab = inp.to(device), mask.to(device), lab.to(device)
        optimizer.zero_grad()
        # With labels supplied, the model's forward returns the training loss.
        loss = model(inp, mask, lab)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total += loss.item()
    # One scheduler step per epoch (not per batch).
    scheduler.step()
    # get_last_lr()[0] is the LR of the first parameter group only.
    print(f"Epoch {epoch+1}/{EPOCHS_EXTRA} Loss: {total/len(train_loader):.4f} LR: {scheduler.get_last_lr()[0]:.2e}")

In [None]:
# 7) Evaluate with seqeval
from seqeval.metrics import classification_report, f1_score

# Decode the validation set and keep gold / predicted tags at the scored
# positions (those whose gold label is not -100).
model.eval()
true_all, pred_all = [], []
with torch.no_grad():
    for inp, mask, labels in val_loader:
        inp, mask = inp.to(device), mask.to(device)
        preds = model(inp, mask)
        for b in range(inp.size(0)):
            attended = mask[b].cpu().tolist()
            gold_ids = labels[b].cpu().tolist()
            path = preds[b]
            gold_tags, pred_tags = [], []
            for step, (keep, gold) in enumerate(zip(attended, gold_ids)):
                if keep == 0:
                    break
                usable = step < len(path) and path[step] < NUM_LABELS
                tag = ID2LABEL[path[step]] if usable else "O"
                if gold == -100:
                    continue
                gold_tags.append(ID2LABEL[gold])
                pred_tags.append(tag)
            if gold_tags and pred_tags:
                true_all.append(gold_tags)
                pred_all.append(pred_tags)

print(classification_report(true_all, pred_all, zero_division=0))
val_f1 = f1_score(true_all, pred_all, zero_division=0)
print("Val F1 (entity-level):", val_f1)
# Token-level accuracy over the scored positions (often higher than entity F1; 80%+ here is a good target)
total_tok = sum(map(len, true_all))
correct_tok = sum(
    gold == guess
    for gold_seq, pred_seq in zip(true_all, pred_all)
    for gold, guess in zip(gold_seq, pred_seq)
)
token_acc = correct_tok / total_tok if total_tok else 0.0
print("Token accuracy: {:.2%}".format(token_acc))

              precision    recall  f1-score   support

   EDUCATION       0.00      0.00      0.00        43
       EMAIL       0.00      0.00      0.00        27
  EXPERIENCE       0.00      0.00      0.00        46
        NAME       0.00      0.00      0.00        23
  OCCUPATION       0.00      0.00      0.00        37
       SKILL       0.00      0.00      0.00        15

   micro avg       0.00      0.00      0.00       191
   macro avg       0.00      0.00      0.00       191
weighted avg       0.00      0.00      0.00       191

F1: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 7b) Test on test set (formal test metrics)
Run after cell 7. Builds a test DataLoader and reports F1/precision/recall on the **test** split (never used in training). Use this for your final reported test performance.

In [None]:
# Test set evaluation (same logic as cell 7, but on test_sents / test_labels)
from seqeval.metrics import classification_report, f1_score

# Encode the held-out test split the same way as train/val (batch size 8, no shuffling).
test_ds = BertNERDataset(test_sents, test_labels, tokenizer)
test_loader = DataLoader(test_ds, batch_size=8, collate_fn=collate)

# Decode and keep gold / predicted tags at positions whose gold label is not -100.
model.eval()
true_test, pred_test = [], []
with torch.no_grad():
    for inp, mask, labels in test_loader:
        inp, mask = inp.to(device), mask.to(device)
        preds = model(inp, mask)
        for b in range(inp.size(0)):
            attended = mask[b].cpu().tolist()
            gold_ids = labels[b].cpu().tolist()
            path = preds[b]
            gold_tags, pred_tags = [], []
            for step, (keep, gold) in enumerate(zip(attended, gold_ids)):
                if keep == 0:
                    break
                usable = step < len(path) and path[step] < NUM_LABELS
                tag = ID2LABEL[path[step]] if usable else "O"
                if gold == -100:
                    continue
                gold_tags.append(ID2LABEL[gold])
                pred_tags.append(tag)
            if gold_tags and pred_tags:
                true_test.append(gold_tags)
                pred_test.append(pred_tags)

print("--- Test set results ---")
print(classification_report(true_test, pred_test, zero_division=0))
print("Test F1:", f1_score(true_test, pred_test, zero_division=0))