# BERT-BiLSTM-CRF for Resume NER (rule-compliant)

Run cells in order.

**Run in Cursor:** Use a **local Python kernel** (not Colab): click the kernel name top-right → select your project’s Python (e.g. conda/venv). Then run cells with **Shift+Enter** or ▶. The JSON file is loaded from the same folder as this notebook.

In [20]:
# 1) Load data — set DATA_PATH to your JSON file path if needed
import json
import os

DATA_PATH = "entity_recognition_in_resumes.json"
if not os.path.exists(DATA_PATH):
    DATA_PATH = "/content/drive/My Drive/DATASETS/entity_recognition_in_resumes.json"
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError("JSON file not found. Set DATA_PATH in this cell to the path of entity_recognition_in_resumes.json and re-run.")

data = []
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            data.append(json.loads(line))
print(f"Loaded {len(data)} resumes")

LABEL_MAPPING = {
    "Name": "NAME", "Email Address": "EMAIL", "Skills": "SKILL", "Designation": "OCCUPATION",
    "Degree": "EDUCATION", "College Name": "EDUCATION", "Graduation Year": "EDUCATION",
    "Companies worked at": "EXPERIENCE", "Years of Experience": "EXPERIENCE", "Location": "O", "UNKNOWN": "O",
}
for item in data:
    for ann in item.get("annotation", []):
        ann["label"] = [LABEL_MAPPING.get(l, "O") for l in ann["label"]]

Loaded 220 resumes


**Colab only – use if upload widget didn't work:** Run the cell below to mount Google Drive. Put `entity_recognition_in_resumes.json` in your Drive (e.g. in My Drive), then in **cell 1** set `DATA_PATH = '/content/drive/MyDrive/entity_recognition_in_resumes.json'` (or the path where you put it) and re-run cell 1.

In [21]:
# Colab: Mount Google Drive (run this, then set DATA_PATH in cell 1 to your file path in Drive)
try:
    from google.colab import drive
    drive.mount("/content/drive")
    print("Drive mounted. Put entity_recognition_in_resumes.json in My Drive, then set DATA_PATH in cell 1 and re-run it.")
except ImportError:
    print("Not in Colab – skip this cell.")

Not in Colab – skip this cell.


In [22]:
# 2) Build train/val/test from JSON with fixed create_bio_tags (no B-O / I-O)
import re
import random

def tokenize_with_positions(text):
    return [(m.group(), m.start(), m.end()) for m in re.finditer(r"\S+", text)]

def create_bio_tags_fixed(tokens, annotations):
    bio = ["O"] * len(tokens)
    for ann in annotations:
        if not ann.get("label") or ann["label"][0] == "O":
            continue
        entity = ann["label"][0]
        for pt in ann.get("points", []):
            s, e = pt["start"], pt["end"]
            first = True
            for i, (_, ts, te) in enumerate(tokens):
                if te <= s or ts >= e: continue
                bio[i] = f"B-{entity}" if first else f"I-{entity}"
                first = False
    return bio

all_sents, all_labels = [], []
for item in data:
    content = item.get("content", "")
    anns = item.get("annotation", [])
    if not content or not anns: continue
    toks = tokenize_with_positions(content)
    if not toks: continue
    labs = create_bio_tags_fixed(toks, anns)
    all_sents.append([t[0] for t in toks])
    all_labels.append(labs)

n = len(all_sents)
random.seed(42)
idx = list(range(n)); random.shuffle(idx)
n_train, n_val = int(0.8 * n), int(0.1 * n)
train_sents = [all_sents[i] for i in idx[:n_train]]
train_labels = [all_labels[i] for i in idx[:n_train]]
val_sents   = [all_sents[i] for i in idx[n_train:n_train+n_val]]
val_labels  = [all_labels[i] for i in idx[n_train:n_train+n_val]]
test_sents  = [all_sents[i] for i in idx[n_train+n_val:]]
test_labels = [all_labels[i] for i in idx[n_train+n_val:]]
print(f"Train {len(train_sents)} Val {len(val_sents)} Test {len(test_sents)}")

Train 176 Val 22 Test 22


In [23]:
# 3) Install deps (run once; skip if already installed)
!pip install -q torch transformers pytorch-crf seqeval

python(8270) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [24]:
# 4) BERT tokenizer + label alignment and dataset
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

TAGS = ["O","B-NAME","I-NAME","B-EMAIL","I-EMAIL","B-SKILL","I-SKILL","B-OCCUPATION","I-OCCUPATION","B-EXPERIENCE","I-EXPERIENCE","B-EDUCATION","I-EDUCATION"]
LABEL2ID = {t:i for i,t in enumerate(TAGS)}
ID2LABEL = {i:t for i,t in enumerate(TAGS)}
NUM_LABELS = len(TAGS)

def align_to_bert(words, word_labels, tokenizer, max_len=512):
    first_idx, toks = [], ["[CLS]"]
    for w in words:
        p = tokenizer.tokenize(w) or [tokenizer.unk_token]
        first_idx.append(len(toks))
        toks.extend(p)
    toks.append("[SEP]")
    ids = tokenizer.convert_tokens_to_ids(toks)
    mask = [1]*len(ids)
    aligned = [-100]*len(ids)
    for pos, lab in zip(first_idx, word_labels):
        if pos < len(aligned): aligned[pos] = LABEL2ID.get(lab, 0)
    if len(ids) > max_len:
        ids, mask, aligned = ids[:max_len-1]+[tokenizer.sep_token_id], mask[:max_len-1]+[1], aligned[:max_len-1]+[-100]
    return ids, mask, aligned

class BertNERDataset(Dataset):
    def __init__(self, sents, labels, tokenizer, max_len=512):
        self.samples = [align_to_bert(w, l, tokenizer, max_len) for w, l in zip(sents, labels) if len(w)==len(l)]
    def __len__(self): return len(self.samples)
    def __getitem__(self, i): return self.samples[i]

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_ds = BertNERDataset(train_sents, train_labels, tokenizer)
val_ds   = BertNERDataset(val_sents, val_labels, tokenizer)

def collate(batch):
    max_l = max(len(b[0]) for b in batch)
    pad = 0
    return (
        torch.tensor([b[0]+[pad]*(max_l-len(b[0])) for b in batch], dtype=torch.long),
        torch.tensor([b[1]+[0]*(max_l-len(b[1])) for b in batch], dtype=torch.long),
        torch.tensor([b[2]+[-100]*(max_l-len(b[2])) for b in batch], dtype=torch.long),
    )
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=collate)
val_loader   = DataLoader(val_ds, batch_size=8, collate_fn=collate)
print("Datasets ready")

Datasets ready


In [25]:
# 5) BERT-BiLSTM-CRF model
import torch.nn as nn
from transformers import BertModel
from torchcrf import CRF

class BertBiLSTMCRF(nn.Module):
    def __init__(self, bert_name="bert-base-uncased", hidden_dim=256, num_labels=NUM_LABELS, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_name)
        self.lstm = nn.LSTM(self.bert.config.hidden_size, hidden_dim//2, num_layers=1, bidirectional=True, batch_first=True)
        self.drop = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        out, _ = self.lstm(self.drop(out))
        emissions = self.fc(self.drop(out))
        mask_b = attention_mask.bool()  # boolean avoids torch.where uint8 deprecation in pytorch-crf
        if labels is not None:
            # CRF expects indices in [0, num_labels-1]; replace padding -100 with 0 (mask ignores those)
            labels = labels.clone().masked_fill(labels == -100, 0)
            return -self.crf(emissions, labels, mask=mask_b, reduction="mean")
        return self.crf.decode(emissions, mask=mask_b)

# Prefer GPU (CUDA or Apple Silicon MPS), else CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = torch.device("mps")  # Apple Silicon GPU
else:
    device = torch.device("cpu")
model = BertBiLSTMCRF().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
print(f"Model on {device}")

Model on mps


In [26]:
# 6) Train (more epochs help with small data; gradient clipping stabilizes)
EPOCHS = 15
for epoch in range(EPOCHS):
    model.train()
    total = 0
    for inp, mask, lab in train_loader:
        inp, mask, lab = inp.to(device), mask.to(device), lab.to(device)
        optimizer.zero_grad()
        loss = model(inp, mask, lab)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total += loss.item()
    print(f"Epoch {epoch+1}/{EPOCHS} Loss: {total/len(train_loader):.4f}")

Epoch 1/15 Loss: 506.0813
Epoch 2/15 Loss: 212.6201


KeyboardInterrupt: 

In [None]:
# 7) Evaluate with seqeval
from seqeval.metrics import classification_report, f1_score

model.eval()
true_all, pred_all = [], []
with torch.no_grad():
    for inp, mask, labels in val_loader:
        inp, mask = inp.to(device), mask.to(device)
        preds = model(inp, mask)
        for b in range(inp.size(0)):
            m, labs = mask[b].cpu(), labels[b].cpu()
            pred_b = preds[b]
            tlist, plist = [], []
            pos = 0
            for i in range(m.size(0)):
                if m[i].item()==0: break
                p = ID2LABEL[pred_b[pos]] if pos < len(pred_b) and pred_b[pos] < NUM_LABELS else "O"
                pos += 1
                if labs[i].item() == -100: continue
                tlist.append(ID2LABEL[labs[i].item()])
                plist.append(p)
            if tlist and plist:
                true_all.append(tlist)
                pred_all.append(plist)

print(classification_report(true_all, pred_all, zero_division=0))
print("F1:", f1_score(true_all, pred_all, zero_division=0))

              precision    recall  f1-score   support

   EDUCATION       0.00      0.00      0.00        43
       EMAIL       0.00      0.00      0.00        27
  EXPERIENCE       0.00      0.00      0.00        46
        NAME       0.00      0.00      0.00        23
  OCCUPATION       0.00      0.00      0.00        37
       SKILL       0.00      0.00      0.00        15

   micro avg       0.00      0.00      0.00       191
   macro avg       0.00      0.00      0.00       191
weighted avg       0.00      0.00      0.00       191

F1: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
