# Job Poster NER with BiLSTM-CRF from Scratch (Path 2 – FYP)

This notebook implements **Path 2** for job poster NER: a **Word2Vec + BiLSTM + CRF** model built from scratch (no pre-trained transformer). Entity types: **JOB_TITLE**, **COMPANY**, **LOCATION**, **SALARY**, **SKILLS_REQUIRED**, **EXPERIENCE_REQUIRED**, **EDUCATION_REQUIRED**, **JOB_TYPE**.

- **Feature engineering:** Word embeddings (Word2Vec) trained on job posting corpus
- **Model:** Embedding → BiLSTM → Linear → CRF
- **Training:** Full training loop with CRF loss
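- **Data:** a merged job-poster NER file read as JSON Lines (one posting per line). `merged_job_poster_ner_full.json` is preferred, falling back to `merged_job_poster_ner_with_llm.json` and then `merged_job_poster_ner.json` (e.g. SkillSpan ~11.5k sentences)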

## Dependencies

Run once. (No `transformers`; uses `gensim` for Word2Vec.)

In [None]:
# Notebook shell magic: quietly install the third-party packages the cells below import
# (`torchcrf` is imported from the `pytorch-crf` distribution).
!pip install -q torch pytorch-crf seqeval gensim

## Mount Google Drive

In [None]:
# Mount Google Drive when running inside Colab; outside Colab the import fails and the
# notebook falls back to local paths.
# Only the import is guarded: an ImportError raised while mounting is a real failure and
# must not be misreported as "Not in Colab".
try:
    from google.colab import drive
except ImportError:
    print("Not in Colab – skip this cell when running locally.")
else:
    drive.mount("/content/drive")
    print("Drive mounted.")

## 1. Data loading

In [None]:
import json
import os

# Resolve the dataset location. Drive copies are tried before the local pipeline folder,
# and within each location the order is full > with_llm > base.
_drive_base = "/content/drive/My Drive"
if os.path.exists("/content/drive/MyDrive"):
    _drive_base = "/content/drive/MyDrive"

_dataset_names = ["merged_job_poster_ner_full.json", "merged_job_poster_ner_with_llm.json", "merged_job_poster_ner.json"]
_candidate_paths = [os.path.join(_drive_base, _fname) for _fname in _dataset_names]
_candidate_paths += ["../../job_poster_ner_pipeline/" + _fname for _fname in _dataset_names]

# The first existing candidate wins; if the loop runs dry nothing was found anywhere.
for DATA_PATH in _candidate_paths:
    if os.path.exists(DATA_PATH):
        break
else:
    raise FileNotFoundError("No data file found. Place merged_job_poster_ner_full.json in Drive or job_poster_ner_pipeline/.")

# The file is parsed as JSON Lines: every non-blank line is one JSON document.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    _stripped_lines = (raw_line.strip() for raw_line in f)
    data = [json.loads(record) for record in _stripped_lines if record]
print(f"Loaded {len(data)} job postings from {os.path.basename(DATA_PATH)}")

# Maps every label spelling seen in the merged sources onto the canonical entity names.
# Canonical names map to themselves; anything not listed here is treated as "O".
LABEL_MAPPING = {
    "JOB_TITLE": "JOB_TITLE", "COMPANY": "COMPANY", "LOCATION": "LOCATION", "SALARY": "SALARY",
    "SKILLS_REQUIRED": "SKILLS_REQUIRED", "EXPERIENCE_REQUIRED": "EXPERIENCE_REQUIRED",
    "EDUCATION_REQUIRED": "EDUCATION_REQUIRED", "JOB_TYPE": "JOB_TYPE", "O": "O",
    "Job Title": "JOB_TITLE", "Company": "COMPANY", "Location": "LOCATION", "Salary": "SALARY",
    "Skill": "SKILLS_REQUIRED", "Occupation": "JOB_TITLE", "Qualification": "EDUCATION_REQUIRED",
}

# Normalise each annotation's label list in place.
for item in data:
    # `"annotation": null` and a missing key are both treated as "no annotations".
    for ann in item.get("annotation") or []:
        raw_labels = ann.get("label") or []
        # A bare string would otherwise be iterated character by character and the
        # entity would silently collapse to "O"; wrap it so it is mapped as one label.
        if isinstance(raw_labels, str):
            raw_labels = [raw_labels]
        ann["label"] = [LABEL_MAPPING.get(l, "O") for l in raw_labels]

## 2. Preprocessing and train/val/test split

In [None]:
import re
import random

def tokenize_with_positions(text):
    """Split *text* into whitespace-delimited tokens with character offsets.

    Returns a list of ``(token, start, end)`` tuples, one per run of
    non-whitespace characters, where ``text[start:end] == token``.
    """
    spans = []
    for match in re.compile(r"\S+").finditer(text):
        begin, stop = match.span()
        spans.append((text[begin:stop], begin, stop))
    return spans

def create_bio_tags_fixed(tokens, annotations):
    """Project character-span annotations onto tokens as BIO tags.

    Args:
        tokens: sequence of ``(text, start, end)`` tuples.
        annotations: iterable of dicts with a ``"label"`` list (only the first
            entry is used) and a ``"points"`` list of ``{"start", "end"}`` spans.

    Returns:
        A list with one tag per token. Within each span the first overlapping
        token gets ``B-<entity>`` and the remaining ones ``I-<entity>``.
        Annotations with no label, or whose first label is ``"O"``, are ignored.
        Later annotations overwrite tags written by earlier ones.
    """
    tags = ["O" for _ in tokens]
    for annotation in annotations:
        label_list = annotation.get("label")
        if not label_list:
            continue
        entity = label_list[0]
        if entity == "O":
            continue
        for span in annotation.get("points", []):
            span_start, span_end = span["start"], span["end"]
            # A token belongs to the span when the two half-open ranges intersect.
            overlapping = [
                pos
                for pos, (_, tok_start, tok_end) in enumerate(tokens)
                if tok_end > span_start and tok_start < span_end
            ]
            for rank, pos in enumerate(overlapping):
                prefix = "B" if rank == 0 else "I"
                tags[pos] = f"{prefix}-{entity}"
    return tags

# Convert each posting that has both text and annotations into parallel
# token / BIO-tag sequences; postings without either are dropped.
all_sents, all_labels = [], []
for record in data:
    text, spans = record.get("content", ""), record.get("annotation", [])
    if text and spans:
        positioned = tokenize_with_positions(text)
        if positioned:
            bio = create_bio_tags_fixed(positioned, spans)
            all_sents.append([tok for tok, _, _ in positioned])
            all_labels.append(bio)

# Seeded shuffle of the indices, then an 80/10/10 split (test takes the remainder).
n = len(all_sents)
random.seed(42)
idx = list(range(n))
random.shuffle(idx)
n_train, n_val = int(0.8 * n), int(0.1 * n)


def _subset(indices):
    """Return the (sentences, labels) pair selected by *indices*, in that order."""
    return [all_sents[i] for i in indices], [all_labels[i] for i in indices]


train_sents, train_labels = _subset(idx[:n_train])
val_sents, val_labels = _subset(idx[n_train : n_train + n_val])
test_sents, test_labels = _subset(idx[n_train + n_val :])
print(f"Train {len(train_sents)} Val {len(val_sents)} Test {len(test_sents)}")

## 3. Word embeddings (Word2Vec) and vocabulary

In [None]:
from gensim.models import Word2Vec
import numpy as np

EMBED_DIM = 256
W2V_MIN_COUNT = 1
W2V_WINDOW = 6
W2V_EPOCHS = 35

# Fit Word2Vec on the training split only. Fitting on all_sents would put every
# validation/test word into the embeddings and into word2id, so the held-out scores
# would never exercise the <UNK> path that genuinely unseen postings hit at inference.
w2v = Word2Vec(sentences=train_sents, vector_size=EMBED_DIM, window=W2V_WINDOW, min_count=W2V_MIN_COUNT, epochs=W2V_EPOCHS, workers=4)

PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"
# Ids 0 and 1 are reserved; every Word2Vec vocabulary word takes the next free id.
word2id = {PAD_TOKEN: 0, UNK_TOKEN: 1}
for w in w2v.wv.key_to_index:
    word2id.setdefault(w, len(word2id))
vocab_size = len(word2id)

# Every non-special entry of word2id came from w2v.wv, so each one has a trained vector
# (the former random-init fallback could never run and has been removed).
embedding_matrix = np.zeros((vocab_size, EMBED_DIM), dtype=np.float32)
for w, i in word2id.items():
    if w in (PAD_TOKEN, UNK_TOKEN):
        continue
    embedding_matrix[i] = w2v.wv[w]

# <PAD> stays all-zero. <UNK> starts at the mean word vector instead of zeros: with
# W2V_MIN_COUNT = 1 no training token maps to <UNK>, so its row gets no task gradient
# and the initial value is what out-of-vocabulary words look like at inference time.
embedding_matrix[word2id[UNK_TOKEN]] = w2v.wv.vectors.mean(axis=0)

print(f"Vocab size: {vocab_size}, Embed dim: {EMBED_DIM}")

## 4. Labels and dataset (word-level, padded)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Label inventory: "O" followed by a B-/I- pair per entity type. The order below fixes
# the integer id of every tag, so it must not be rearranged.
_ENTITY_TYPES = (
    "JOB_TITLE", "COMPANY", "LOCATION", "SALARY",
    "SKILLS_REQUIRED", "EXPERIENCE_REQUIRED", "EDUCATION_REQUIRED", "JOB_TYPE",
)
TAGS = ["O"]
for _entity in _ENTITY_TYPES:
    TAGS.extend((f"B-{_entity}", f"I-{_entity}"))
LABEL2ID = dict(zip(TAGS, range(len(TAGS))))
ID2LABEL = dict(enumerate(TAGS))
NUM_LABELS = len(TAGS)

class WordNERDataset(Dataset):
    """Word-level NER dataset holding ``(token_ids, label_ids, length)`` triples.

    Sentences that are empty or whose token and label counts differ are dropped.
    Longer sentences are cut to ``max_len`` tokens. Words missing from ``word2id``
    map to the ``UNK_TOKEN`` id, and tags missing from ``LABEL2ID`` map to id 0.
    """

    def __init__(self, sents, labels, word2id, max_len=512):
        self.samples = []
        for words, labs in zip(sents, labels):
            usable = len(words) > 0 and len(words) == len(labs)
            if not usable:
                continue
            # Slicing is a no-op for sentences already within the limit.
            words, labs = words[:max_len], labs[:max_len]
            token_ids = [word2id.get(tok, word2id[UNK_TOKEN]) for tok in words]
            tag_ids = [LABEL2ID.get(tag, 0) for tag in labs]
            self.samples.append((token_ids, tag_ids, len(token_ids)))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, i):
        return self.samples[i]

def collate_pad(batch):
    """Pad ``(token_ids, label_ids, length)`` samples to the longest one in *batch*.

    Returns ``(ids, mask, labels)`` as ``torch.long`` tensors. Token ids are padded
    with the ``PAD_TOKEN`` id, labels with ``-100``, and the mask is 1 on real
    positions and 0 on padding.
    """
    longest = max(length for _, _, length in batch)
    pad_id = word2id[PAD_TOKEN]
    id_rows, label_rows, mask_rows = [], [], []
    for token_ids, label_ids, length in batch:
        gap = longest - length
        id_rows.append(token_ids + [pad_id] * gap)
        label_rows.append(label_ids + [-100] * gap)
        mask_rows.append([1] * length + [0] * gap)
    ids = torch.tensor(id_rows, dtype=torch.long)
    labels = torch.tensor(label_rows, dtype=torch.long)
    mask = torch.tensor(mask_rows, dtype=torch.long)
    return ids, mask, labels

MAX_LEN = 512
train_ds = WordNERDataset(train_sents, train_labels, word2id, MAX_LEN)
val_ds = WordNERDataset(val_sents, val_labels, word2id, MAX_LEN)

# Every tag except "O" counts as an entity tag.
entity_tags = {tag for tag in TAGS if tag != "O"}

# Sentences containing at least one entity tag are sampled 3.5x as often as all-"O"
# sentences, to keep the model from collapsing to all-"O" predictions. The length
# filter matches the one in WordNERDataset so the weights line up with train_ds.
train_weights = []
for _words, _tags in zip(train_sents, train_labels):
    if len(_words) == 0 or len(_words) != len(_tags):
        continue
    _has_entity = not entity_tags.isdisjoint(_tags)
    train_weights.append(3.5 if _has_entity else 1.0)

from torch.utils.data import WeightedRandomSampler
train_sampler = WeightedRandomSampler(weights=train_weights, num_samples=len(train_weights))
train_loader = DataLoader(train_ds, batch_size=8, sampler=train_sampler, collate_fn=collate_pad)
val_loader = DataLoader(val_ds, batch_size=12, collate_fn=collate_pad)
print("Datasets ready")

## 5. Model: BiLSTM-CRF (from scratch)

In [None]:
import torch.nn as nn
from torchcrf import CRF

class BiLSTMCRF(nn.Module):
    """Embedding -> dropout -> 2-layer BiLSTM -> dropout -> linear emissions -> CRF.

    ``forward`` has two modes: with ``labels`` it returns the negated, mean-reduced
    CRF log-likelihood (a loss to minimise); without them it returns the decoded
    tag-id sequences from ``CRF.decode``.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels, embedding_matrix=None, dropout=0.35):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        if embedding_matrix is not None:
            # Start from the supplied vectors and keep them trainable.
            pretrained = torch.from_numpy(embedding_matrix)
            with torch.no_grad():
                self.embed.weight.copy_(pretrained)
            self.embed.weight.requires_grad_(True)
        # Each direction gets half of hidden_dim so the concatenated output feeds self.fc.
        per_direction = hidden_dim // 2
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=per_direction,
            num_layers=2,
            dropout=0.2,
            bidirectional=True,
            batch_first=True,
        )
        self.drop = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_dim, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, mask, labels=None):
        embedded = self.drop(self.embed(input_ids))
        lstm_out, _ = self.lstm(embedded)
        emissions = self.fc(self.drop(lstm_out))
        valid = mask.bool()
        if labels is None:
            return self.crf.decode(emissions, mask=valid)
        # -100 padding labels are replaced by 0 before being handed to the CRF;
        # those positions are excluded through the mask.
        safe_labels = labels.masked_fill(labels == -100, 0)
        log_likelihood = self.crf(emissions, safe_labels, mask=valid, reduction="mean")
        return -log_likelihood

HIDDEN_DIM = 384

# Device preference: CUDA, then Apple MPS (when this torch build exposes it), then CPU.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    _device_name = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    _device_name = "mps"
else:
    _device_name = "cpu"
device = torch.device(_device_name)

model = BiLSTMCRF(
    vocab_size,
    EMBED_DIM,
    HIDDEN_DIM,
    NUM_LABELS,
    embedding_matrix=embedding_matrix,
    dropout=0.35,
)
model = model.to(device)
# Lower LR (5e-4) helps prevent collapse to all-O; raise to 1e-3 if training is too slow
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=5e-5)
print(f"Model on {device}")

## 6. Training (early stopping, validation F1)

In [None]:
from seqeval.metrics import f1_score
from torch.optim.lr_scheduler import LinearLR, SequentialLR, ConstantLR

def run_validation(model, val_loader, device, id2label, num_labels):
    """Return the seqeval F1 of *model* over *val_loader*.

    The model is switched to eval mode and run without gradients. Each sentence is
    read up to its first masked position; positions labelled ``-100`` are skipped.
    A predicted id that is missing or not below ``num_labels`` is scored as ``"O"``.
    Returns ``0.0`` when no sentence produced any tags.
    """
    model.eval()
    gold_seqs, pred_seqs = [], []
    with torch.no_grad():
        for ids, mask, labels in val_loader:
            decoded = model(ids.to(device), mask.to(device))
            # Plain Python lists avoid a tensor round-trip per token.
            for mask_row, label_row, pred_row in zip(mask.tolist(), labels.tolist(), decoded):
                gold, guess = [], []
                for pos, (keep, lab) in enumerate(zip(mask_row, label_row)):
                    if keep == 0:
                        break
                    if lab == -100:
                        continue
                    gold.append(id2label[lab])
                    in_range = pos < len(pred_row) and pred_row[pos] < num_labels
                    guess.append(id2label[pred_row[pos]] if in_range else "O")
                if gold and guess:
                    gold_seqs.append(gold)
                    pred_seqs.append(guess)
    if not gold_seqs:
        return 0.0
    return f1_score(gold_seqs, pred_seqs, zero_division=0)

# Training driver: up to EPOCHS passes over train_loader, validating after every epoch,
# keeping the best-scoring weights and stopping early after PATIENCE epochs without gain.
EPOCHS = 80
PATIENCE = 20
# At least 2 warm-up epochs; with EPOCHS = 80 this evaluates to 5.
warmup_epochs = max(2, EPOCHS // 15)
# Two-phase schedule, stepped once per epoch (see scheduler.step() below):
#   1. ConstantLR with factor 0.1 for the warm-up epochs,
#   2. LinearLR moving the factor from 1.0 to 0.15 over the remaining epochs.
scheduler = SequentialLR(optimizer, [
    ConstantLR(optimizer, factor=0.1, total_iters=warmup_epochs),
    LinearLR(optimizer, start_factor=1.0, end_factor=0.15, total_iters=EPOCHS - warmup_epochs),
], milestones=[warmup_epochs])
best_f1 = 0.0
best_state = None  # stays None until some epoch scores a validation F1 above 0.0
epochs_no_improve = 0

for epoch in range(EPOCHS):
    # run_validation() leaves the model in eval mode, so re-enable training mode each epoch.
    model.train()
    total = 0  # running sum of per-batch losses for this epoch
    for ids, mask, lab in train_loader:
        ids, mask, lab = ids.to(device), mask.to(device), lab.to(device)
        optimizer.zero_grad()
        loss = model(ids, mask, lab)
        loss.backward()
        # Clip the global gradient norm to 2.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)
        optimizer.step()
        total += loss.item()
    scheduler.step()
    val_f1 = run_validation(model, val_loader, device, ID2LABEL, NUM_LABELS)
    if val_f1 > best_f1:
        best_f1 = val_f1
        # Snapshot on CPU so later updates to the live weights cannot alter the saved copy.
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
    print(f"Epoch {epoch+1}/{EPOCHS} Loss: {total/len(train_loader):.4f} Val F1: {val_f1:.4f} Best: {best_f1:.4f}")
    if epochs_no_improve >= PATIENCE:
        print(f"Early stopping (no improvement for {PATIENCE} epochs).")
        break
# If no epoch ever beat 0.0 the model keeps the weights from the last epoch that ran.
if best_state is not None:
    model.load_state_dict(best_state)
    print("Restored best checkpoint (by val F1).")

## 7. Save model and assets

In [None]:
# Save next to the Drive root when it is mounted, otherwise in the working directory.
# JOB_POSTER_NER_PATH2_SAVE_DIR overrides the destination.
_save_base = "/content/drive/MyDrive" if os.path.exists("/content/drive/MyDrive") else "."
SAVE_DIR = os.environ.get("JOB_POSTER_NER_PATH2_SAVE_DIR", os.path.join(_save_base, "job_poster_ner_path2"))
os.makedirs(SAVE_DIR, exist_ok=True)

torch.save(model.state_dict(), os.path.join(SAVE_DIR, "bilstm_crf_state.pt"))
w2v.save(os.path.join(SAVE_DIR, "word2vec.model"))
# The config must carry everything needed to rebuild the network around the state dict.
# hidden_dim fixes the LSTM/linear shapes, and the special tokens tell a loader which
# word2id entries are padding and the out-of-vocabulary fallback. The original keys are
# unchanged, so existing readers of ner_config.json keep working.
config = {
    "tags": TAGS,
    "word2id": word2id,
    "embed_dim": EMBED_DIM,
    "hidden_dim": HIDDEN_DIM,
    "num_labels": NUM_LABELS,
    "max_len": MAX_LEN,
    "pad_token": PAD_TOKEN,
    "unk_token": UNK_TOKEN,
}
with open(os.path.join(SAVE_DIR, "ner_config.json"), "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)

print("Saved:", SAVE_DIR)
print("  - bilstm_crf_state.pt")
print("  - word2vec.model")
print("  - ner_config.json")

## 8. Inference on job poster text

In [None]:
def parse_job_poster_path2(text, word2id, model, device, id2label, max_len=512):
    """Tag *text* with the Path 2 model and group the tags into entities.

    The text is split on whitespace and cut to ``max_len`` tokens. Words missing
    from ``word2id`` use the ``UNK_TOKEN`` id. The model is put in eval mode.

    Returns:
        ``(words, pred_tags, entities)`` where ``entities`` maps an entity type to
        its de-duplicated phrases in order of first appearance. A phrase is a
        ``B-X`` token plus the ``I-X`` tokens directly after it, joined by spaces
        and stripped of surrounding punctuation. ``([], [], {})`` for blank text.
    """
    words = re.findall(r"\S+", text)[:max_len]
    if not words:
        return [], [], {}

    fallback_id = word2id[UNK_TOKEN]
    encoded = [word2id.get(tok, fallback_id) for tok in words]
    ids = torch.tensor([encoded], dtype=torch.long).to(device)
    mask = torch.ones_like(ids, dtype=torch.long).to(device)
    model.eval()
    with torch.no_grad():
        best_path = model(ids, mask)[0]
    pred_tags = [id2label.get(tag_id, "O") for tag_id in best_path]

    grouped = {}

    def _flush(kind, parts):
        """Clean one finished phrase and record it unless nothing is left."""
        cleaned = " ".join(parts).strip().rstrip(".,;:!?)]}\"'").lstrip("([{\"'").strip()
        if cleaned:
            grouped.setdefault(kind, []).append(cleaned)

    # Words without a predicted tag are treated as "O".
    aligned = pred_tags[: len(words)] + ["O"] * max(0, len(words) - len(pred_tags))
    open_kind, open_parts = None, []
    for word, tag in zip(words, aligned):
        if open_kind is not None and tag == f"I-{open_kind}":
            open_parts.append(word)
            continue
        if open_kind is not None:
            _flush(open_kind, open_parts)
        if tag.startswith("B-"):
            open_kind, open_parts = tag[2:], [word]
        else:
            # "O", or an I- tag that does not continue an open phrase, starts nothing.
            open_kind, open_parts = None, []
    if open_kind is not None:
        _flush(open_kind, open_parts)

    entities = {kind: list(dict.fromkeys(found)) for kind, found in grouped.items()}
    return words, pred_tags, entities

# Rule-based salary matcher. Catches "$80k", "$120,000", "$1.5M", the ranges
# "$120k-150k" / "$120k - $150k" and "80k-120k", and the word "Competitive".
# The previous pattern had doubled backslashes inside a raw string (r'\\$', r'\\d'),
# which the regex engine reads as "literal backslash, then end of string" and
# "literal backslash, then d", so only "Competitive" could ever match.
_SALARY_NUMBER = r"\d+(?:,\d{3})*(?:\.\d+)?"
# Optional k/M suffix; the trailing \b stops "$100 maximum" from matching as "$100 m".
_SALARY_AMOUNT = _SALARY_NUMBER + r"(?:\s*[km]\b)?"
SALARY_RE = re.compile(
    rf"\${_SALARY_AMOUNT}(?:\s*-\s*\$?{_SALARY_AMOUNT})?"
    rf"|\b{_SALARY_NUMBER}\s*k\s*-\s*{_SALARY_NUMBER}\s*k\b"
    r"|Competitive",
    re.IGNORECASE,
)

def parse_job_poster_path2_hybrid(text, word2id, model, device, id2label, max_len=512):
    """Model-based extraction with a rule-based override for SALARY.

    Runs ``parse_job_poster_path2`` and then scans the raw *text* with
    ``SALARY_RE``. When the pattern matches, the model's ``"SALARY"`` entry is
    replaced by the unique, whitespace-stripped matches in order of appearance.
    Otherwise the model output is returned unchanged.
    """
    words, pred_tags, entities = parse_job_poster_path2(text, word2id, model, device, id2label, max_len)
    rule_hits = {}
    for match in SALARY_RE.finditer(text):
        rule_hits.setdefault(match.group(0).strip(), None)
    if rule_hits:
        entities["SALARY"] = list(rule_hits)
    return words, pred_tags, entities

In [None]:
# Smoke test: run the hybrid parser on one hand-written posting and show the result.
JOB_POSTER_TEXT = """
Senior Data Scientist at Acme Corp. Remote. $120k-150k. Skills: Python, ML. 5+ years experience. PhD preferred. Full-time.
"""
_demo_input = JOB_POSTER_TEXT.strip()
words, tags, entities = parse_job_poster_path2_hybrid(_demo_input, word2id, model, device, ID2LABEL)
print("Entities (hybrid):")
for entity_type, phrases in entities.items():
    print(f"  {entity_type}: {phrases}")
_tag_preview = list(zip(words[:25], tags[:25]))
print("\nWord-level tags (first 25):", _tag_preview)

## 9. Evaluation (validation and test)

In [None]:
from seqeval.metrics import classification_report, f1_score

def _collect_tag_sequences(loader):
    """Decode every batch of *loader* and return ``(gold, predicted)`` tag sequences.

    Both results are lists of per-sentence tag-string lists, ready for seqeval.
    Each sentence is read up to its first masked position; positions labelled
    ``-100`` are skipped. A predicted id that is missing or not below
    ``NUM_LABELS`` is reported as ``"O"``. Uses the notebook-level ``model``
    and ``device``.
    """
    gold_seqs, pred_seqs = [], []
    model.eval()
    with torch.no_grad():
        for ids, mask, labels in loader:
            decoded = model(ids.to(device), mask.to(device))
            # Plain Python lists avoid a tensor round-trip per token.
            for mask_row, label_row, pred_row in zip(mask.tolist(), labels.tolist(), decoded):
                gold, guess = [], []
                for pos, (keep, lab) in enumerate(zip(mask_row, label_row)):
                    if keep == 0:
                        break
                    if lab == -100:
                        continue
                    gold.append(ID2LABEL[lab])
                    in_range = pos < len(pred_row) and pred_row[pos] < NUM_LABELS
                    guess.append(ID2LABEL[pred_row[pos]] if in_range else "O")
                if gold and guess:
                    gold_seqs.append(gold)
                    pred_seqs.append(guess)
    return gold_seqs, pred_seqs


# Validation and test previously used two copies of the same loop; both now go
# through the helper so they cannot drift apart.
true_all, pred_all = _collect_tag_sequences(val_loader)

print("--- Validation ---")
print(classification_report(true_all, pred_all, zero_division=0))
print("Val F1:", f1_score(true_all, pred_all, zero_division=0))

test_ds = WordNERDataset(test_sents, test_labels, word2id, MAX_LEN)
test_loader = DataLoader(test_ds, batch_size=12, collate_fn=collate_pad)
true_test, pred_test = _collect_tag_sequences(test_loader)

print("--- Test ---")
print(classification_report(true_test, pred_test, zero_division=0))
print("Test F1:", f1_score(true_test, pred_test, zero_division=0))