In [1]:
import json

from google.colab import drive
drive.mount('/content/drive')


file_path = '/content/drive/My Drive/DATASETS/entity_recognition_in_resumes.json'

data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:  # skip empty lines
            data.append(json.loads(line))

print(len(data))  # number of records
print(data[0])    # first JSON object

Mounted at /content/drive
220
{'content': "Abhishek Jha\nApplication Development Associate - Accenture\n\nBengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a\n\n• To work for an organization which provides me the opportunity to improve my skills\nand knowledge for my individual and company's growth in best possible ways.\n\nWilling to relocate to: Bangalore, Karnataka\n\nWORK EXPERIENCE\n\nApplication Development Associate\n\nAccenture -\n\nNovember 2017 to Present\n\nRole: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries\nfor the Bot which will be triggered based on given input. Also, Training the bot for different possible\nutterances (Both positive and negative), which will be given as\ninput by the user.\n\nEDUCATION\n\nB.E in Information science and engineering\n\nB.v.b college of engineering and technology -  Hubli, Karnataka\n\nAugust 2013 to June 2017\n\n12th in Mathematics\n\nWoodbine modern school\n\nApril 2011 to M

In [2]:
!mkdir -p /content/drive/MyDrive/FYP/data/processed


In [3]:
count_annotated = sum(1 for item in data if item.get('annotation'))
print(f"Annotated resumes: {count_annotated}/{len(data)}")


Annotated resumes: 220/220


In [4]:
all_labels = set()
for item in data:
    for ann in item.get('annotation', []):
        for label in ann['label']:
            all_labels.add(label)
print("Labels found:", all_labels)


Labels found: {'Name', 'Companies worked at', 'Designation', 'Skills', 'UNKNOWN', 'College Name', 'Graduation Year', 'Degree', 'Email Address', 'Years of Experience', 'Location'}


In [5]:
sample = data[0]
print(sample['content'][:500])  # first 500 characters of resume
print(sample['annotation'])


Abhishek Jha
Application Development Associate - Accenture

Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a

• To work for an organization which provides me the opportunity to improve my skills
and knowledge for my individual and company's growth in best possible ways.

Willing to relocate to: Bangalore, Karnataka

WORK EXPERIENCE

Application Development Associate

Accenture -

November 2017 to Present

Role: Currently working on Chat-bot. Developing Backen
[{'label': ['Skills'], 'points': [{'start': 1295, 'end': 1621, 'text': '\n• Programming language: C, C++, Java\n• Oracle PeopleSoft\n• Internet Of Things\n• Machine Learning\n• Database Management System\n• Computer Networks\n• Operating System worked on: Linux, Windows, Mac\n\nNon - Technical Skills\n\n• Honest and Hard-Working\n• Tolerant and Flexible to Different Situations\n• Polite and Calm\n• Team-Player'}]}, {'label': ['Skills'], 'points': [{'start': 993, 'end': 1153, 'text': 'C (Less th

In [6]:
label_mapping = {
    "Name": "NAME",
    "Email Address": "EMAIL",
    "Skills": "SKILL",
    "Designation": "OCCUPATION",
    "Degree": "EDUCATION",
    "College Name": "EDUCATION",
    "Graduation Year": "EDUCATION",
    "Companies worked at": "EXPERIENCE",
    "Years of Experience": "EXPERIENCE",
    "Location": "O",
    "UNKNOWN": "O"
}

for resume in data:
    for ann in resume["annotation"]:
        old_labels = ann["label"]
        ann["label"] = [label_mapping.get(l, "O") for l in old_labels]


In [7]:
import re

def tokenize_with_positions(text):
    """
    Tokenizes text into words and keeps start/end character positions
    """
    tokens = []
    for match in re.finditer(r'\S+', text):  # \S+ matches non-whitespace sequences
        token_text = match.group()
        start = match.start()
        end = match.end()
        tokens.append((token_text, start, end))
    return tokens


In [8]:
sample_text = data[0]['content'][:100]  # first 100 characters of resume
tokens = tokenize_with_positions(sample_text)
for t in tokens:
    print(t)


('Abhishek', 0, 8)
('Jha', 9, 12)
('Application', 13, 24)
('Development', 25, 36)
('Associate', 37, 46)
('-', 47, 48)
('Accenture', 49, 58)
('Bengaluru,', 60, 70)
('Karnataka', 71, 80)
('-', 81, 82)
('Email', 83, 88)
('me', 89, 91)
('on', 92, 94)
('Indee', 95, 100)


In [9]:
def create_bio_tags(tokens, annotations):
    """
    tokens: list of (token_text, start, end)
    annotations: list of annotations with 'start', 'end', 'label'
    """
    bio_labels = ["O"] * len(tokens)

    for ann in annotations:
        entity_label = ann['label'][0]  # already normalized
        for point in ann['points']:
            span_start = point['start']
            span_end = point['end']
            # check token overlap
            first_token = True
            for i, (tok, tok_start, tok_end) in enumerate(tokens):
                if tok_end <= span_start:
                    continue
                if tok_start >= span_end:
                    break
                # token overlaps
                if first_token:
                    bio_labels[i] = f"B-{entity_label}"
                    first_token = False
                else:
                    bio_labels[i] = f"I-{entity_label}"
    return bio_labels


In [10]:
sample_resume = data[0]
tokens = tokenize_with_positions(sample_resume['content'])
bio_labels = create_bio_tags(tokens, sample_resume['annotation'])

# print first 20 tokens with labels
for (tok, start, end), label in zip(tokens[:20], bio_labels[:20]):
    print(f"{tok} -> {label}")


Abhishek -> B-NAME
Jha -> I-NAME
Application -> B-OCCUPATION
Development -> I-OCCUPATION
Associate -> I-OCCUPATION
- -> O
Accenture -> B-EXPERIENCE
Bengaluru, -> B-O
Karnataka -> O
- -> O
Email -> O
me -> O
on -> O
Indeed: -> B-EMAIL
indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a -> I-EMAIL
• -> O
To -> O
work -> O
for -> O
an -> O


In [11]:
def create_bio_tags(tokens, annotations):
    """
    tokens: list of (token_text, start, end)
    annotations: list of annotations with 'start', 'end', 'label'
    """
    bio_labels = ["O"] * len(tokens)

    for ann in annotations:
        if not ann['label']:
            continue  # skip empty label lists
        entity_label = ann['label'][0]  # already normalized
        if entity_label == "O":
            continue  # skip non-entity annotations
        for point in ann['points']:
            span_start = point['start']
            span_end = point['end']
            # check token overlap
            first_token = True
            for i, (tok, tok_start, tok_end) in enumerate(tokens):
                if tok_end <= span_start:
                    continue
                if tok_start >= span_end:
                    break
                # token overlaps
                if first_token:
                    bio_labels[i] = f"B-{entity_label}"
                    first_token = False
                else:
                    bio_labels[i] = f"I-{entity_label}"
    return bio_labels


# save_to_file(train_data, '/content/drive/My Drive/DATASETS/train.txt')
# save_to_file(val_data, '/content/drive/My Drive/DATASETS/valid.txt')
# save_to_file(test_data, '/content/drive/My Drive/DATASETS/test.txt')



In [12]:
tags = [
    "O",
    "B-NAME", "I-NAME",
    "B-EMAIL", "I-EMAIL",
    "B-SKILL", "I-SKILL",
    "B-OCCUPATION", "I-OCCUPATION",
    "B-EXPERIENCE", "I-EXPERIENCE",
    "B-EDUCATION", "I-EDUCATION"
]

tag2idx = {tag: idx for idx, tag in enumerate(tags)}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

print(tag2idx)


{'O': 0, 'B-NAME': 1, 'I-NAME': 2, 'B-EMAIL': 3, 'I-EMAIL': 4, 'B-SKILL': 5, 'I-SKILL': 6, 'B-OCCUPATION': 7, 'I-OCCUPATION': 8, 'B-EXPERIENCE': 9, 'I-EXPERIENCE': 10, 'B-EDUCATION': 11, 'I-EDUCATION': 12}


In [13]:
def load_bio_file(file_path):
    sentences = []
    labels = []

    with open(file_path, 'r', encoding='utf-8') as f:
        tokens = []
        token_labels = []
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append(tokens)
                    labels.append(token_labels)
                    tokens = []
                    token_labels = []
            else:
                splits = line.split()
                tokens.append(splits[0])
                token_labels.append(splits[1])
        if tokens:  # last sentence
            sentences.append(tokens)
            labels.append(token_labels)
    return sentences, labels

train_sents, train_labels = load_bio_file('/content/drive/My Drive/DATASETS/train.txt')
val_sents, val_labels = load_bio_file('/content/drive/My Drive/DATASETS/valid.txt')
test_sents, test_labels = load_bio_file('/content/drive/My Drive/DATASETS/test.txt')

print(f"Train samples: {len(train_sents)}")
print(f"First sentence tokens: {train_sents[0]}")
print(f"First sentence labels: {train_labels[0]}")


Train samples: 154
First sentence tokens: ['PRASHANTH', 'BADALA', 'Devops', 'Engineer', ',Cloud', 'Engineer', '-Oracle', 'Hyderabad,', 'Telangana', '-', 'Email', 'me', 'on', 'Indeed:', 'indeed.com/r/PRASHANTH-BADALA/', 'bf4c4b7253a8ece7', '•', 'Hands', 'on', 'experience', 'in', 'end-end', 'process', 'of', 'Build', 'Management,', 'Release', 'Management', 'and', 'Configuration', 'Management.', '•', 'Hands', 'on', 'experience', 'on', 'supporting', 'configuration', 'management', 'tools', 'on', 'both', 'physical', 'and', 'cloud', 'environment', '•', 'Involved', 'in', 'setting', 'up', 'Jenkins', 'in', 'Distributed', 'Environments', 'with', 'Master', 'and', 'Slave', '•', 'Working', 'experience', 'on', 'Subversion', '(SVN)', 'administration', 'and', 'basic', 'usage', '•', 'Creating', 'Branches,', 'tags', 'and', 'providing', 'SVN', 'user', 'access', 'to', 'all', 'developers', 'in', 'the', 'organization', '•', 'Managing', 'application', 'server', 'instances', 'running', 'on', 'AWS', '•', 'Involv

In [14]:
from collections import Counter

# 1️⃣ Build word vocabulary
all_words = [word for sent in train_sents for word in sent]
word_counts = Counter(all_words)

# Add special tokens
word2idx = {"<PAD>": 0, "<UNK>": 1}
for idx, word in enumerate(word_counts.keys(), start=2):
    word2idx[word] = idx

# 2️⃣ Build label vocabulary
unique_labels = set(label for sent_labels in train_labels for label in sent_labels)
label2idx = {label: idx for idx, label in enumerate(unique_labels)}
idx2label = {idx: label for label, idx in label2idx.items()}

# 3️⃣ Encode sentences and labels
def encode_sentences(sentences, labels, word2idx, label2idx, max_len=None):
    encoded_sents = []
    encoded_labels = []

    for sent, lab in zip(sentences, labels):
        # Encode words
        enc_sent = [word2idx.get(word, word2idx["<UNK>"]) for word in sent]
        # Encode labels
        enc_lab = [label2idx[label] for label in lab]

        encoded_sents.append(enc_sent)
        encoded_labels.append(enc_lab)

    # Optional padding
    if max_len is None:
        max_len = max(len(s) for s in encoded_sents)

    for i in range(len(encoded_sents)):
        # Pad sentences
        pad_len = max_len - len(encoded_sents[i])
        encoded_sents[i] = encoded_sents[i] + [word2idx["<PAD>"]] * pad_len
        # Pad labels
        encoded_labels[i] = encoded_labels[i] + [-100] * pad_len  # -100 for PyTorch loss ignore_index

    return encoded_sents, encoded_labels

# Encode train, val, test
max_len = max(max(len(s) for s in train_sents),
              max(len(s) for s in val_sents),
              max(len(s) for s in test_sents))

train_enc_sents, train_enc_labels = encode_sentences(train_sents, train_labels, word2idx, label2idx, max_len)
val_enc_sents, val_enc_labels = encode_sentences(val_sents, val_labels, word2idx, label2idx, max_len)
test_enc_sents, test_enc_labels = encode_sentences(test_sents, test_labels, word2idx, label2idx, max_len)

print(f"Vocabulary size: {len(word2idx)}")
print(f"Number of labels: {len(label2idx)}")
print(f"Example encoded sentence: {train_enc_sents[0]}")
print(f"Example encoded labels: {train_enc_labels[0]}")


Vocabulary size: 14879
Number of labels: 13
Example encoded sentence: [2, 3, 4, 5, 6, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 13, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 17, 18, 13, 19, 13, 31, 32, 33, 34, 13, 35, 36, 28, 37, 38, 17, 39, 20, 40, 41, 42, 20, 43, 44, 45, 46, 28, 47, 17, 48, 19, 13, 49, 50, 51, 28, 52, 53, 17, 54, 55, 56, 28, 57, 58, 59, 60, 61, 62, 63, 20, 64, 65, 17, 66, 67, 68, 69, 70, 13, 71, 17, 39, 20, 72, 73, 69, 74, 45, 75, 76, 41, 77, 17, 39, 20, 32, 27, 78, 79, 28, 80, 64, 81, 82, 83, 17, 84, 85, 13, 86, 87, 88, 89, 90, 28, 91, 17, 39, 20, 92, 64, 93, 78, 94, 28, 95, 64, 67, 61, 96, 97, 17, 39, 20, 98, 99, 45, 100, 101, 102, 103, 23, 104, 28, 70, 105, 106, 17, 39, 20, 107, 108, 109, 110, 28, 111, 22, 112, 17, 113, 42, 114, 115, 116, 117, 118, 119, 120, 121, 122, 17, 123, 20, 40, 41, 124, 125, 126, 28, 127, 119, 128, 23, 64, 120, 121, 13, 58, 68, 17, 129, 64, 130, 20, 58, 131, 132, 17, 133, 64, 134, 135, 136, 137, 78, 138, 17, 139, 64, 140, 141, 1

In [15]:
import torch
from torch.utils.data import Dataset, DataLoader

class NERDataset(Dataset):
    def __init__(self, enc_sents, enc_labels):
        self.enc_sents = enc_sents
        self.enc_labels = enc_labels

    def __len__(self):
        return len(self.enc_sents)

    def __getitem__(self, idx):
        return torch.tensor(self.enc_sents[idx], dtype=torch.long), \
               torch.tensor(self.enc_labels[idx], dtype=torch.long)

# Create datasets
train_dataset = NERDataset(train_enc_sents, train_enc_labels)
val_dataset   = NERDataset(val_enc_sents, val_enc_labels)
test_dataset  = NERDataset(test_enc_sents, test_enc_labels)

# Create DataLoaders
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Test one batch
for x, y in train_loader:
    print("Input batch shape:", x.shape)
    print("Label batch shape:", y.shape)
    break


Input batch shape: torch.Size([32, 2953])
Label batch shape: torch.Size([32, 2953])


In [16]:
!pip install pytorch-crf


Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl.metadata (2.4 kB)
Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [17]:
import torch
import torch.nn as nn
from torchcrf import CRF  # pip install pytorch-crf

class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim=128, hidden_dim=256):
        super(BiLSTM_CRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1,
                            bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, x, tags=None, mask=None):
        # x: [batch_size, seq_len]
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        emissions = self.hidden2tag(lstm_out)  # [batch_size, seq_len, tagset_size]

        if tags is not None:
            # Training mode, compute negative log likelihood
            loss = -self.crf(emissions, tags, mask=mask, reduction='mean')
            return loss
        else:
            # Prediction mode
            pred_tags = self.crf.decode(emissions, mask=mask)
            return pred_tags


In [18]:
import torch
import torch.nn as nn
from torchcrf import CRF

class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_labels):
        super(BiLSTM_CRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )
        self.hidden2tag = nn.Linear(hidden_dim * 2, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, x, labels=None):
        embeddings = self.embedding(x)                   # [batch, seq_len, embed_dim]
        lstm_out, _ = self.lstm(embeddings)             # [batch, seq_len, hidden*2]
        emissions = self.hidden2tag(lstm_out)           # [batch, seq_len, num_labels]

        if labels is not None:
            # Compute negative log-likelihood loss
            loss = -self.crf(emissions, labels, mask=(x != 0))
            return loss
        else:
            # Decode labels
            prediction = self.crf.decode(emissions, mask=(x != 0))
            return prediction

# Example setup
vocab_size = 14879
embedding_dim = 128
hidden_dim = 256
num_labels = 13

model = BiLSTM_CRF(vocab_size, embedding_dim, hidden_dim, num_labels)

# Example batch
inputs = torch.randint(0, vocab_size, (32, 2953))   # random input batch
labels = torch.randint(0, num_labels, (32, 2953))   # random label batch

# Forward pass with labels (training)
loss = model(inputs, labels)
print("Loss:", loss.item())

# Forward pass without labels (inference)
preds = model(inputs)
print("Predictions length:", len(preds), "First sequence length:", len(preds[0]))


Loss: 242688.734375
Predictions length: 32 First sequence length: 2953


In [19]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [20]:
model.train()
optimizer.zero_grad()
loss = model(inputs, labels)  # forward pass
loss.backward()               # backprop
optimizer.step()              # update weights
print("Training Loss:", loss.item())


Training Loss: 242688.734375


In [21]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Assuming inputs and labels are torch tensors of shape [num_samples, seq_len]
# Example: inputs = torch.tensor(all_input_sequences)
#          labels = torch.tensor(all_label_sequences)

dataset = TensorDataset(inputs, labels)
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

num_epochs = 5

for epoch in range(num_epochs):
    for batch_inputs, batch_labels in dataloader:
        optimizer.zero_grad()
        loss = model(batch_inputs, batch_labels)  # forward pass
        loss.backward()                            # backprop
        optimizer.step()                           # update weights
    print(f"Epoch {epoch+1} Loss: {loss.item()}")


Epoch 1 Loss: 242402.78125
Epoch 2 Loss: 242147.28125
Epoch 3 Loss: 241912.625
Epoch 4 Loss: 241689.25
Epoch 5 Loss: 241469.453125


In [22]:
model.eval()

with torch.no_grad():
    sample_inputs = inputs[:1]   # 1 resume
    sample_labels = labels[:1]

    predictions = model(sample_inputs)  # CRF decode


In [23]:
id2label = {v: k for k, v in label2idx.items()}
label2id = {v: k for k, v in id2label.items()}


In [24]:
print(label2id['B-NAME'])   # 0
print(id2label[0])          # B-NAME

9
B-EDUCATION


In [25]:
pred_labels = [id2label[p] for p in predictions[0]]
true_labels = [id2label[l.item()] for l in sample_labels[0]]
tokens = train_sents[0]


In [26]:
for t, true, pred in zip(tokens, true_labels, pred_labels):
    if true != "O" or pred != "O":
        print(f"{t:15}  TRUE={true:15}  PRED={pred}")


PRASHANTH        TRUE=B-EXPERIENCE     PRED=I-SKILL
BADALA           TRUE=I-EMAIL          PRED=I-EXPERIENCE
Devops           TRUE=I-EMAIL          PRED=I-EMAIL
Engineer         TRUE=B-NAME           PRED=B-EXPERIENCE
,Cloud           TRUE=I-EDUCATION      PRED=B-SKILL
Engineer         TRUE=B-OCCUPATION     PRED=B-OCCUPATION
-Oracle          TRUE=I-SKILL          PRED=B-OCCUPATION
Hyderabad,       TRUE=B-EXPERIENCE     PRED=B-OCCUPATION
Telangana        TRUE=B-EMAIL          PRED=B-EMAIL
-                TRUE=I-EXPERIENCE     PRED=I-OCCUPATION
Email            TRUE=B-SKILL          PRED=I-OCCUPATION
me               TRUE=I-NAME           PRED=B-NAME
on               TRUE=B-NAME           PRED=B-NAME
Indeed:          TRUE=O                PRED=B-OCCUPATION
indeed.com/r/PRASHANTH-BADALA/  TRUE=I-OCCUPATION     PRED=B-OCCUPATION
bf4c4b7253a8ece7  TRUE=I-SKILL          PRED=I-EDUCATION
•                TRUE=I-EDUCATION      PRED=I-EMAIL
Hands            TRUE=I-SKILL          PRED=B-EMAIL
o

In [27]:
from torch.utils.data import Dataset

class NERDataset(Dataset):
    def __init__(self, input_ids, labels, tokens):
        self.input_ids = input_ids
        self.labels = labels
        self.tokens = tokens

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return (
            self.input_ids[idx],
            self.labels[idx],
            self.tokens[idx]
        )


In [28]:
dataset = NERDataset(inputs, labels, train_sents)


In [29]:
model.eval()

with torch.no_grad():
    input_ids, gold_labels, tokens = dataset[0]
    preds = model(input_ids.unsqueeze(0))[0]

gold_labels = [id2label[l.item()] for l in gold_labels]
pred_labels = [id2label[p] for p in preds]

for t, g, p in zip(tokens, gold_labels, pred_labels):
    if g != "O" or p != "O":
        print(f"{t:15}  TRUE={g:15}  PRED={p}")


PRASHANTH        TRUE=B-EXPERIENCE     PRED=I-SKILL
BADALA           TRUE=I-EMAIL          PRED=I-EXPERIENCE
Devops           TRUE=I-EMAIL          PRED=I-EMAIL
Engineer         TRUE=B-NAME           PRED=B-EXPERIENCE
,Cloud           TRUE=I-EDUCATION      PRED=B-SKILL
Engineer         TRUE=B-OCCUPATION     PRED=B-OCCUPATION
-Oracle          TRUE=I-SKILL          PRED=B-OCCUPATION
Hyderabad,       TRUE=B-EXPERIENCE     PRED=B-OCCUPATION
Telangana        TRUE=B-EMAIL          PRED=B-EMAIL
-                TRUE=I-EXPERIENCE     PRED=I-OCCUPATION
Email            TRUE=B-SKILL          PRED=I-OCCUPATION
me               TRUE=I-NAME           PRED=B-NAME
on               TRUE=B-NAME           PRED=B-NAME
Indeed:          TRUE=O                PRED=B-OCCUPATION
indeed.com/r/PRASHANTH-BADALA/  TRUE=I-OCCUPATION     PRED=B-OCCUPATION
bf4c4b7253a8ece7  TRUE=I-SKILL          PRED=I-EDUCATION
•                TRUE=I-EDUCATION      PRED=I-EMAIL
Hands            TRUE=I-SKILL          PRED=B-EMAIL
o

In [30]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for i in range(len(val_dataset)):
        input_ids, gold_labels = val_dataset[i]
        preds = model(input_ids.unsqueeze(0))[0]

        for g, p in zip(gold_labels, preds):
            if g.item() != -100:   # ignore padding
                total += 1
                if g.item() == p:
                    correct += 1

print("Validation Token Accuracy:", correct / total)


Validation Token Accuracy: 0.09471864568431092


In [31]:
from torch.utils.data import random_split

dataset = TensorDataset(inputs, labels)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=1)


In [35]:
num_epochs = 25
PAD_ID = 0

model.train()

for epoch in range(num_epochs):
    total_loss = 0

    for batch_inputs, batch_labels in train_dataloader:
        optimizer.zero_grad()

        # Create mask (ignore padding)
        mask = batch_inputs != PAD_ID

        # CRF loss
        loss = model(batch_inputs, batch_labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} - Loss: {avg_loss:.4f}")


Epoch 1/25 - Loss: 26189.5013
Epoch 2/25 - Loss: 25938.4549
Epoch 3/25 - Loss: 25622.7545
Epoch 4/25 - Loss: 25295.7698
Epoch 5/25 - Loss: 24931.4556
Epoch 6/25 - Loss: 24414.2580
Epoch 7/25 - Loss: 23823.3218
Epoch 8/25 - Loss: 23157.0310
Epoch 9/25 - Loss: 22398.8281
Epoch 10/25 - Loss: 21569.5512
Epoch 11/25 - Loss: 20780.9940
Epoch 12/25 - Loss: 19944.6128
Epoch 13/25 - Loss: 19077.3052
Epoch 14/25 - Loss: 18245.4418
Epoch 15/25 - Loss: 17492.3311
Epoch 16/25 - Loss: 16634.3926
Epoch 17/25 - Loss: 15889.7784
Epoch 18/25 - Loss: 15216.3864
Epoch 19/25 - Loss: 14410.1200
Epoch 20/25 - Loss: 13700.1604
Epoch 21/25 - Loss: 12993.1389
Epoch 22/25 - Loss: 12274.0808
Epoch 23/25 - Loss: 11586.5511
Epoch 24/25 - Loss: 10887.8496
Epoch 25/25 - Loss: 10232.6458


In [36]:
torch.save(model.state_dict(), "bert_bilstm_crf_resume.pth")


In [37]:
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch_inputs, batch_labels in val_dataloader:
        preds = model(batch_inputs)  # CRF decode → List[List[int]]

        all_preds.extend(preds)
        all_labels.extend(batch_labels.tolist())


In [38]:
true_preds = []
true_labels = []

for preds, labels in zip(all_preds, all_labels):
    for p, l in zip(preds, labels):
        if l != 0:  # assuming label 0 = "O" or PAD (adjust if needed)
            true_preds.append(id2label[p])
            true_labels.append(id2label[l])


In [40]:
!pip install seqeval


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=8708b245385d809cb4b9b68c98445dcab52c667dfdee8be1a3f06dce6e9ae5f7
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [43]:
from seqeval.metrics import classification_report, f1_score

true_labels = []
true_preds = []

model.eval()
with torch.no_grad():
    for batch_inputs, batch_labels in val_dataloader:
        preds = model(batch_inputs)  # CRF decode

        for i in range(len(preds)):
            pred_seq = preds[i]
            gold_seq = batch_labels[i]

            sent_true = []
            sent_pred = []

            for g, p in zip(gold_seq, pred_seq):
                if g.item() != -100:   # REMOVE padding
                    sent_true.append(id2label[g.item()])
                    sent_pred.append(id2label[p])

            true_labels.append(sent_true)
            true_preds.append(sent_pred)

print(classification_report(true_labels, true_preds))
print("F1 Score:", f1_score(true_labels, true_preds))


              precision    recall  f1-score   support

   EDUCATION       0.13      0.13      0.13      2992
       EMAIL       0.12      0.09      0.11      2930
  EXPERIENCE       0.11      0.09      0.10      2904
        NAME       0.12      0.09      0.10      2902
  OCCUPATION       0.13      0.10      0.11      2922
       SKILL       0.12      0.11      0.11      2954

   micro avg       0.12      0.10      0.11     17604
   macro avg       0.12      0.10      0.11     17604
weighted avg       0.12      0.10      0.11     17604

F1 Score: 0.11128824575387823


In [44]:
!pip install transformers seqeval




In [45]:
from transformers import BertTokenizerFast, BertModel

# Load tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [49]:
all_texts = preprocessed_texts  # list of token lists


NameError: name 'preprocessed_texts' is not defined