🔹 Step 1: Install Required Libraries


In [2]:
pip install transformers datasets seqeval torch pandas scikit-learn






[notice] A new release of pip available: 22.3 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


🔹 
Step 2: Load and Process the ner_data.txt

In [3]:
from datasets import Dataset
import pandas as pd

# Read the ner_data.txt file
with open("ner_data.txt", "r") as f:
    lines = f.readlines()

sentences = []
labels = []
current_sentence = []
current_labels = []

for line in lines:
    if line.strip():  # If the line is not empty
        token, label = line.strip().split(" ")
        current_sentence.append(token)
        current_labels.append(label)
    else:  # Empty line means a new sentence
        if current_sentence:
            sentences.append(current_sentence)
            labels.append(current_labels)
            current_sentence = []
            current_labels = []

# Convert to Hugging Face dataset format
dataset = Dataset.from_dict({"tokens": sentences, "ner_tags": labels})
dataset = dataset.train_test_split(test_size=0.2)

train_dataset = dataset["train"]
test_dataset = dataset["test"]


  from .autonotebook import tqdm as notebook_tqdm


🔹 Step 3: Tokenizing with RoBERTa

In [5]:
from transformers import RobertaTokenizerFast

# Load the tokenizer
model_name = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=True)


# Define label mapping
label_list = ["O", "B-AGE", "I-AGE", "B-GEN", "I-GEN", "B-LOC", "I-LOC",
              "B-SKILL", "I-SKILL", "B-EDU", "I-EDU", "B-EXP", "I-EXP",
              "B-CERT", "I-CERT"]

label_map = {label: i for i, label in enumerate(label_list)}

# Tokenize the dataset
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding="max_length", max_length=128)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to words
        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore these tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label_map[label[word_idx]])  # Use B- or I- label
            else:
                label_ids.append(label_map[label[word_idx]])  # Continue I- label

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenization
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)


Map: 100%|██████████| 36/36 [00:00<00:00, 92.88 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 327.57 examples/s]


🔹 Step 4: Define the RoBERTa Model

In [6]:
from transformers import RobertaForTokenClassification

# Load the pre-trained RoBERTa model for token classification
model = RobertaForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label={i: label for i, label in enumerate(label_list)},
    label2id={label: i for i, label in enumerate(label_list)}
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔹 Step 5: Train the Model

In [7]:
from transformers import TrainingArguments, Trainer
import torch

# Define training arguments
training_args = TrainingArguments(
    output_dir="output/train_args/ner_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=False,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the trained model
trainer.save_model("output/model/ner_model")
tokenizer.save_pretrained("output/model/ner_model")


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.3953,2.046999
2,1.6408,1.628647
3,1.2629,1.418581
4,0.9937,1.255156
5,0.8598,1.182594


('output/model/ner_model\\tokenizer_config.json',
 'output/model/ner_model\\special_tokens_map.json',
 'output/model/ner_model\\vocab.json',
 'output/model/ner_model\\merges.txt',
 'output/model/ner_model\\added_tokens.json',
 'output/model/ner_model\\tokenizer.json')

🔹 Step 6: Load and Evaluate the Trained Model

In [None]:
import pandas as pd
import evaluate
from seqeval.metrics import classification_report

# Load seqeval metric
metric = evaluate.load("seqeval")

### STEP 1: Load True Labels from ner_data.txt ###
def load_true_labels(file_path):
    sentences, labels = [], []
    sentence, label_seq = [], []

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            if line.strip():
                word, tag = line.strip().split()
                sentence.append(word)
                label_seq.append(tag)
            else:
                if sentence:  # Avoid empty lists
                    sentences.append(sentence)
                    labels.append(label_seq)
                sentence, label_seq = [], []
    
    # Add the last sentence if the file doesn't end with a newline
    if sentence:
        sentences.append(sentence)
        labels.append(label_seq)

    return labels  # List of lists containing labels

true_labels = load_true_labels("ner_data.txt")


### STEP 2: Load Predicted Labels from predictions.csv ###
def load_predicted_labels(csv_file):
    df = pd.read_csv(csv_file)  # Load CSV
    predicted_labels = [row.split() for row in df["Prediction"].tolist()]
    return predicted_labels  # List of lists containing predictions

predicted_labels = load_predicted_labels("predictions.csv")


### STEP 3: Compute Evaluation Metrics ###
def evaluate_ner_model(true_labels, predicted_labels):
    """
    Evaluate the NER model using precision, recall, and F1-score.
    """
    results = metric.compute(predictions=predicted_labels, references=true_labels)

    print("Evaluation Metrics:")
    print(classification_report(true_labels, predicted_labels))

# Run evaluation
evaluate_ner_model(true_labels, predicted_labels)


🔹 Step 7: Actual Text Resume testing / Human Evaluation


In [62]:
from transformers import pipeline
import re

# Load the trained model
ner_pipeline = pipeline("ner", model="output/model/ner_model", tokenizer="output/model/ner_model")

# Test on a new sentence
text = """"
    25,Male,"P1, Ong Yiu, Butuan City",Computer Programming,College Graduate,Backend Web Developer,Web Dev NCIII

"""

# Run NER model on the text

print(ner_pipeline(text))
# # Define characters to remove
# remove_chars = r"[ĠĊ./()\-\s]"

# # Get words and their predicted labels
# extracted_entities = [
#     {"word": re.sub(remove_chars, '', entry['word']), "label": entry['entity']}
#     for entry in result
# ]

# # Print extracted entities with labels
# for entity in extracted_entities:
#     print(f"Word: {entity['word']}, Label: {entity['label']}")


Device set to use cpu


[{'entity': 'B-LOC', 'score': 0.25582153, 'index': 1, 'word': 'Ġ"', 'start': 0, 'end': 1}, {'entity': 'B-LOC', 'score': 0.27625707, 'index': 2, 'word': 'Ċ', 'start': 1, 'end': 2}, {'entity': 'B-LOC', 'score': 0.39380813, 'index': 3, 'word': 'Ġ', 'start': 3, 'end': 3}, {'entity': 'B-LOC', 'score': 0.30683574, 'index': 4, 'word': 'Ġ', 'start': 4, 'end': 4}, {'entity': 'B-LOC', 'score': 0.26208305, 'index': 5, 'word': 'Ġ', 'start': 5, 'end': 5}, {'entity': 'B-LOC', 'score': 0.26625317, 'index': 6, 'word': 'Ġ25', 'start': 6, 'end': 8}, {'entity': 'I-LOC', 'score': 0.16913983, 'index': 7, 'word': ',', 'start': 8, 'end': 9}, {'entity': 'B-LOC', 'score': 0.2467152, 'index': 8, 'word': 'Male', 'start': 9, 'end': 13}, {'entity': 'B-LOC', 'score': 0.3277546, 'index': 9, 'word': ',"', 'start': 13, 'end': 15}, {'entity': 'B-LOC', 'score': 0.5216246, 'index': 10, 'word': 'P', 'start': 15, 'end': 16}, {'entity': 'I-LOC', 'score': 0.45797294, 'index': 11, 'word': '1', 'start': 16, 'end': 17}, {'entit