## Installing requirements

In [1]:
pip install datasets transformers seqeval ipywidgets torch

Note: you may need to restart the kernel to use updated packages.


## Loading the dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset("conll2012_ontonotesv5", "english_v12")

#Checking
print(dataset)

README.md:   0%|          | 0.00/22.9k [00:00<?, ?B/s]

conll2012_ontonotesv5.py:   0%|          | 0.00/32.0k [00:00<?, ?B/s]

The repository for conll2012_ontonotesv5 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2012_ontonotesv5.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y+
The repository for conll2012_ontonotesv5 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2012_ontonotesv5.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/194M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10539 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1370 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['document_id', 'sentences'],
        num_rows: 10539
    })
    validation: Dataset({
        features: ['document_id', 'sentences'],
        num_rows: 1370
    })
    test: Dataset({
        features: ['document_id', 'sentences'],
        num_rows: 1200
    })
})


In [5]:
print(dataset["train"].features)

{'document_id': Value(dtype='string', id=None), 'sentences': [{'part_id': Value(dtype='int32', id=None), 'words': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'pos_tags': Sequence(feature=ClassLabel(names=['XX', '``', '$', "''", '*', ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'VERB', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None), 'parse_tree': Value(dtype='string', id=None), 'predicate_lemmas': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'predicate_framenet_ids': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'word_senses': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'speaker': Value(dtype='string', id=None), 'named_entities': Sequence(feature=Class

## Processing dataset

In [30]:
#Extracting all unique NER labels
unique_labels = dataset["train"].features["sentences"][0]["named_entities"].feature.names

print(unique_labels)

['O', 'B-PERSON', 'I-PERSON', 'B-NORP', 'I-NORP', 'B-FAC', 'I-FAC', 'B-ORG', 'I-ORG', 'B-GPE', 'I-GPE', 'B-LOC', 'I-LOC', 'B-PRODUCT', 'I-PRODUCT', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-PERCENT', 'I-PERCENT', 'B-MONEY', 'I-MONEY', 'B-QUANTITY', 'I-QUANTITY', 'B-ORDINAL', 'I-ORDINAL', 'B-CARDINAL', 'I-CARDINAL', 'B-EVENT', 'I-EVENT', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'B-LAW', 'I-LAW', 'B-LANGUAGE', 'I-LANGUAGE']


In [31]:
#Mapping
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

## Converting data to tokens and labels

In [32]:
def extract_tokens_labels(dataset_split):
    all_tokens, all_labels = [], []
    for doc in dataset_split:
        for sent in doc["sentences"]:
            tokens = sent["words"]
            label_ids = sent["named_entities"]
            all_tokens.append(tokens)
            all_labels.append(label_ids)
    return all_tokens, all_labels

train_tokens, train_label_ids = extract_tokens_labels(dataset["train"])
val_tokens, val_label_ids = extract_tokens_labels(dataset["validation"])

## Preparing BERT tokenizer

In [33]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

## Tokenizing and aligning labels

In [35]:
def tokenize_and_align_labels(tokens_list, label_ids_list):
    tokenized_inputs = tokenizer(
        tokens_list,
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128,
    )

    all_labels = []
    for i, labels in enumerate(label_ids_list):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(labels[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        all_labels.append(label_ids)
    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

from datasets import Dataset

train_dataset = Dataset.from_dict({"tokens": train_tokens, "labels": train_label_ids})
val_dataset = Dataset.from_dict({"tokens": val_tokens, "labels": val_label_ids})

train_tokenized = train_dataset.map(lambda x: tokenize_and_align_labels(x["tokens"], x["labels"]), batched=True)
val_tokenized = val_dataset.map(lambda x: tokenize_and_align_labels(x["tokens"], x["labels"]), batched=True)

Map:   0%|          | 0/115812 [00:00<?, ? examples/s]

Map:   0%|          | 0/15680 [00:00<?, ? examples/s]

## Model setup

In [36]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training

In [38]:
training_args = TrainingArguments(
    output_dir="./ner-bert",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
)

from seqeval.metrics import classification_report, f1_score

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(-1)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return {
        "f1": f1_score(true_labels, true_predictions),
        "report": classification_report(true_labels, true_predictions),
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Report
1,0.069,0.07414,0.862763,precision recall f1-score support  CARDINAL 0.77 0.88 0.82 1719  DATE 0.83 0.89 0.86 3197  EVENT 0.64 0.56 0.60 179  FAC 0.49 0.56 0.52 133  GPE 0.91 0.93 0.92 3618  LANGUAGE 0.87 0.74 0.80 35  LAW 0.49 0.70 0.58 64  LOC 0.68 0.77 0.72 316  MONEY 0.88 0.89 0.89 834  NORP 0.85 0.91 0.88 1277  ORDINAL 0.72 0.89 0.80 335  ORG 0.84 0.91 0.88 3787  PERCENT 0.90 0.90 0.90 656  PERSON 0.91 0.95 0.93 3144  PRODUCT 0.50 0.42 0.46 214  QUANTITY 0.72 0.69 0.70 190  TIME 0.68 0.78 0.72 361  WORK_OF_ART 0.40 0.52 0.46 202  micro avg 0.84 0.89 0.86 20261  macro avg 0.73 0.77 0.75 20261 weighted avg 0.84 0.89 0.86 20261
2,0.0395,0.071645,0.874543,precision recall f1-score support  CARDINAL 0.80 0.88 0.84 1719  DATE 0.85 0.89 0.87 3197  EVENT 0.67 0.54 0.60 179  FAC 0.45 0.63 0.53 133  GPE 0.92 0.94 0.93 3618  LANGUAGE 0.74 0.74 0.74 35  LAW 0.50 0.72 0.59 64  LOC 0.73 0.76 0.74 316  MONEY 0.89 0.91 0.90 834  NORP 0.87 0.90 0.88 1277  ORDINAL 0.75 0.83 0.79 335  ORG 0.87 0.90 0.89 3787  PERCENT 0.90 0.89 0.90 656  PERSON 0.91 0.96 0.93 3144  PRODUCT 0.55 0.46 0.50 214  QUANTITY 0.76 0.71 0.73 190  TIME 0.71 0.81 0.76 361  WORK_OF_ART 0.50 0.57 0.54 202  micro avg 0.86 0.89 0.87 20261  macro avg 0.74 0.78 0.76 20261 weighted avg 0.86 0.89 0.87 20261




TrainOutput(global_step=28954, training_loss=0.06552902048296334, metrics={'train_runtime': 5416.4395, 'train_samples_per_second': 42.763, 'train_steps_per_second': 5.346, 'total_flos': 1.5135437830404096e+16, 'train_loss': 0.06552902048296334, 'epoch': 2.0})

## Evaluating trainer

In [42]:
trainer.evaluate()

{'eval_loss': 0.07164479792118073,
 'eval_f1': 0.8745434756318781,
 'eval_report': '              precision    recall  f1-score   support\n\n    CARDINAL       0.80      0.88      0.84      1719\n        DATE       0.85      0.89      0.87      3197\n       EVENT       0.67      0.54      0.60       179\n         FAC       0.45      0.63      0.53       133\n         GPE       0.92      0.94      0.93      3618\n    LANGUAGE       0.74      0.74      0.74        35\n         LAW       0.50      0.72      0.59        64\n         LOC       0.73      0.76      0.74       316\n       MONEY       0.89      0.91      0.90       834\n        NORP       0.87      0.90      0.88      1277\n     ORDINAL       0.75      0.83      0.79       335\n         ORG       0.87      0.90      0.89      3787\n     PERCENT       0.90      0.89      0.90       656\n      PERSON       0.91      0.96      0.93      3144\n     PRODUCT       0.55      0.46      0.50       214\n    QUANTITY       0.76      0.71 

## Processing test dataset

In [43]:
#Parsing test file
test_sentences = []
test_labels = []
tokens = []
labels = []
with open("NER-test.tsv", "r", encoding="utf-8") as f:
    skip_header = True
    for line in f:
        if skip_header:
            skip_header = False
            continue
        line = line.strip()
        if not line:
            if tokens:
                test_sentences.append(tokens)
                test_labels.append(labels)
                tokens = []
                labels = []
        else:
            parts = line.split('\t')
            tokens.append(parts[1])
            labels.append(parts[2])
    if tokens:
        test_sentences.append(tokens)
        test_labels.append(labels)

In [44]:
#Normalizing the 'B-LOCATION'/'I-LOCATION' labels

def normalize_location_label(label):
    if label == "B-LOCATION":
        return "B-LOC"
    elif label == "I-LOCATION":
        return "I-LOC"
    else:
        return label

test_labels_normalized = [
    [normalize_location_label(tag) for tag in sent_labels] for sent_labels in test_labels
]

In [45]:
#Mapping
test_label_ids = [
    [label2id.get(label, label2id['O']) for label in sent_labels]
    for sent_labels in test_labels_normalized
]

## Tokenizing and aligning test set

In [46]:
test_dataset = Dataset.from_dict({"tokens": test_sentences, "labels": test_label_ids})

def tokenize_and_align_labels(tokens_list, label_ids_list):
    tokenized_inputs = tokenizer(
        tokens_list,
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128,
    )
    all_labels = []
    for i, labels in enumerate(label_ids_list):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(labels[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        all_labels.append(label_ids)
    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

test_tokenized = test_dataset.map(lambda x: tokenize_and_align_labels(x["tokens"], x["labels"]), batched=True)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

## Predicting on test set

In [50]:
test_tokenized_for_pred = test_tokenized.remove_columns('labels')
test_results = trainer.predict(test_tokenized_for_pred)

## Converting IDs back to strings for reporting

In [None]:
all_pred_labels = [
    [id2label[p] for (p, l) in zip(pred_seq, label_seq) if l != -100]
    for pred_seq, label_seq in zip(preds, test_tokenized["labels"])
]
all_true_labels = [
    [id2label[l] for (p, l) in zip(pred_seq, label_seq) if l != -100]
    for pred_seq, label_seq in zip(preds, test_tokenized["labels"])
]

## Evaluation metrics of test results

In [None]:
from seqeval.metrics import classification_report
print(classification_report(all_true_labels, all_pred_labels))

## Bar graph of results

In [None]:
#Getting per-class scores using seqeval

from seqeval.metrics import classification_report, performance_report
import pandas as pd

#Returns a report as a dictionary
from seqeval.metrics import classification_report

report_dict = classification_report(all_true_labels, all_pred_labels, output_dict=True)

In [None]:
#Creating dataframe for table

labels = [label for label in report_dict if label not in ['micro avg', 'macro avg', 'weighted avg', 'accuracy']]

df_report = pd.DataFrame({
    "Label": labels,
    "Precision": [report_dict[l]["precision"] for l in labels],
    "Recall":    [report_dict[l]["recall"]    for l in labels],
    "F1-score":  [report_dict[l]["f1-score"]  for l in labels],
    "Support":   [report_dict[l]["support"]   for l in labels]
})

df_report = df_report.sort_values("F1-score", ascending=False)

#Displaying
from IPython.display import display
display(df_report)

In [40]:
#Bar Graph of Per-Class F1 Scores

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.bar(df_report["Label"], df_report["F1-score"])
plt.xlabel("Entity Label")
plt.ylabel("F1 Score")
plt.title("Per-class F1 Scores (NER)")
plt.xticks(rotation=45, ha="right")
plt.ylim(0, 1.0)
plt.tight_layout()
plt.show()

## Confusion matrix

In [41]:
#Flattening predictions
flat_true = []
flat_pred = []

for true_seq, pred_seq in zip(all_true_labels, all_pred_labels):
    flat_true.extend(true_seq)
    flat_pred.extend(pred_seq)

NameError: name 'all_true_labels' is not defined

In [None]:
#Computing
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Optional: If you want to exclude "O" tags, filter here
labels_to_include = sorted(list(set(flat_true + flat_pred)))  # All seen labels

cm = confusion_matrix(flat_true, flat_pred, labels=labels_to_include)

In [None]:
#Plotting
fig, ax = plt.subplots(figsize=(12, 10))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels_to_include)
disp.plot(include_values=True, cmap="Blues", ax=ax, xticks_rotation=45)
plt.title("NER Confusion Matrix")
plt.tight_layout()
plt.show()