## Imports and Setup

In [5]:
!pip install transformers datasets seqeval evaluate --quiet

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments, BertConfig
from datasets import Dataset
import evaluate # Import the evaluate library
import torch
import json

## Load Dataset

In [6]:
df = pd.read_csv('GMB_dataset.txt', sep='\t', quoting=3, encoding='latin-1')
df = df.dropna(subset=['Sentence #', 'Word', 'Tag'])

## Group by sentences

In [7]:
sentences = []
labels = []
for _, group in df.groupby('Sentence #'):
    sentences.append(list(group['Word']))
    labels.append(list(group['Tag']))

## Label list & mapping

In [9]:
label_list = sorted(set(label for doc in labels for label in doc))
label_to_id = {l: i for i, l in enumerate(label_list)}
id_to_label = {i: l for l, i in label_to_id.items()}
# Save to JSON files
with open("label2id.json", "w") as f:
    json.dump(label_to_id, f, indent=4)

with open("id2label.json", "w") as f:
    json.dump(id_to_label, f, indent=4)

## train/val/test Split

In [10]:
sents_trainval, sents_test, labels_trainval, labels_test = train_test_split(sentences, labels, test_size=0.1, random_state=42)
sents_train, sents_val, labels_train, labels_val = train_test_split(sents_trainval, labels_trainval, test_size=0.1111, random_state=42)  # 0.1111 * 0.9 ≈ 0.1 total

print(f"Train size: {len(sents_train)}, Val size: {len(sents_val)}, Test size: {len(sents_test)}")

Train size: 2399, Val size: 300, Test size: 300


## Make Huggingface Datasets

In [11]:
train_ds = Dataset.from_dict({"tokens": sents_train, "tags": labels_train})
val_ds = Dataset.from_dict({"tokens": sents_val, "tags": labels_val})
test_ds = Dataset.from_dict({"tokens": sents_test, "tags": labels_test})

## Load tokenizer and model

In [8]:
model_name = "bert-base-cased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
config = BertConfig.from_pretrained(model_name, hidden_dropout_prob=0.3, attention_probs_dropout_prob=0.3, num_labels=len(label_list))
model = BertForTokenClassification.from_pretrained(model_name, config=config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenize and align labels function

In [9]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True, padding=True)

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # special tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[label[word_idx]])
            else:
                # For wordpieces, label with -100 to ignore in loss or repeat label if you want
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_ds = train_ds.map(tokenize_and_align_labels, batched=True)
val_ds = val_ds.map(tokenize_and_align_labels, batched=True)
test_ds = test_ds.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/2399 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

## Setup metric

In [10]:
from evaluate import load # Import the load function

metric = load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=2)

    true_labels = [[id_to_label[l] for l in label if l != -100] for label in labels]
    true_preds = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(preds, labels)
    ]

    results = metric.compute(predictions=true_preds, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script: 0.00B [00:00, ?B/s]

## Training arguments

In [11]:
training_args = TrainingArguments(
    output_dir="./ner_bert_gmb",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
)

## Trainer

In [12]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


## Train

In [13]:
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnicholashosw06[0m ([33mnicholashosw06-nanyang-polytechnic[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.214,0.174269,0.698264,0.75578,0.725885,0.955808
2,0.1525,0.163742,0.742156,0.786127,0.763509,0.958495
3,0.1322,0.16236,0.731507,0.771676,0.751055,0.958346


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=450, training_loss=0.2280308903588189, metrics={'train_runtime': 159.2485, 'train_samples_per_second': 45.194, 'train_steps_per_second': 2.826, 'total_flos': 268162047850878.0, 'train_loss': 0.2280308903588189, 'epoch': 3.0})

## Evaluate on test set

In [14]:
results = trainer.evaluate(test_ds)
print("Test set results:", results)

Test set results: {'eval_loss': 0.13457486033439636, 'eval_precision': 0.7412587412587412, 'eval_recall': 0.7636887608069164, 'eval_f1': 0.752306600425834, 'eval_accuracy': 0.9624003609565348, 'eval_runtime': 1.5714, 'eval_samples_per_second': 190.909, 'eval_steps_per_second': 12.091, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))


## Evaluate on actual sentences

In [17]:
from transformers import pipeline

model.config.id2label = id_to_label
model.config.label2id = label_to_id
# 1. Create a NER pipeline using your trained model and tokenizer
ner_pipeline = pipeline(
    "ner",
    model=trainer.model,          # use the trained model directly
    tokenizer=tokenizer,
    aggregation_strategy="simple" # groups subword tokens nicely
)

# 2. Sample sentences to test
test_sentences = [
    "London is a big city in the United Kingdom.",
    "Barack Obama was the 44th president of the USA.",
    "I love pizza and coding!"
]

# 3. Run NER on each sentence and print results
for sentence in test_sentences:
    print(f"Sentence: {sentence}")
    ner_results = ner_pipeline(sentence)
    print("Entities found:")
    for entity in ner_results:
        print(f"  - {entity['entity_group']} '{entity['word']}' (score: {entity['score']:.2f})")
    print()

Device set to use cuda:0


Sentence: London is a big city in the United Kingdom.
Entities found:
  - geo 'London' (score: 0.95)
  - geo 'United Kingdom' (score: 0.96)

Sentence: Barack Obama was the 44th president of the USA.
Entities found:
  - per 'Barack Obama' (score: 0.97)
  - geo 'USA' (score: 0.88)

Sentence: I love pizza and coding!
Entities found:



## Download files (colab only)

In [16]:
import os
output_dir = "./ner_bert_gmb"
checkpoint_path = os.path.join(output_dir, "checkpoint-450")

if os.path.exists(checkpoint_path):
    !zip -r ner_bert_gmb.zip {checkpoint_path}
    from google.colab import files
    files.download("ner_bert_gmb.zip")
else:
    print(f"Checkpoint directory not found at: {checkpoint_path}")

  adding: ner_bert_gmb/checkpoint-450/ (stored 0%)
  adding: ner_bert_gmb/checkpoint-450/config.json (deflated 61%)
  adding: ner_bert_gmb/checkpoint-450/trainer_state.json (deflated 72%)
  adding: ner_bert_gmb/checkpoint-450/tokenizer.json (deflated 70%)
  adding: ner_bert_gmb/checkpoint-450/model.safetensors (deflated 7%)
  adding: ner_bert_gmb/checkpoint-450/vocab.txt (deflated 49%)
  adding: ner_bert_gmb/checkpoint-450/training_args.bin (deflated 52%)
  adding: ner_bert_gmb/checkpoint-450/tokenizer_config.json (deflated 75%)
  adding: ner_bert_gmb/checkpoint-450/optimizer.pt (deflated 23%)
  adding: ner_bert_gmb/checkpoint-450/special_tokens_map.json (deflated 42%)
  adding: ner_bert_gmb/checkpoint-450/scheduler.pt (deflated 56%)
  adding: ner_bert_gmb/checkpoint-450/rng_state.pth (deflated 25%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>