# Install Dependencies

In [1]:
!pip install transformers torch scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

# Establish Google Drive Connection (if needed)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Helper Functions

## Load Data
Loads all json files in a specified path and combines them in one aggregated list

In [3]:
import json
import os

def load_json_data(folder_path):
    aggregated_data = []

    # loop through all files in the given folder
    for root, dirs, files in os.walk(folder_path):
        for file_name in files:
            with open(f"{folder_path}/{file_name}", "r") as f:
                data = json.load(f)

            aggregated_data.append(data)

    return aggregated_data

# 1. Imports and Model Initialization

In [4]:
import sys
import torch
import numpy as np
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification, pipeline, DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from sklearn.cluster import DBSCAN

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load JSON files and store them in memory
val_data_path = "drive/MyDrive/dataset/dev" # change to local path
val_data = load_json_data(val_data_path)

# build a list with all unique entity labels
unique_label_set = set()

for dataset in val_data:
    for record in dataset:
        for label in record["entity_label_set"]:
            unique_label_set.add(label)

entity_label_set = sorted(list(unique_label_set))

print("Extracted labels:")
print(entity_label_set)

# Build label mappings
label2id = {label: i for i, label in enumerate(entity_label_set)}
id2label = {i: label for label, i in label2id.items()}


# Load SpanBERT configuration and set up token classification head
model_name = "SpanBERT/spanbert-large-cased"
config = AutoConfig.from_pretrained(model_name)
config.num_labels = len(entity_label_set)
config.id2label = id2label
config.label2id = label2id

max_len = 512

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = max_len
model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)
model.to(device)

# Span classification head: from pooled span embeddings to entity labels
# defined here in the Imports and Model Initialization cell, immediately after model.to(device)
import torch.nn as nn
span_classifier = nn.Linear(config.hidden_size, config.num_labels).to(device)  # line ~20 in this cell
# Loss function for span classification
criterion = nn.CrossEntropyLoss()  # line ~21 in this cell
import torch.nn as nn
span_classifier = nn.Linear(config.hidden_size, config.num_labels).to(device)
# Loss function for span classification
criterion = nn.CrossEntropyLoss()


Extracted labels:
['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MISC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']


config.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/665M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/665M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at SpanBERT/spanbert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 2. Baseline NER with Untrained Model on Training Data

In [5]:
# Initialize a token-classification pipeline for baseline inference
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    aggregation_strategy="simple"
)

# Run baseline NER
baseline_results = []

for dataset in val_data:
    dataset_results = []
    for ex in dataset:
        result = ner_pipeline(ex["doc"])
        dataset_results.append(
            (ex["title"], result)
        )

    baseline_results.append(
        (dataset[0]["domain"], dataset_results)
    )

# Display first example
print(baseline_results[0])

# Convert NumPy float32 to native Python floats before JSON serialization
def convert_numpy_floats(obj):
    if isinstance(obj, np.float32):
        return float(obj)
    raise TypeError

# Dump the modified results to JSON
with open(f"/content/drive/MyDrive/dataset/ner_baseline_output.json", "w") as f:
    json.dump(baseline_results, f, ensure_ascii=False, indent=1, default=convert_numpy_floats)

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset




# 4. Load Training Data

In [6]:
train_data_path = "drive/MyDrive/dataset/dev" # change depending on

train_data = []

# loop through all files in the given folder
for root, dirs, files in os.walk(train_data_path):
    for file_name in files:
        with open(f"{train_data_path}/{file_name}", "r") as f:
            data = json.load(f)

        for ex in data:
            train_data.append(ex)

print(train_data)




# 5. Preparing Span-based Training Data and Coreference Grouping Preparing Span-based Training Data and Coreference Grouping

In [7]:
max_span_length = 10

def generate_candidate_spans(offsets, max_len_spans=max_span_length):
    spans = []
    for i in range(len(offsets)):
        for j in range(i, min(i + max_len_spans, len(offsets))):
            spans.append((i, j))
    return spans

class SpanNERDataset(Dataset):
    def __init__(self, examples, tokenizer, label2id, max_span_length=10, max_len=512):
        self.examples = examples
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_span_length = max_span_length
        self.max_len = max_len

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        ex = self.examples[idx]
        encoding = self.tokenizer(
            ex['doc'],
            truncation=True,
            max_length=self.max_len,
            return_offsets_mapping=True
        )
        offsets = encoding.pop('offset_mapping')
        spans = generate_candidate_spans(offsets, self.max_span_length)
        span_labels = []
        for s, e in spans:
            label = 0
            for ent in ex['entities']:
                for ment in ent['mentions']:
                    start_idx = ex['doc'].find(ment)
                    if start_idx >= 0 and offsets[s][0] >= start_idx and offsets[e][1] <= start_idx + len(ment):
                        label = self.label2id[ent['type']]
                        break
                if label != 0:
                    break
            span_labels.append(label)

        return {
            'input_ids': torch.tensor(encoding['input_ids']),
            'attention_mask': torch.tensor(encoding['attention_mask']),
            'spans': torch.tensor(spans),
            'span_labels': torch.tensor(span_labels)
        }

train_dataset = SpanNERDataset(train_data, tokenizer, label2id, max_span_length)

# Use DataCollatorWithPadding for tokens; collate spans separately
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="pt")

def collate_fn(batch):
    # 1) extract only the token inputs
    token_inputs = [
        {"input_ids": item["input_ids"], "attention_mask": item["attention_mask"]}
        for item in batch
    ]
    # 2) pad the token inputs uniformly
    batch_tokens = data_collator(token_inputs)
    # 3) re-attach the variable-length spans and labels
    batch_tokens["spans"] = [item["spans"] for item in batch]
    batch_tokens["span_labels"] = [item["span_labels"] for item in batch]
    return batch_tokens

train_loader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn
)

## 5a. Entity Grouping / Coreference Grouping

In [8]:
model.eval()
all_span_embeddings, all_span_labels = [], []
with torch.no_grad():
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model.base_model(input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state
        for i, spans in enumerate(batch['spans']):
            for (s, e) in spans:
                emb = hidden[i, s:e+1].mean(dim=0)
                all_span_embeddings.append(emb.cpu().numpy())
        for labels in batch['span_labels']:
            all_span_labels.extend(labels.numpy())
clustering = DBSCAN(eps=1.0, min_samples=2, metric='euclidean')
clusters = clustering.fit_predict(all_span_embeddings)

# 6. Model Training

In [11]:
optimizer = AdamW(list(model.parameters()) + list(span_classifier.parameters()), lr=5e-5)
epochs = 10
model.train()
span_classifier.train()
for epoch in range(epochs):
    total_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        spans_batch = batch['spans']       # list of tensors [(num_spans,2), ...]
        labels_batch = batch['span_labels'] # list of tensors [(num_spans), ...]

        optimizer.zero_grad()
        # Base model forward
        outputs = model.base_model(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # (B, L, H)

        # Prepare span embeddings and labels
        all_logits = []
        all_labels = []
        for i, spans in enumerate(spans_batch):
            hs = hidden_states[i]  # (L, H)
            span_embs = []
            for (s, e) in spans:
                # mean pooling over the span tokens
                span_emb = hs[s:e+1].mean(dim=0)
                span_embs.append(span_emb)
            span_embs = torch.stack(span_embs, dim=0)  # (num_spans, H)
            logits = span_classifier(span_embs)         # (num_spans, num_labels)
            all_logits.append(logits)
            all_labels.append(labels_batch[i].to(device))

        # Concatenate across batch
        logits_cat = torch.cat(all_logits, dim=0)
        labels_cat = torch.cat(all_labels, dim=0)
        # Compute loss
        loss = criterion(logits_cat, labels_cat)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

Epoch 1/10, Loss: 0.4832
Epoch 2/10, Loss: 0.4427
Epoch 3/10, Loss: 0.4040
Epoch 4/10, Loss: 0.3858
Epoch 5/10, Loss: 0.3793
Epoch 6/10, Loss: 0.3675
Epoch 7/10, Loss: 0.3415
Epoch 8/10, Loss: 0.3471
Epoch 9/10, Loss: 0.3262
Epoch 10/10, Loss: 0.3343


# 7. NER on Validation Data

In [12]:
er_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    aggregation_strategy="simple"
)

training_results = []

for dataset in val_data:
    dataset_results = []
    for ex in dataset:
        result = er_pipeline(ex["doc"])
        dataset_results.append(
            (ex["title"], result)
        )

    training_results.append(
        (dataset[0]["domain"], dataset_results)
    )

print(training_results[0])

# Dump the modified results to JSON
with open(f"/content/drive/MyDrive/dataset/ner_training_output.json", "w") as f:
    json.dump(baseline_results, f, ensure_ascii=False, indent=1, default=convert_numpy_floats)

# loader_val = FileLoader('/mnt/data/validation')
# valid_examples = loader_val.load()
# valid_texts = [ex['doc'] for ex in valid_examples]
# valid_results = []
# for text in valid_texts:
#     enc = tokenizer(text, truncation=True, max_length=max_len)
#     tokens = tokenizer.convert_ids_to_tokens(enc['input_ids'], skip_special_tokens=True)
#     txt = tokenizer.convert_tokens_to_string(tokens)
#     valid_results.append(er_pipeline(txt))
# print(valid_results[0])

Device set to use cuda:0


