# Install Dependencies

In [2]:
!pip install transformers torch scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-macosx_10_13_x86_64.whl.metadata (31 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp312-cp312-macosx_10_13_x86_64.whl (12.1 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 threadpoolctl-3.6.0


# Establish Google Drive Connection (if needed)

In [None]:
# Mount Google Drive only when running inside Google Colab. The google.colab
# package does not exist elsewhere, so an unconditional import would abort a
# local "Restart & Run All".
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print("google.colab is not available; skipping Google Drive mount.")
else:
    IN_COLAB = True
    drive.mount('/content/drive')

# 1. Imports and Model Initialization

In [None]:
import sys
import os
import json
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification, pipeline
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from sklearn.cluster import AgglomerativeClustering


# Device setup: use CUDA when torch reports it as available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Folder containing the JSON files to load
val_data_path = "data/raw/dev" # change to local path


def load_json_dir(data_dir):
    """Load every JSON file found under ``data_dir`` (recursively).

    Args:
        data_dir: Path of the folder to walk.

    Returns:
        A list with one parsed JSON document per file, in a deterministic
        order (directories and file names are visited alphabetically).
        The list is empty when ``data_dir`` does not exist or has no files.

    Raises:
        json.JSONDecodeError: If a visited file does not contain valid JSON.
    """
    loaded = []
    for root, dirs, files in os.walk(data_dir):
        # Sorting in place makes os.walk descend in a reproducible order.
        dirs.sort()
        for file_name in sorted(files):
            # Skip hidden files such as macOS ".DS_Store", which are not JSON.
            if file_name.startswith("."):
                continue
            # Join with `root` (not the top-level folder) so files located in
            # subdirectories are opened from where they actually live.
            with open(os.path.join(root, file_name), "r", encoding="utf-8") as f:
                loaded.append(json.load(f))
    return loaded


# Load JSON files and store them in memory
val_data = load_json_dir(val_data_path)

# Collect each distinct entity label that appears in the "entity_label_set"
# field of any record, across all loaded datasets.
unique_label_set = {
    label
    for dataset in val_data
    for record in dataset
    for label in record["entity_label_set"]
}

# Sort so that label ids are stable from run to run.
entity_label_set = sorted(unique_label_set)

print("Extracted labels:")
print(entity_label_set)

# Build label mappings (label -> index and index -> label)
# NOTE(review): only entity labels are mapped here; there is no explicit
# "outside"/non-entity class. Confirm this is intended for token classification.
label2id = dict(zip(entity_label_set, range(len(entity_label_set))))
id2label = dict(enumerate(entity_label_set))


# Checkpoint used for the tokenizer, the configuration and the model weights
model_name = "SpanBERT/spanbert-large-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Start from the checkpoint's configuration and override the label-related
# fields so the token classification head matches the extracted label set.
config = AutoConfig.from_pretrained(model_name)
head_settings = {
    "num_labels": len(entity_label_set),
    "id2label": id2label,
    "label2id": label2id,
}
for attr_name, attr_value in head_settings.items():
    setattr(config, attr_name, attr_value)

# Build the token classification model with the adjusted config and move it
# to the selected device.
model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)
model.to(device)


Extracted labels:
['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MISC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']


'\n# Load SpanBERT configuration and set up token classification head\nmodel_name = "SpanBERT/spanbert-large-cased"\nconfig = AutoConfig.from_pretrained(model_name)\nconfig.num_labels = len(entity_label_set)\nconfig.id2label = id2label\nconfig.label2id = label2id\n\ntokenizer = AutoTokenizer.from_pretrained(model_name)\nmodel = AutoModelForTokenClassification.from_pretrained(model_name, config=config)\nmodel.to(device)\n'

# 2. Baseline NER with Untrained Model on Validation (Dev) Data

In [None]:
# Baseline inference pipeline built on the model and tokenizer loaded above.
# The pipeline takes a device index: 0 when CUDA is available, -1 for the CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    device=pipeline_device,
    aggregation_strategy="simple",
)

# Prepare raw texts: one list of document strings per loaded dataset.
# NOTE: despite the name, these texts come from `val_data`.
train_texts = [[ex['doc'] for ex in dataset] for dataset in val_data]

# Run baseline NER: each dataset's list of documents is passed to the pipeline
# in a single call.
baseline_results = [ner_pipeline(texts) for texts in train_texts]

# Display first example
print(baseline_results[0])


def _json_default(value):
    """Convert values the json module cannot encode into native Python ones.

    The pipeline output may carry NumPy scalars (e.g. the ``score`` field),
    which ``json.dump`` rejects with a TypeError; anything exposing ``.item()``
    is unwrapped to the equivalent built-in number.
    """
    if hasattr(value, "item"):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


baseline_output_path = "/content/drive/MyDrive/dataset/ner_baseline_output.json"

# Write as UTF-8 explicitly because ensure_ascii=False emits raw non-ASCII text.
with open(baseline_output_path, "w", encoding="utf-8") as f:
    json.dump(baseline_results, f, ensure_ascii=False, indent=4, default=_json_default)