## NER Fine-Tuning Notebook with spaCy

### Imports

In [None]:
import os
import ctypes
import torch
import spacy
import json
import torch
import spacy_transformers
import time
from spacy.tokens import DocBin
from spacy.training import Example
from sklearn.model_selection import train_test_split

### Configure the environment

In [None]:
'''
I have a Blackwell GPU (RTX 5070, sm_120) on my local machine. To use it with SpaCy, I need to load specific CUDA libraries manually.
The following function sets up the environment to use the GPU with SpaCy.
'''
# NOTE(review): looks like a placeholder — point this at the virtual
# environment that actually contains the NVIDIA CUDA wheels.
VENV_PATH = "/home/user/path/to/venv"
# Directory the CUDA shared libraries are loaded from by setup_blackwell_gpu().
# The "python3.12" and "cu13" path segments are hard-coded and must match the
# interpreter and CUDA wheel versions installed in the venv.
CUDA_LIB_PATH = f"{VENV_PATH}/lib/python3.12/site-packages/nvidia/cu13/lib"

def setup_blackwell_gpu():
    """Preload the CUDA 13 runtime libraries and activate GPU 0 for spaCy.

    Each library is opened from ``CUDA_LIB_PATH`` with ``RTLD_GLOBAL`` so its
    symbols are visible to libraries loaded afterwards. ``CUDA_LIB_PATH`` is
    also put at the front of ``LD_LIBRARY_PATH``; the dynamic loader reads
    that variable at process start-up, so this mainly benefits child
    processes (``!`` / ``%%bash`` cells) that inherit the environment.

    Loading is best effort: a library that cannot be opened is reported and
    skipped rather than aborting the setup.

    Returns:
        The result of ``spacy.require_gpu(0)``.

    Raises:
        Whatever ``spacy.require_gpu`` raises when GPU 0 cannot be activated.
    """
    libs = ("libnvrtc-builtins.so.13.0", "libnvrtc.so.13", "libcublas.so.13")
    for lib in libs:
        lib_path = os.path.join(CUDA_LIB_PATH, lib)
        try:
            ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
        except OSError as err:
            # Keep going, but surface the failure: a silently missing library
            # is very hard to diagnose once GPU initialisation fails later.
            print(f"Warning: could not preload {lib_path}: {err}")

    existing = os.environ.get("LD_LIBRARY_PATH", "")
    # Skip the update when the directory is already listed so that re-running
    # the cell does not keep growing the variable.
    if CUDA_LIB_PATH not in existing.split(os.pathsep):
        # Only append the separator when there is something to separate: a
        # trailing empty entry makes the loader search the current directory.
        if existing:
            os.environ["LD_LIBRARY_PATH"] = f"{CUDA_LIB_PATH}{os.pathsep}{existing}"
        else:
            os.environ["LD_LIBRARY_PATH"] = CUDA_LIB_PATH
    return spacy.require_gpu(0)

### Convert to spaCy format

In [None]:
def convert_spacy(path, outputfile):
    """Convert an annotated JSON file into a serialized spaCy ``DocBin``.

    Args:
        path: JSON file whose top-level ``"annotated"`` list holds items with
            a ``"text"`` string and an ``"entities"`` list; every entity has
            ``"start"`` / ``"end"`` character offsets and a ``"label"``.
        outputfile: Destination path of the ``.spacy`` file.

    Entities whose offsets do not line up with token boundaries are first
    shrunk to the enclosed tokens (``contract``) and, failing that, grown to
    the touched tokens (``expand``). Entities that still cannot be aligned,
    and entities that overlap a longer one, are dropped; the number of dropped
    entities is reported at the end.
    """
    nlp = spacy.blank("en")
    docbin = DocBin()
    # The dataset splits are written as UTF-8 with ensure_ascii=False, so read
    # them back with the same encoding instead of the platform default.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    dropped = 0
    for item in data["annotated"]:
        doc = nlp.make_doc(item["text"])
        spans = []
        for entity in item["entities"]:
            start, end, label = entity["start"], entity["end"], entity["label"]
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                span = doc.char_span(start, end, label=label, alignment_mode="expand")
            if span is None:
                dropped += 1
                continue
            spans.append(span)
        # doc.ents rejects overlapping spans, which "expand" can easily
        # produce; keep the longest span of every overlapping group.
        kept = spacy.util.filter_spans(spans)
        dropped += len(spans) - len(kept)
        doc.ents = kept
        docbin.add(doc)
    docbin.to_disk(outputfile)
    print(f"Created: {outputfile}")
    if dropped:
        print(f"Warning: dropped {dropped} unalignable or overlapping entities from {path}")

### Split Dataset

In [None]:
def split_and_save_dataset(input_json_path, output_dir, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, random_seed=123456):
    """Split an annotated JSON dataset into train/val/test JSON files.

    Args:
        input_json_path: JSON file with top-level ``"classes"`` and
            ``"annotated"`` keys.
        output_dir: Directory that receives ``train.json``, ``val.json`` and
            ``test.json``; it is created when missing.
        train_ratio: Passed unchanged to ``train_test_split`` as
            ``train_size`` for the first split.
        val_ratio: Weight of the validation set within the remainder.
        test_ratio: Weight of the test set within the remainder.
        random_seed: Seed used for both shuffled splits.

    The remainder left after the train split is divided in the proportion
    ``val_ratio : test_ratio``; the three ratios are not required to sum to 1.
    Every output file repeats the ``"classes"`` value of the input.

    Raises:
        ValueError: If ``val_ratio`` or ``test_ratio`` is not positive.
    """
    # Fail early with a clear message: zero or negative weights would
    # otherwise surface as a ZeroDivisionError or an opaque sklearn error.
    if val_ratio <= 0 or test_ratio <= 0:
        raise ValueError(
            f"val_ratio and test_ratio must both be positive, got {val_ratio} and {test_ratio}"
        )

    with open(input_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    classes = data['classes']
    annotated = data['annotated']

    train_data, temp_data = train_test_split(annotated, train_size=train_ratio, random_state=random_seed, shuffle=True)

    # Share of the remaining samples that goes to validation.
    val_relative = val_ratio / (val_ratio + test_ratio)

    val_data, test_data = train_test_split(temp_data, train_size=val_relative, random_state=random_seed, shuffle=True)

    # An empty output_dir means "current directory", not the filesystem root.
    target_dir = output_dir or "."
    os.makedirs(target_dir, exist_ok=True)

    def save_json(data_subset, filename):
        """Write one split, together with the class list, as UTF-8 JSON."""
        out_data = {
            'classes': classes,
            'annotated': data_subset
        }
        with open(os.path.join(target_dir, filename), 'w', encoding='utf-8') as fw:
            json.dump(out_data, fw, ensure_ascii=False, indent=2)
        print(f"Saved {filename} with {len(data_subset)} samples")

    # Save the three splits.
    save_json(train_data, "train.json")
    save_json(val_data, "val.json")
    save_json(test_data, "test.json")

In [None]:
# Split the raw dataset into train/val/test JSON files under ./data.
split_and_save_dataset("./data/dummy_dataset_NER.json", "./data")

# Convert every split to spaCy's binary DocBin format under ./spacy.
os.makedirs("./spacy", exist_ok=True)
for json_path, spacy_path in (
    ("./data/train.json", "./spacy/train.spacy"),
    ("./data/val.json", "./spacy/val.spacy"),
    ("./data/test.json", "./spacy/test.spacy"),
):
    convert_spacy(json_path, spacy_path)

### Configure files and Fine-Tune

In [None]:
# Print details about the local spaCy installation (useful to confirm the
# environment before training).
!python -m spacy info

In [None]:
# Expand the partial ./base_config.cfg with default values for every unset
# setting and write the complete training config to ./config.cfg.
!python -m spacy init fill-config ./base_config.cfg ./config.cfg

In [None]:
%%bash
# NOTE(review): CUDA_PATH is not defined by any cell visible in this notebook
# (the Python side names the library directory CUDA_LIB_PATH). If it is unset
# in the environment, the line below only adds an empty entry to
# LD_LIBRARY_PATH — confirm where CUDA_PATH comes from.
export LD_LIBRARY_PATH=$CUDA_PATH:$LD_LIBRARY_PATH
# Train with config.cfg on GPU 0, using the converted train/val splits, and
# write the resulting pipelines under ./output.
python -m spacy train config.cfg --output ./output --paths.train ./spacy/train.spacy --paths.dev ./spacy/val.spacy --gpu-id 0

### Inference with Test Set

In [None]:
# Load the best checkpoint written by `spacy train` and the held-out test set.
nlp = spacy.load("./output/model-best")
doc_bin = DocBin().from_disk("./spacy/test.spacy")

# Evaluate the model.
# Pair every gold document with an unannotated copy of its text:
# nlp.evaluate() runs the pipeline over the predicted side itself, so calling
# nlp(doc.text) here would only make inference run twice.
examples = [Example(nlp.make_doc(doc.text), doc) for doc in doc_bin.get_docs(nlp.vocab)]
scores = nlp.evaluate(examples)

print("\n" + "="*40)
print("NER Metrics - Test Set")
print("="*40)
print(f"OVERALL: F1 {scores['ents_f']:.4f} | Precision {scores['ents_p']:.4f} | Recall {scores['ents_r']:.4f}")

# Per-label precision / recall / F1 as reported by the scorer.
print("\nMetrics per entity type:")
for ent, m in scores['ents_per_type'].items():
    print(f"[{ent.upper()}]: P: {m['p']:.4f} | R: {m['r']:.4f} | F1: {m['f']:.4f}")

### Latency for specific queries

In [None]:
# Force CPU execution before the pipeline is loaded. spacy.prefer_gpu(False)
# does not do this: its only parameter is gpu_id, so False is read as device 0
# and the GPU is activated whenever one is available.
spacy.require_cpu()

# Load CPU model.
nlp_cpu = spacy.load("./output/model-best")

speed_test_queries = [
    "I am looking for a flat in Calle Lagasca with a gym and 1 bedroom",
    "How many square feet does Splendom Chacao have?",
    "How much is 15 multiplied by 27?",
    "I want an apartment near Retiro Park with a swimming pool"
]

# Untimed warm-up call so one-off initialisation cost is not attributed to the
# first measured query.
nlp_cpu(speed_test_queries[0])

for text in speed_test_queries:
    print(f"Query: {text}")

    # perf_counter is a monotonic, high-resolution clock suited to short
    # intervals.
    start_time = time.perf_counter()
    doc = nlp_cpu(text)
    end_time = time.perf_counter()

    print(f"Latency: {end_time - start_time:.5f} seconds")
    print("---")

In [None]:
# Ask PyTorch to release the unused GPU memory held by its caching allocator
# now that training and evaluation are finished.
torch.cuda.empty_cache()