## NLU Fine-Tuning Notebook with spaCy

### Imports

In [None]:
import spacy
import json
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split
from spacy.cli.train import train
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import os
import spacy_transformers
import time
import ctypes
import torch
import matplotlib.pyplot as plt
import seaborn as sns

### Configure the environment

In [None]:
# I have a Blackwell GPU (RTX 5070, sm_120) on my local machine. To use it
# with spaCy, specific CUDA libraries have to be loaded manually.
# The function below sets up the environment so spaCy can use the GPU.
VENV_PATH = "/home/user/path/to/venv"
CUDA_LIB_PATH = f"{VENV_PATH}/lib/python3.12/site-packages/nvidia/cu13/lib"


def setup_blackwell_gpu():
    """Preload the CUDA 13 shared libraries and activate GPU 0 for spaCy.

    Each library in ``CUDA_LIB_PATH`` is loaded with ``RTLD_GLOBAL`` so its
    symbols are visible to libraries loaded afterwards. Loading stays
    best-effort: a library that cannot be loaded produces a warning instead
    of aborting, so a partially working setup is still attempted.

    ``CUDA_LIB_PATH`` is also prepended to ``LD_LIBRARY_PATH`` so that
    subprocesses started afterwards inherit it.

    Returns:
        The value returned by ``spacy.require_gpu(0)``.

    Raises:
        Whatever ``spacy.require_gpu`` raises when GPU 0 cannot be used.
    """
    import warnings

    libs = ["libnvrtc-builtins.so.13.0", "libnvrtc.so.13", "libcublas.so.13"]
    for lib in libs:
        lib_path = os.path.join(CUDA_LIB_PATH, lib)
        try:
            ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
        except OSError as exc:
            # Keep going, but make the failure visible: a silent miss here
            # only shows up later as an obscure CUDA error.
            warnings.warn(
                f"Could not preload {lib_path}: {exc}",
                RuntimeWarning,
                stacklevel=2,
            )

    # Only add the separator when there is an existing value; a trailing ':'
    # would add an empty entry to the library search path.
    existing = os.environ.get("LD_LIBRARY_PATH", "")
    os.environ["LD_LIBRARY_PATH"] = (
        f"{CUDA_LIB_PATH}:{existing}" if existing else CUDA_LIB_PATH
    )
    return spacy.require_gpu(0)

In [None]:
INPUT_FILE = "./data/nlu_dummy_dataset.json"
OUTPUT_DIR = "spacy_nlu"

# Read the whole labelled dataset into memory.
with open(INPUT_FILE, "r", encoding="utf-8") as dataset_file:
    data = json.load(dataset_file)

# Collect every distinct label once, in alphabetical order.
unique_labels = sorted({item["label"] for item in data})
print(f"Unique Labels: {unique_labels}")

### Convert to spaCy format

In [None]:
def convert(data_subset, output_path):
    """Serialize labelled examples into a spaCy ``DocBin`` file.

    Every item of ``data_subset`` must provide a ``"text"`` and a ``"label"``
    entry. The text is tokenized with a blank English pipeline, and the doc's
    ``cats`` gets a score of 0.0 for each entry of the global
    ``unique_labels`` and 1.0 for the item's own label.

    Args:
        data_subset: Sized iterable of dicts with ``"text"`` and ``"label"``.
        output_path: Destination path of the ``.spacy`` file.
    """
    tokenizer_pipeline = spacy.blank("en")
    serialized_docs = DocBin()

    for record in data_subset:
        raw_text, gold_label = record["text"], record["label"]

        example_doc = tokenizer_pipeline.make_doc(raw_text)

        # Start with all categories at zero, then switch on the gold one.
        scores = dict.fromkeys(unique_labels, 0.0)
        scores[gold_label] = 1.0
        example_doc.cats = scores

        serialized_docs.add(example_doc)

    serialized_docs.to_disk(output_path)
    print(f"Saved {output_path} with {len(data_subset)} examples.")

### Split Dataset

In [None]:
# Split Train/Val/Test (80/10/10), stratified by label.
# The splits are not bound to the name `train`, because that would overwrite
# the `train` function imported from spacy.cli.train in the imports cell.
train_data, holdout_data = train_test_split(
    data,
    test_size=0.2,
    stratify=[d["label"] for d in data],
    random_state=123456,
)
# Halve the 20% hold-out into validation and test sets.
val_data, test_data = train_test_split(
    holdout_data,
    test_size=0.5,
    stratify=[d["label"] for d in holdout_data],
    random_state=123456,
)

os.makedirs(OUTPUT_DIR, exist_ok=True)

convert(train_data, f"{OUTPUT_DIR}/train.spacy")
convert(val_data, f"{OUTPUT_DIR}/val.spacy")
convert(test_data, f"{OUTPUT_DIR}/test.spacy")

### Configure files and Fine-Tune

In [None]:
# Print details about the installed spaCy version and environment.
!python -m spacy info

In [None]:
# Expand base_config_nlu.cfg into a complete training config and write the
# result to config_nlu_gpu.cfg.
!python -m spacy init fill-config base_config_nlu.cfg config_nlu_gpu.cfg

In [None]:
# Expose the CUDA library directory to child processes; the %%bash training
# cell reads it back as $CUDA_PATH.
# NOTE(review): CUDA_PATH usually names the CUDA toolkit root rather than a
# lib directory - confirm nothing else in the environment relies on it.
os.environ["CUDA_PATH"] = CUDA_LIB_PATH

In [None]:
%%bash
# Put the CUDA library directory first on the loader search path.
export LD_LIBRARY_PATH=$CUDA_PATH:$LD_LIBRARY_PATH
# Train on GPU 0 with the train/val splits; results are written to ./output_nlu.
python -m spacy train config_nlu_gpu.cfg --output ./output_nlu --paths.train ./spacy_nlu/train.spacy --paths.dev ./spacy_nlu/val.spacy --gpu-id 0

### Inference with Test Set

In [None]:
# Even though the NLU model was trained on 130 real dataset examples, those
# are private and owned by Splendom Suites. This notebook therefore evaluates
# on a regular split of the synthetically generated dataset.

nlp = spacy.load("output_nlu/model-best")
doc_bin = DocBin().from_disk("spacy_nlu/test.spacy")
docs_test = list(doc_bin.get_docs(nlp.vocab))

# Gold label: the category with the highest stored score.
y_true = [max(doc.cats, key=doc.cats.get) for doc in docs_test]

# Predicted label: run every text through the pipeline exactly once.
# nlp.pipe processes the texts as a stream instead of one call per text.
y_pred = [
    max(predicted.cats, key=predicted.cats.get)
    for predicted in nlp.pipe(doc.text for doc in docs_test)
]

# Text Report
print("\n" + "="*40)
print("Test Metrics")
print("="*40)
print(classification_report(y_true, y_pred, digits=4))

# Include labels that only appear in the predictions; otherwise those
# predictions would be left out of the confusion matrix.
labels = sorted(set(y_true) | set(y_pred))
cm = confusion_matrix(y_true, y_pred, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

plt.figure(figsize=(10, 8))

# Create seaborn heatmap; fmt="d" prints the counts as plain integers
# instead of the default ".2g" format.
sns.heatmap(cm_df, annot=True, fmt="d", cbar=True)

plt.title('Confusion Matrix - Test Set\n')
plt.xlabel('\nPredict Value', fontsize=12)
plt.ylabel('Real Value\n', fontsize=12)

plt.show()

### Latency for specific queries.

In [None]:
# Force CPU execution for the latency measurements.
# spacy.prefer_gpu takes a GPU id, so prefer_gpu(False) is prefer_gpu(0) and
# would try to activate GPU 0 instead of disabling it.
spacy.require_cpu()

# Load CPU model.
nlp_cpu = spacy.load("./output_nlu/model-best")

speed_test_queries = [
    "I am looking for a flat in Calle Lagasca with a gym and 1 bedroom",
    "How many square feet does Splendom Chacao have?",
    "How much is 15 multiplied by 27?",
    "I want an apartment near Retiro Park with a swimming pool"
]

# NOTE(review): the first query may include one-off initialisation cost;
# consider a warm-up call if steady-state latency is what matters.
for text in speed_test_queries:
    print(f"Query: {text}")

    # perf_counter is a monotonic, high-resolution clock suited to timing.
    start_time = time.perf_counter()
    doc = nlp_cpu(text)
    end_time = time.perf_counter()

    print(f"Latency: {end_time - start_time:.5f} seconds")
    print("---")

In [None]:
# Release unused GPU memory cached by PyTorch's allocator.
torch.cuda.empty_cache()