<a href="https://colab.research.google.com/github/PradipNichite/Youtube-Tutorials/blob/main/Spacy_Custom_NER_Youtube.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --no-deps spacy-transformers 'spacy_alignments'
!python -m spacy download en_core_web_trf
!pip install transformers>=4.36.0 huggingface_hub>=0.19.0 accelerate>=0.25.0

import spacy
import torch
import cupy
import sys
import numpy as np
import scipy
import scipy.special
from spacy.training import Example
from spacy.util import minibatch, compounding, filter_spans
from spacy.tokens import DocBin
from datasets import load_dataset
from pathlib import Path
from tqdm import tqdm
import random
import json
import warnings
warnings.filterwarnings('ignore')

print(f"spaCy version: {spacy.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"CuPy version: {cupy.__version__}")

nlp = spacy.load("en_core_web_trf")
doc = nlp("Hello world")
print("✅ Tutto funzionante!")

Collecting spacy-transformers
  Downloading spacy_transformers-1.3.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting spacy_alignments
  Downloading spacy_alignments-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.6 kB)
Downloading spacy_transformers-1.3.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (758 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m758.8/758.8 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading spacy_alignments-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (314 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.2/314.2 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spacy-transformers, spacy_alignments
Successfully installed spacy-transformers-1.3.9 spacy_alignments-0.9.2
Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/dow

In [2]:
print("CuPy version:", cupy.__version__)
print("CUDA available:", cupy.cuda.is_available())
print("Torch GPU disponibile?", torch.cuda.is_available())
print("Torch versione:", torch.__version__)
print("CUDA supportata da torch:", torch.version.cuda)

CuPy version: 13.6.0
CUDA available: True
Torch GPU disponibile? True
Torch versione: 2.6.0+cu124
CUDA supportata da torch: 12.4


In [3]:
with open('/kaggle/input/dataset-rev-maiusc/medical_dataset_NER_training.json', 'r') as file:
    dataset = json.load(file)

medical_dataset = dataset['DatasetDict']['medical_consultations']['Dataset']['data']

print(medical_dataset[0]['tokens'])

['Dear', 'doctor', 'i', 'hope', 'this', 'letter', 'finds', 'you', 'well', 'I', 'am', 'seeking', 'your', 'expert', 'opinion', 'on', 'a', 'patient', 'who', 'has', 'been', 'referred', 'to', 'our', "library's", 'community', 'health', 'program', 'for', 'further', 'evaluation', 'The', 'individual', 'diagnosed', 'with', 'peripheral', 'arterial', 'disease', 'is', 'experiencing', 'recurring', 'episodes', 'of', 'mild', 'discomfort', 'in', 'their', 'lower', 'back', 'which', 'appears', 'to', 'be', 'exacerbated', 'by', 'prolonged', 'sitting', 'or', 'standing', 'They', 'have', 'tried', 'various', 'over-the-counter', 'remedies', 'without', 'significant', 'improvement', 'As', 'a', 'healthcare', 'professional', 'myself', 'i', 'am', 'particularly', 'interested', 'in', 'understanding', 'how', 'this', 'issue', 'may', 'impact', 'their', 'daily', 'life', 'and', 'overall', 'well-being', 'Could', 'you', 'please', 'provide', 'guidance', 'on', 'potential', 'causes', 'and', 'any', 'recommended', 'treatments', 'o

In [4]:
def convert_to_spacy_format(dataset_split):
    spacy_data = []

    for example in dataset_split:
        temp_dict = {}
        tokens = example['tokens']
        ent_tags = example['ent_tags']

        text = " ".join(tokens)

        temp_dict['text'] = text
        temp_dict['entities'] = []

        current_pos = 0

        for i in range(len(tokens)):
            token = tokens[i]

            start_idx = current_pos
            end_idx = start_idx + len(token)

            if ent_tags[i] != 'O':
                extracted_token = text[start_idx:end_idx]
                if extracted_token != token:
                    print("sbagliato!")

                temp_dict['entities'].append((start_idx, end_idx, ent_tags[i]))

            current_pos = end_idx + 1

        spacy_data.append(temp_dict)

    return spacy_data

In [5]:
training_data = convert_to_spacy_format(medical_dataset)

In [6]:
nlp = spacy.blank("en")
doc_bin = DocBin()

In [7]:
for training_example in tqdm(training_data):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")

100%|██████████| 48056/48056 [00:32<00:00, 1488.19it/s]


In [8]:
nlp = spacy.blank("en")

input_path = Path("./train.spacy")
train_path = Path("./train_split.spacy")
dev_path = Path("./dev_split.spacy")

doc_bin = DocBin().from_disk(input_path)
docs = list(doc_bin.get_docs(nlp.vocab))
random.seed(42)
random.shuffle(docs)

split_idx = int(len(docs) * 0.8)
train_docs = docs[:split_idx]
dev_docs = docs[split_idx:]

train_bin = DocBin()
for doc in train_docs:
    train_bin.add(doc)
train_bin.to_disk(train_path)

dev_bin = DocBin()
for doc in dev_docs:
    dev_bin.add(doc)
dev_bin.to_disk(dev_path)

print(f"Total docs: {len(docs)}")
print(f"Train: {len(train_docs)} → {train_path}")
print(f"Dev: {len(dev_docs)} → {dev_path}")

Total docs: 48056
Train: 38444 → train_split.spacy
Dev: 9612 → dev_split.spacy


In [9]:
!python -m spacy init fill-config /kaggle/input/dataset-training/base_config.cfg config.cfg

[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [10]:
!python -m spacy train /kaggle/working/config.cfg --output ./ --paths.train ./train_split.spacy --paths.dev ./dev_split.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2025-10-25 15:49:05,685] [INFO] Set up nlp object from config
[2025-10-25 15:49:05,706] [INFO] Pipeline: ['transformer', 'ner']
[2025-10-25 15:49:05,710] [INFO] Created vocabulary
[2025-10-25 15:49:05,710] [INFO] Finished initializing nlp object
tokenizer_config.json: 100%|██████████████████| 25.0/25.0 [00:00<00:00, 227kB/s]
config.json: 100%|█████████████████████████████| 481/481 [00:00<00:00, 4.34MB/s]
vocab.json: 100%|████████████████████████████| 899k/899k [00:00<00:00, 5.43MB/s]
merges.txt: 100%|████████████████████████████| 456k/456k [00:00<00:00, 2.76MB/s]
tokenizer.json: 100%|██████████████████████| 1.36M/1.36M [00:00<00:00, 5.79MB/s]
2025-10-25 15:49:15.516936: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761407355.682128     141 cuda_dnn.cc:8310] Unab