<a href="https://colab.research.google.com/github/PradipNichite/Youtube-Tutorials/blob/main/Spacy_Custom_NER_Youtube.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --no-deps spacy-transformers 'spacy_alignments'
!python -m spacy download en_core_web_trf
!pip install transformers>=4.36.0 huggingface_hub>=0.19.0 accelerate>=0.25.0

import spacy
import torch
import cupy
import sys
import numpy as np
import scipy
import scipy.special
from spacy.training import Example
from spacy.util import minibatch, compounding, filter_spans
from spacy.tokens import DocBin
from datasets import load_dataset
from pathlib import Path
from tqdm import tqdm
import random
import json
import warnings
warnings.filterwarnings('ignore')

# Report the versions of the core libraries, one per line.
version_report = [
    f"spaCy version: {spacy.__version__}",
    f"PyTorch version: {torch.__version__}",
    f"CuPy version: {cupy.__version__}",
]
print("\n".join(version_report))

# Smoke test: load the transformer pipeline and run it on a short sentence.
nlp = spacy.load("en_core_web_trf")
doc = nlp("Hello world")
print("✅ Tutto funzionante!")

In [None]:
# Collect the GPU-related diagnostics as (label, value) pairs, then print
# each pair on its own line.
gpu_report = (
    ("CuPy version:", cupy.__version__),
    ("CUDA available:", cupy.cuda.is_available()),
    ("Torch GPU disponibile?", torch.cuda.is_available()),
    ("Torch versione:", torch.__version__),
    ("CUDA supportata da torch:", torch.version.cuda),
)
for label, value in gpu_report:
    print(label, value)

In [None]:
# Load the NER training data from JSON. The encoding is passed explicitly so
# that non-ASCII characters decode the same way on every platform instead of
# depending on the locale's default encoding.
with open('/kaggle/input/dataset-rev-maiusc/medical_dataset_NER_training.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# The list of examples sits under this nested path in the loaded JSON.
medical_dataset = dataset['DatasetDict']['medical_consultations']['Dataset']['data']

# Show the tokens of the first example as a quick sanity check.
print(medical_dataset[0]['tokens'])

In [None]:
def convert_to_spacy_format(dataset_split):
    """Convert token-level NER annotations into character-offset annotations.

    The tokens of each example are joined with single spaces to form the
    text. Every token whose tag is not ``'O'`` yields one
    ``(start, end, tag)`` entity covering exactly that token, where ``start``
    and ``end`` are character offsets into the joined text (``end``
    exclusive). Consecutive tagged tokens are NOT merged: each one produces
    its own entity tuple.

    Args:
        dataset_split: Iterable of mappings, each providing a ``'tokens'``
            sequence of strings and a parallel ``'ent_tags'`` sequence of
            tag strings.

    Returns:
        A list with one dict per example, of the form
        ``{'text': str, 'entities': [(start, end, tag), ...]}``.

    Raises:
        KeyError: If an example lacks ``'tokens'`` or ``'ent_tags'``.
        IndexError: If an example has fewer tags than tokens.
    """
    spacy_data = []

    for example in dataset_split:
        tokens = example['tokens']
        ent_tags = example['ent_tags']

        text = " ".join(tokens)
        entities = []

        # Offsets are derived from the same join that builds `text`, so
        # text[start_idx:end_idx] is always the token itself.
        current_pos = 0
        for i, token in enumerate(tokens):
            start_idx = current_pos
            end_idx = start_idx + len(token)

            if ent_tags[i] != 'O':
                entities.append((start_idx, end_idx, ent_tags[i]))

            # Step over the single space that join() puts after the token.
            current_pos = end_idx + 1

        spacy_data.append({'text': text, 'entities': entities})

    return spacy_data

In [None]:
# Convert the token/tag examples into dicts holding the joined text and its
# character-offset entity tuples.
training_data = convert_to_spacy_format(medical_dataset)

In [None]:
# Blank English pipeline; the following cell uses it to turn raw text into
# Doc objects via nlp.make_doc.
nlp = spacy.blank("en")
# Container that accumulates the annotated docs before they are written to disk.
doc_bin = DocBin()

In [None]:
# Turn every converted example into a Doc carrying its entity spans and
# collect the docs in doc_bin.
for record in tqdm(training_data):
    doc = nlp.make_doc(record['text'])

    spans = []
    for start, end, label in record['entities']:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            spans.append(span)
            continue
        # char_span produced no span for these offsets, so the entity is dropped.
        print("Skipping entity")

    # doc.ents must not contain overlapping spans; filter_spans removes
    # duplicates and overlaps before assignment.
    doc.ents = filter_spans(spans)
    doc_bin.add(doc)

# Persist the whole annotated corpus in spaCy's binary format.
doc_bin.to_disk("train.spacy")

In [None]:
# Split the serialized corpus into train and dev files (80/20) using a fixed
# random seed so the split is reproducible.
input_path = Path("./train.spacy")
train_path = Path("./train_split.spacy")
dev_path = Path("./dev_split.spacy")

# The pipeline's vocab is what get_docs needs to rebuild the Doc objects.
nlp = spacy.blank("en")
doc_bin = DocBin().from_disk(input_path)
docs = list(doc_bin.get_docs(nlp.vocab))

random.seed(42)
random.shuffle(docs)

# The first 80% of the shuffled docs go to training, the remainder to dev.
split_idx = int(len(docs) * 0.8)
train_docs, dev_docs = docs[:split_idx], docs[split_idx:]

train_bin = DocBin(docs=train_docs)
train_bin.to_disk(train_path)

dev_bin = DocBin(docs=dev_docs)
dev_bin.to_disk(dev_path)

split_summary = [
    f"Total docs: {len(docs)}",
    f"Train: {len(train_docs)} → {train_path}",
    f"Dev: {len(dev_docs)} → {dev_path}",
]
print("\n".join(split_summary))

In [None]:
# Run spaCy's `init fill-config` on the base config from the Kaggle input
# dataset; the result is written to config.cfg in the current working directory.
!python -m spacy init fill-config /kaggle/input/dataset-training/base_config.cfg config.cfg

In [None]:
# Train from the generated config on GPU 0, pointing the train/dev paths at
# the split files and writing output to the current directory.
# NOTE(review): the config is read from /kaggle/working/config.cfg while the
# other paths are relative — this assumes the notebook's working directory is
# /kaggle/working; confirm when running elsewhere.
!python -m spacy train /kaggle/working/config.cfg --output ./ --paths.train ./train_split.spacy --paths.dev ./dev_split.spacy --gpu-id 0