# Imports & Downloads

In [1]:
import sys
import subprocess

def install(package):
    """Install ``package`` with pip into the interpreter running this kernel.

    Using ``sys.executable -m pip`` targets the kernel's own environment
    rather than whichever ``pip`` happens to be first on PATH.

    Args:
        package: Requirement string handed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Third-party libraries this notebook relies on, installed in the listed order.
for requirement in ("spacy", "nltk", "transformers", "torch", "stanza"):
    install(requirement)

In [2]:
import nltk
# Punkt tokenizer data; nltk.word_tokenize is used by nltk_ner later in the notebook.
nltk.download("punkt", quiet=True)

import spacy
from spacy.tokens import Span

import stanza
# Fetch the default English Stanza models; the Stanza pipeline built further
# down in the notebook loads them.
stanza.download("en")

from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 174MB/s]                     
2025-12-28 18:38:50 INFO: Downloaded file to C:\Users\gasse\stanza_resources\resources.json
2025-12-28 18:38:50 INFO: Downloading default packages for language: en (English) ...
2025-12-28 18:38:52 INFO: File exists: C:\Users\gasse\stanza_resources\en\default.zip
2025-12-28 18:38:55 INFO: Finished downloading models and saved to C:\Users\gasse\stanza_resources


In [None]:
# Make sure NLTK itself is installed in the kernel's environment (a no-op when
# an earlier cell already installed it).
subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk"])

# Data packages needed by word_tokenize, pos_tag and ne_chunk. Both the legacy
# names and the newer "_tab" / "_eng" variants are requested so the cell works
# across NLTK releases: recent versions look up "punkt_tab",
# "averaged_perceptron_tagger_eng" and "maxent_ne_chunker_tab" and raise
# LookupError when they are missing.
NLTK_RESOURCES = (
    "punkt",
    "punkt_tab",  # required by word_tokenize on recent NLTK; was not downloaded before
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "maxent_ne_chunker",
    "maxent_ne_chunker_tab",  # this one was missing in an earlier version of the cell
    "words",
)
for resource_name in NLTK_RESOURCES:
    nltk.download(resource_name, quiet=True)

from nltk import word_tokenize, pos_tag, ne_chunk

# SpaCy NER – Model Loading

In [4]:
# Load the small English pipeline. `pip install spacy` does not install the
# model package, and spacy.load raises OSError when it cannot find it, so on a
# fresh environment download the model once and retry.
try:
    nlp_spacy = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download as spacy_download

    spacy_download("en_core_web_sm")
    # NOTE: if this retry still fails, restart the kernel so the freshly
    # installed model package becomes importable, then run the cell again.
    nlp_spacy = spacy.load("en_core_web_sm")

# SpaCy Basic NER Example

In [5]:
# Run the pipeline on one sentence, then show its tokens and its entities.
text = "Apple to build a Hong Kong factory for $6 million"
doc = nlp_spacy(text)

# All tokens on a single line, each followed by " | ".
print("".join(f"{tok.text} | " for tok in doc), end="")

print("\n\nEntities:")
entity_rule = "-" * 40
for entity in doc.ents:
    label = entity.label_
    # Text, label and a human-readable description of the label ...
    print(entity.text, label, spacy.explain(label))
    # ... followed by the token span and the character span of the entity.
    print(entity.start, entity.end, entity.start_char, entity.end_char)
    print(entity_rule)

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

Entities:
Apple ORG Companies, agencies, institutions, etc.
0 1 0 5
----------------------------------------
Hong Kong GPE Countries, cities, states
4 6 17 26
----------------------------------------
$6 million MONEY Monetary values, including unit
8 11 39 49
----------------------------------------


# SpaCy Entity Visualization Function

In [6]:
def show_ents_spacy(doc):
    """Print the named entities of a processed spaCy document.

    Every entity is written as ``text - label - explanation`` followed by a
    rule of 30 dashes. When the document carries no entities, a single notice
    is printed instead.

    Args:
        doc: Processed document exposing an ``ents`` sequence whose items
            provide ``text`` and ``label_``.
    """
    entities = doc.ents
    if not entities:
        print("No named entities found.")
        return

    divider = "-" * 30
    for entity in entities:
        description = spacy.explain(entity.label_)
        print(f"{entity.text} - {entity.label_} - {description}")
        print(divider)

# Two sample sentences: "May" appears both as a modal verb and as a month.
for demo_sentence in (
    "May I go to Washington DC next May?",
    "Can I borrow 500 dollars from Microsoft?",
):
    show_ents_spacy(nlp_spacy(demo_sentence))

Washington DC - GPE - Countries, cities, states
------------------------------
next May - DATE - Absolute or relative dates or periods
------------------------------
500 dollars - MONEY - Monetary values, including unit
------------------------------
Microsoft - ORG - Companies, agencies, institutions, etc.
------------------------------


# Adding Custom Entity In SpaCy

In [7]:
doc = nlp_spacy("CPRO to build a U.K. factory for $6 million")

# Manually tag the first token ("CPRO") as an organisation.
ORG = doc.vocab.strings["ORG"]
new_ent = Span(doc, 0, 1, label=ORG)

# doc.ents rejects overlapping spans, so drop any model prediction that
# overlaps the manual span before adding it. Without this, the assignment
# raises ValueError whenever the loaded model happens to tag "CPRO" itself.
non_overlapping_ents = [
    ent
    for ent in doc.ents
    if ent.end <= new_ent.start or ent.start >= new_ent.end
]
doc.ents = non_overlapping_ents + [new_ent]

show_ents_spacy(doc)

CPRO - ORG - Companies, agencies, institutions, etc.
------------------------------
U.K. - GPE - Countries, cities, states
------------------------------
$6 million - MONEY - Monetary values, including unit
------------------------------


# SpaCy Noun Chunks

In [8]:
# List each noun chunk together with its head word and the head's dependency role.
chunk_sentence = "Autonomous cars shift insurance liability toward manufacturers"
doc = nlp_spacy(chunk_sentence)

chunk_rule = "-" * 40
for noun_chunk in doc.noun_chunks:
    head = noun_chunk.root
    print(noun_chunk.text, head.text, head.dep_, spacy.explain(head.dep_))
    print(chunk_rule)

Autonomous cars cars nsubj nominal subject
----------------------------------------
insurance liability liability dobj direct object
----------------------------------------
manufacturers manufacturers pobj object of preposition
----------------------------------------


# SpaCy Large Text NER

In [9]:
# A sentence for which the model reports no entities, so show_ents_spacy
# prints its "No named entities found." notice.
inflation_text = "Inflation affects economies in various positive and negative ways."
long_doc = nlp_spacy(inflation_text)
show_ents_spacy(long_doc)

No named entities found.


# SpaCy Arabic Text

In [10]:
# NOTE: nlp_spacy is the English "en_core_web_sm" pipeline, so whatever it
# reports for Arabic input should not be trusted; treat the result below as a
# demonstration of that limitation rather than as real Arabic NER.
# The sentence means "Mohamed went to Egypt and Lebanon".
doc_ar = nlp_spacy("ذهب محمد إلى مصر و لبنان")
show_ents_spacy(doc_ar)

ذهب محمد - ORG - Companies, agencies, institutions, etc.
------------------------------


# NLTK NER (POS + Chunking)

In [15]:
def nltk_ner(text):
    """Run NLTK's tokenize -> POS-tag -> NE-chunk chain on ``text``.

    Args:
        text: Raw sentence to analyse.

    Returns:
        The result of ``ne_chunk`` applied to the POS-tagged tokens (shown as
        a bracketed tree when printed in this notebook).
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return ne_chunk(tagged_tokens)

# Chunk a sample sentence with NLTK and print the resulting tree.
sample_tree = nltk_ner("Mohamed Salah was born in Hawaii")
print(sample_tree)

(S
  (PERSON Mohamed/NNP)
  (PERSON Salah/NNP)
  was/VBD
  born/VBN
  in/IN
  (GPE Hawaii/NNP))


# HUGGING FACE TRANSFORMERS NER

In [16]:
# Token-classification pipeline backed by the dslim/bert-base-NER checkpoint.
# With aggregation_strategy="simple" each result describes a grouped entity
# (keys used below: "word", "entity_group", "score").
ner_hf = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
)

hf_sentence = "Apple is looking at buying U.K. startup for $1 billion"
hf_result = ner_hf(hf_sentence)

for entity in hf_result:
    word, group, score = entity["word"], entity["entity_group"], entity["score"]
    print(word, group, score)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the chec

Apple ORG 0.9987307
U. K. LOC 0.99434024


# STANFORD NER (via Stanza)

In [17]:
# English Stanza pipeline limited to the tokenizer and NER processors.
nlp_stanza = stanza.Pipeline("en", processors="tokenize,ner")

stanza_sentence = "Google was founded in California by Larry Page"
doc = nlp_stanza(stanza_sentence)

# One line per entity: surface text followed by its type.
for entity in doc.ents:
    print(entity.text, entity.type)

2025-12-28 18:59:40 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 10.1MB/s]                    
2025-12-28 18:59:40 INFO: Downloaded file to C:\Users\gasse\stanza_resources\resources.json
2025-12-28 18:59:41 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

2025-12-28 18:59:41 INFO: Using device: cpu
2025-12-28 18:59:41 INFO: Loading: tokenize
2025-12-28 18:59:41 INFO: Loading: mwt
2025-12-28 18:59:41 INFO: Loading: ner
2025-12-28 18:59:45 INFO: Done loading processors!


Google ORG
California GPE
Larry Page PERSON


# Unified NER Interface

In [18]:
def run_all_ner(text):
    """Run ``text`` through all four NER back ends and print each result.

    The sections are printed in this order: spaCy (via ``show_ents_spacy``),
    NLTK (the value returned by ``nltk_ner``), Hugging Face (word and entity
    group of each result) and Stanza (text and type of each entity).

    Args:
        text: Sentence to analyse.
    """
    print("spaCy:")
    spacy_doc = nlp_spacy(text)
    show_ents_spacy(spacy_doc)

    print("\nNLTK:")
    nltk_tree = nltk_ner(text)
    print(nltk_tree)

    print("\nHugging Face:")
    for hf_entity in ner_hf(text):
        print(hf_entity["word"], hf_entity["entity_group"])

    print("\nStanford (Stanza):")
    for stanza_entity in nlp_stanza(text).ents:
        print(stanza_entity.text, stanza_entity.type)

# Final Test

In [19]:
# Compare all four NER back ends on the same sentence.
final_sentence = "Microsoft acquired a company in Egypt for 5 million dollars"
run_all_ner(final_sentence)

spaCy:
Microsoft - ORG - Companies, agencies, institutions, etc.
------------------------------
Egypt - GPE - Countries, cities, states
------------------------------
5 million dollars - MONEY - Monetary values, including unit
------------------------------

NLTK:
(S
  (PERSON Microsoft/NNP)
  acquired/VBD
  a/DT
  company/NN
  in/IN
  (GPE Egypt/NNP)
  for/IN
  5/CD
  million/CD
  dollars/NNS)

Hugging Face:
Microsoft ORG
Egypt LOC

Stanford (Stanza):
Microsoft ORG
Egypt GPE
5 million dollars MONEY
