In [1]:
import spacy
from spacy import displacy

#### Dictionary-based method

In [13]:
# Load the small English pipeline used by the examples below.
nlp = spacy.load("en_core_web_sm")

# Lookup table: known place name -> entity label.
# "GPE" stands for Geo-Political Entity.
location_dict = {
    "Buenos Aires": "GPE",
    "Santiago": "GPE",
    "Caracas": "GPE",
    "La Paz": "GPE",
}

In [14]:
# Register the dictionary entries with the pipeline.
# Interning the names in nlp.vocab.strings only stores them in the StringStore
# and does not influence entity recognition, so a dedicated entity ruler is
# placed before the statistical "ner" component and given one phrase pattern
# per dictionary entry (name -> label).
if nlp.has_pipe("location_ruler"):
    # The cell has already been run against this pipeline: reuse the ruler
    # rather than failing on a duplicate component name.
    location_ruler = nlp.get_pipe("location_ruler")
else:
    # A distinct component name keeps the default "entity_ruler" name free.
    location_ruler = nlp.add_pipe("entity_ruler", name="location_ruler", before="ner")
    location_ruler.add_patterns(
        [
            {"label": entity_type, "pattern": name}
            for name, entity_type in location_dict.items()
        ]
    )

In [17]:
# Run the pipeline over a sample sentence and list every entity labelled GPE.
doc = nlp(
    "I was in Buenos Aires last month, but currently I live in Santiago "
    "and travel to La Paz and Caracas very frequently."
)
for ent in filter(lambda span: span.label_ == "GPE", doc.ents):
    print(ent.text)

Buenos Aires
Santiago
La Paz
Caracas


In [19]:
# Echo the pipeline object so the cell displays its repr.
nlp

<spacy.lang.en.English at 0x18d6da176d0>

# Echo the current value of `doc` so the cell displays it.
doc

#### Rule-based method

In [20]:
# Defining a function to identify product entities based on keywords
def identify_products(doc, keywords=("iPhone", "iPad", "MacBook")):
    """Yield the tokens of ``doc`` whose text exactly matches a product keyword.

    Args:
        doc: Any iterable of token-like objects exposing a ``text`` attribute.
        keywords: Iterable of product names to look for. Matching is exact and
            case-sensitive. Defaults to the three product names this function
            originally hard-coded.

    Yields:
        Each matching token object, in document order.
    """
    # Build the lookup once instead of re-creating a list for every token.
    wanted = frozenset(keywords)
    for token in doc:
        if token.text in wanted:
            yield token

In [21]:
# Adding the product entity ruler to the NER pipeline.
# The single pattern labels as PRODUCT any one token whose exact text is one
# of the listed product names.
patterns = [{"label": "PRODUCT", "pattern": [{"TEXT": {"IN": ["iPhone", "iPad", "MacBook"]}}]}]
if nlp.has_pipe("entity_ruler"):
    # Re-running this cell: add_pipe would raise because the component name is
    # already taken, so reuse the existing ruler. It is assumed to be the one
    # this cell created, with the patterns already added on the first run.
    ruler = nlp.get_pipe("entity_ruler")
else:
    # Placed before "ner" so the ruler sets its entities ahead of the statistical model.
    ruler = nlp.add_pipe("entity_ruler", before="ner")
    ruler.add_patterns(patterns)

In [23]:
# Run the pipeline over a sample sentence and list every entity labelled PRODUCT.
doc = nlp(
    "I use to have an iPad, but recently I bought an iPhone and a brand new MacBook."
)
for ent in doc.ents:
    if ent.label_ != "PRODUCT":
        continue
    print(ent.text)

iPad
iPhone
MacBook


In [2]:
# Load the small English pipeline, downloading it first only when it is not
# already installed. This avoids a network download on every run of the cell.
MODEL_NAME = "en_core_web_sm"
if not spacy.util.is_package(MODEL_NAME):
    spacy.cli.download(MODEL_NAME)
NER = spacy.load(MODEL_NAME)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Sample passage for the model-based example.
# Note: `doc` is a plain string here, not a processed spaCy document.
doc = (
    "Musk attended Waterkloof House Preparatory School, Bryanston High School, "
    "and Pretoria Boys High School, from which he graduated. "
    "Musk applied for a Canadian passport through his Canadian-born mother, "
    "knowing that it would be easier to immigrate to the United States this way. "
    "While waiting for his application to be processed, "
    "he attended the University of Pretoria for five months."
)

In [4]:
def spacy_ner(document):
    """Return the distinct (entity text, entity label) pairs found in ``document``.

    The text is run through the module-level ``NER`` pipeline. Each entity's
    text is stripped of surrounding whitespace, and the pairs are collected in
    a set, so duplicates collapse and no ordering is kept.
    """
    found = set()
    for span in NER(document).ents:
        found.add((span.text.strip(), span.label_))
    return found

In [5]:
# Extract the (text, label) entity pairs from the sample passage held in `doc`.
spacy_ner(doc)

{('Bryanston High School', 'ORG'),
 ('Canadian', 'NORP'),
 ('Pretoria', 'GPE'),
 ('Waterkloof House Preparatory School', 'ORG'),
 ('five months', 'DATE'),
 ('the United States', 'GPE'),
 ('the University of Pretoria', 'ORG')}