In [1]:
import spacy

In [2]:
# Load the `en_core_web_sm` pipeline and run it over a two-sentence sample;
# `nlp` and `doc` are reused by the cells that follow.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Britain is a place. Mary is a doctor.")

In [3]:
# Show each recognised entity's text next to its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

Britain GPE
Mary PERSON


In [4]:
from spacy.language import Language

In [8]:
@Language.component("remove_gpe")
def remove_gpe(doc):
    """Pipeline component that strips every GPE-labelled entity from a Doc.

    Args:
        doc: The spaCy Doc passed down the pipeline.

    Returns:
        The same Doc, with `doc.ents` reduced to the entities whose label
        is not "GPE".
    """
    doc.ents = [entity for entity in doc.ents if entity.label_ != "GPE"]
    return doc

In [9]:
# Attach the component registered above as "remove_gpe" to the loaded pipeline.
# NOTE(review): re-running this cell on the same `nlp` presumably fails because
# the name is already in the pipeline — confirm, or restart the kernel first.
nlp.add_pipe("remove_gpe")

<function __main__.remove_gpe(doc)>

In [10]:
# Re-process the sample now that remove_gpe is part of the pipeline and list
# whatever entities are left.
doc = nlp("Britain is a place. Mary is a doctor.")
for ent in doc.ents:
    print(ent.text, ent.label_)

Mary PERSON


In [11]:
# Write the modified pipeline to a directory given as a relative path.
# NOTE(review): loading it back presumably requires `remove_gpe` to be
# registered in the loading process first — confirm against the spaCy docs.
nlp.to_disk("data/new_en_core_web_sm")

In [12]:
import re

In [13]:
# Sample sentence with two "Paul <Surname>" names; used by the regex cells below.
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host."

In [14]:
# Literal "Paul ", then one uppercase ASCII letter, then one or more word
# characters — i.e. "Paul" followed by a capitalised word.
pattern = r"Paul [A-Z]\w+"

In [15]:
# Scan the sample text lazily and print each Match object, which shows the
# character span and the matched substring.
matches = re.finditer(pattern, text)
for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [16]:
import spacy
from spacy.tokens import Span

In [23]:
# Start from a blank English pipeline (tokenizer only, so no entities yet).
nlp = spacy.blank("en")
doc = nlp(text)

# Turn every regex hit into token offsets; hits whose character offsets do not
# map onto a span (char_span returns None) are skipped.
mwt_ents = []
for match in re.finditer(pattern, doc.text):
    span = doc.char_span(*match.span())
    if span is None:
        continue
    mwt_ents.append((span.start, span.end, span.text))

# Append one PERSON span per hit to whatever entities the Doc already had.
original_ents = list(doc.ents)
original_ents.extend(
    Span(doc, tok_start, tok_end, label="PERSON")
    for tok_start, tok_end, _ in mwt_ents
)
doc.ents = original_ents
print(doc.ents)

for ent in doc.ents:
    print(ent.text, ent.label_)

(Paul Newman, Paul Hollywood)
Paul Newman PERSON
Paul Hollywood PERSON


In [27]:
from spacy.language import Language

@Language.component("paul_ner")
def paul_ner(doc):
    """Tag every regex hit for "Paul <Capitalised word>" as a PERSON entity.

    Args:
        doc: The spaCy Doc passed down the pipeline.

    Returns:
        The same Doc, with the matched spans appended to its existing
        entities.
    """
    # Fix: the regex used to be stored in an unused local (`patterns`) while
    # the search read the notebook-level `pattern`, so the component only
    # worked when that global happened to exist with the same value.
    pattern = r"Paul [A-Z]\w+"
    ents = list(doc.ents)
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        # char_span yields None when the offsets do not map onto a span;
        # those hits are skipped. Passing the label here avoids rebuilding
        # the Span from token offsets afterwards.
        span = doc.char_span(start, end, label="PERSON")
        if span is not None:
            ents.append(span)
    doc.ents = ents
    return doc

In [28]:
# Build a second blank English pipeline whose only added component is the
# regex-based paul_ner defined above.
nlp2 = spacy.blank("en")
nlp2.add_pipe("paul_ner")

<function __main__.paul_ner(doc)>

In [30]:
# Run the sample sentence through nlp2 and show the entities paul_ner set.
doc2 = nlp2(text)
print(doc2.ents)

(Paul Newman, Paul Hollywood)


In [None]:

from spacy.language import Language

@Language.component("cinema_ner")
def cinema_ner(doc):
    """Tag every occurrence of "Hollywood" in the Doc text as an entity.

    Args:
        doc: The spaCy Doc passed down the pipeline.

    Returns:
        The same Doc, with the matched spans appended to its existing
        entities.
    """
    # Fix: the regex used to be stored in an unused local (`patterns`) while
    # the search read the notebook-level `pattern` ("Paul ..."), so this
    # component never actually looked for "Hollywood".
    pattern = r"Hollywood"
    ents = list(doc.ents)
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        # NOTE(review): the "PERSON" label is kept from the original, but it
        # looks like a copy-paste leftover for a cinema component — confirm
        # the intended label.
        # char_span yields None when the offsets do not map onto a span;
        # those hits are skipped.
        span = doc.char_span(start, end, label="PERSON")
        if span is not None:
            ents.append(span)
    # NOTE(review): spaCy is expected to reject overlapping entity spans on
    # assignment; if this runs after paul_ner on text such as
    # "Paul Hollywood", the spans would overlap — confirm the intended
    # pipeline order or filter the spans first.
    doc.ents = ents
    return doc