# Basics

In [1]:
import spacy

In [2]:
# Sample sentence with a phone number written as "(555) 555-5555".
text = "This is a sample number (555) 555-5555."

In [3]:
# Start from a blank "en" pipeline; components are added explicitly below.
nlp = spacy.blank("en")

Let's try to extract the number from the text:

In [4]:
# Add the "entity_ruler" component and keep the returned object so that
# patterns can be registered on it.
ruler = nlp.add_pipe("entity_ruler")

In [5]:
# One rule for the entity ruler: a one-token pattern whose TEXT must match
# the regex "three digits, dash, four digits", labelled PHONE_NUMBER.
# The regex is a raw string so that `\d` is not parsed as an (invalid)
# Python string escape.
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [
            {"TEXT": {"REGEX": r"((\d){3}-(\d){4})"}},
        ],
    }
]

In [6]:
# Register the phone-number rule with the ruler.
ruler.add_patterns(patterns)

In [7]:
# Run the pipeline (including the ruler) over the sample sentence.
doc = nlp(text)

In [8]:
# List every entity found, one per line, as "<text> <label>".
for entity in doc.ents:
    label = entity.label_
    print(entity.text, label)

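This does not work: nothing is printed. A `REGEX` pattern is applied to a single token at a time, and the dash in the phone number causes the tokenizer to split it into several tokens, so no single token matches the whole pattern. Removing the punctuation from the number confirms this: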

In [27]:
# Same sentence, but the number is now one unbroken run of digits
# (no parentheses, space, or dash).
text = "This is a sample number 5555555."

In [28]:
# Rebuild a blank "en" pipeline so the ruler from the first attempt is not reused.
nlp = spacy.blank("en")

In [29]:
# Add a new "entity_ruler" to the fresh pipeline.
ruler = nlp.add_pipe("entity_ruler")

In [30]:
# One-token rule labelled PHONE_NUMBER whose TEXT must match "five digits".
# The regex is a raw string so that `\d` is not parsed as an (invalid)
# Python string escape.
# NOTE(review): the regex asks for five digits, yet the 7-digit token in the
# sample text is matched (see the cell output), so the regex is apparently
# applied as an unanchored search -- confirm against the spaCy docs.
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [
            {"TEXT": {"REGEX": r"((\d){5})"}},
        ],
    }
]

In [31]:
# Register the digits-only rule with the new ruler.
ruler.add_patterns(patterns)

In [32]:
# Run the rebuilt pipeline over the dash-free sentence.
doc = nlp(text)

In [33]:
# List every entity found, one per line, as "<text> <label>".
for entity in doc.ents:
    label = entity.label_
    print(entity.text, label)

5555555 PHONE_NUMBER


# Advanced

In [34]:
import re

In [35]:
# Sample text: two full names starting with "Paul" plus one bare "Paul".
text = (
    "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. "
    "The name Paul is quite common."
)

# Literal "Paul", a space, then an uppercase letter followed by one or more
# word characters -- so a bare "Paul" followed by a lowercase word is skipped.
pattern = r"Paul [A-Z]\w+"

# finditer yields one match object per non-overlapping hit.
matches = re.finditer(pattern, text)

for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


Putting the multi-word token NER into the spaCy pipeline:

In [37]:
from spacy.tokens import Span

In [50]:
# Tokenize the text with a blank pipeline and show the entities it starts
# with (none are printed for this doc).
nlp = spacy.blank("en")
doc = nlp(text)
original_ents = list(doc.ents)
print(doc.ents)

# Turn each regex hit (character offsets) into a token-level span and record
# its token boundaries plus its text.
mwt_ents = []
for match in re.finditer(pattern, doc.text):
    span = doc.char_span(*match.span())
    # char_span may return None (presumably when the character offsets do
    # not line up with token boundaries); such hits are skipped.
    if span is None:
        continue
    mwt_ents.append((span.start, span.end, span.text))

# Wrap every recorded hit in a PERSON span and attach the combined list to the doc.
for start, end, name in mwt_ents:
    original_ents.append(Span(doc, start, end, label="PERSON"))
doc.ents = original_ents

()


In [51]:
# Show the entities now attached to the doc.
print(doc.ents)

(Paul Newman, Paul Hollywood)


Now the NER is working as intended: both entities are attached to the doc, each with its label:

In [52]:
# Print each entity's text together with its label.
for entity in doc.ents:
    label = entity.label_
    print(entity.text, label)

Paul Newman PERSON
Paul Hollywood PERSON


In [46]:
# Each tuple holds (span.start, span.end, span.text): the boundaries are
# token indices of the span, not the character offsets reported by the regex.
print(mwt_ents)

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]


Let's do the same thing with a custom pipeline component:

In [65]:
from spacy.language import Language

In [66]:
@Language.component("paul_ner")
def paul_ner(doc):
    """Tag "Paul <Capitalised word>" mentions as PERSON entities.

    The regex is run over ``doc.text``; every hit whose character offsets
    can be mapped to a token span is labelled PERSON and merged with the
    entities already present on the doc.

    Args:
        doc: The spaCy ``Doc`` being processed by the pipeline.

    Returns:
        The same ``Doc`` with ``doc.ents`` updated.
    """
    pattern = r"Paul [A-Z]\w+"

    # Existing entities are listed first so they are kept alongside the new ones.
    candidate_ents = list(doc.ents)
    for match in re.finditer(pattern, doc.text):
        # char_span converts character offsets to a labelled token span; the
        # None check covers hits it cannot map to a span.
        span = doc.char_span(*match.span(), label="PERSON")
        if span is not None:
            candidate_ents.append(span)

    # Assigning overlapping spans to doc.ents raises an error, which would
    # happen whenever an earlier component already tagged part of a hit.
    # filter_spans drops overlaps (per the spaCy docs, preferring the longest
    # span) so the assignment is always valid.
    doc.ents = spacy.util.filter_spans(candidate_ents)
    return doc

In [67]:
# Second blank "en" pipeline, used to host the custom component.
nlp2 = spacy.blank("en")
# The component is added by the name it was registered under; the value
# returned by add_pipe is what the cell displays.
nlp2.add_pipe("paul_ner")

<function __main__.paul_ner(doc)>

In [68]:
# Run the custom-component pipeline over the same text and show its entities.
doc2 = nlp2(text)
print(doc2.ents)

(Paul Newman, Paul Hollywood)


In [69]:
# Report, per component, what it declares (assigns / requires / scores /
# retokenizes) and any problems detected in the pipeline.
nlp2.analyze_pipes()

{'summary': {'paul_ner': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'paul_ner': []},
 'attrs': {}}