In [1]:
import re

In [2]:
pattern = re.compile(r"((\d){1,2} (January|February|March|April|December))")
sentence = "I was born 7 December and started going for important matters in 17 March"
pattern.findall(sentence)

[('7 December', '7', 'December'), ('17 March', '7', 'March')]

In [3]:
text = '''
May
01/05/2022 Sun | + Event
______
02/05/2022 Mon |
03/05/2022 Tue |
04/05/2022 Wed | + 1200 May the force be with you
+ 1230 More events added on new lines to improve readability
+ 1300 May the force stay with you
+ 1800 Hopefully it is still there
05/05/2022 Thu |
06/05/2022 Fri | + 0600 Is the force there?
.
.
.
etc.
#repeatingYearly
04/05 - 1200 May the force be with you (this is an event (like a birthday) which repeats every year)
'''

In [4]:
pattern2 = re.compile(r'\d{2}/\d{2}/\d{4}\s[a-zA-Z]+')

pattern2.findall(text)

['01/05/2022 Sun',
 '02/05/2022 Mon',
 '03/05/2022 Tue',
 '04/05/2022 Wed',
 '05/05/2022 Thu',
 '06/05/2022 Fri']

In [5]:
for date in pattern2.findall(text):
    print(date)

01/05/2022 Sun
02/05/2022 Mon
03/05/2022 Tue
04/05/2022 Wed
05/05/2022 Thu
06/05/2022 Fri


In [6]:
import spacy

In [7]:
#Sample text
text = "This is a sample number 555-5555."

In [8]:
#Build upon the spaCy Small Model
nlp = spacy.blank("en")

In [9]:
#Create the Ruler and Add it
ruler = nlp.add_pipe("entity_ruler")

In [10]:
#List of Entities and Patterns (source: https://spacy.io/usage/rule-based-matching)
patterns = [
    {"label": "PHONE_NUMBER", "pattern": [
        {"SHAPE": "ddd"},
        {"ORTH": '-'},
        {"SHAPE": 'dddd'}
    ]}
]

In [11]:
#add patterns to ruler
ruler.add_patterns(patterns)

In [12]:
# create the doc
doc = nlp(text)

In [13]:
for ent in doc.ents:
    print(ent.text, ent.label_)

555-5555 PHONE_NUMBER


## How to Use RegEx in spaCy

In [14]:
text = "This is a simple phone number 55555 my email is brchris2001@yahoo.com"
patterns = [
    {"label": "PHONE_NUMBER", "pattern": [{"TEXT": {"REGEX": "((\d){5})"}}]},
    {"label": "EMAIL_ADDRESS", "pattern": [{"TEXT": {"REGEX": "[a-z0-9]+@[a-z]+\.[a-z]"}}]}
]
# Add patterns to ruler
ruler.add_patterns(patterns)

In [15]:
# Create the doc
doc2 = nlp(text)

# extract entities
for ent in doc2.ents:
    print(ent.text, ent.label_)

55555 PHONE_NUMBER
brchris2001@yahoo.com EMAIL_ADDRESS


### Using RegEX with spacy (Advanced)

In [16]:
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

pattern = r"Paul [A-Z]\w+"
matches = re.finditer(pattern, text)

for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [17]:
##

In [18]:
from spacy.tokens import Span

In [19]:
nlp = spacy.blank("en")
doc = nlp(text)
# print(doc.ents)
original_ents = list(doc.ents)
mwt_ents = [] #mwt --> multiword token

for match in re.finditer(pattern, doc.text):
    start, end = match.span()
    span = doc.char_span(start, end)
    
    if span is not None:
        mwt_ents.append((span.start, span.end, span.text))

for ent in mwt_ents:
    start, end, name = ent
    per_ent = Span(doc, start, end, label = "PERSON")
    original_ents.append(per_ent)
doc.ents = original_ents


for ent in doc.ents:
    print(ent.text, ent.label_)

Paul Newman PERSON
Paul Hollywood PERSON


In [20]:
mwt_ents

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]

### Creating a custom component

In [34]:
from spacy.language import Language

@Language.component("paul_ner")

def paul_ner(doc):
    pattern = r"Paul [A-Z]\w+"
    original_ents = list(doc.ents)
    mwt_ents = [] #mwt --> multiword token

    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        span = doc.char_span(start, end)

        if span is not None:
            mwt_ents.append((span.start, span.end, span.text))

    for ent in mwt_ents:
        start, end, name = ent
        per_ent = Span(doc, start, end, label = "PERSON")
        original_ents.append(per_ent)
    
    doc.ents = original_ents
    
    return (doc)

In [35]:
nlp2 = spacy.blank("en")
nlp2.add_pipe("paul_ner")

<function __main__.paul_ner(doc)>

In [33]:
nlp2.analyze_pipes()

{'summary': {'paul_ner': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'paul_ner': []},
 'attrs': {}}

In [24]:
doc2 = nlp2(text)
print(doc.ents)

(Paul Newman, Paul Hollywood)


In [25]:
@Language.component("cinema_ner")
from spacy.util import filter_spans

def cinema_ner(doc):
    pattern = r"Hollywood"
    original_ents = list(doc.ents)
    mwt_ents = [] #mwt --> multiword token

    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        span = doc.char_span(start, end)

        if span is not None:
            mwt_ents.append((span.start, span.end, span.text))

    for ent in mwt_ents:
        start, end, name = ent
        per_ent = Span(doc, start, end, label = "CINEMA")
        original_ents.append(per_ent)
        
    filtered = filter_spans(original_ents)
    doc.ents = filtered
    
    return (doc)

In [27]:
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")

<function __main__.cinema_ner(doc)>

In [30]:
doc3 = nlp3(text)

ValueError: [E1010] Unable to set entity information for token 9 which is included in more than one span in entities, blocked, missing or outside.

In [29]:
nlp3.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'cinema_ner': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  