In [None]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher

# Load the small English pipeline once and parse the sample sentence.
# `nlp` and `doc` are reused by the cells that follow.
nlp = spacy.load("en_core_web_sm")
text = "Good morning! I hope you have a good evening. Bill Gates and John Smith are here."
doc = nlp(text)

# Token-based Matcher, built on the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# One dict per token: "good" followed by "morning", both compared on the
# LOWER attribute.
morning_pattern = [{"LOWER": "good"}, {"LOWER": "morning"}]

# add() takes a key and a list of patterns.
matcher.add("MORNING_GREETING", [morning_pattern])

# Calling the matcher on a doc returns a list of (match_id, start, end) tuples.
matches = matcher(doc)

# doc[start:end] is a Span (a slice of the doc); .text gives its string form.
for _match_id, start, end in matches:
    print(doc[start:end].text)



Good morning

## Extended Matcher


In [None]:
# A second Matcher for a broader greeting pattern; it reuses the `nlp`
# pipeline and the `doc` created in the first cell.
matcher_extended = Matcher(nlp.vocab)

# "good" followed by either "morning" or "evening". The IN operator accepts
# any of the listed values for that token's LOWER attribute.
greeting_pattern = [
    {"LOWER": "good"},
    {"LOWER": {"IN": ["morning", "evening"]}},
]
matcher_extended.add("GREETINGS", [greeting_pattern])
extended_matches = matcher_extended(doc)

# Each match is (match_id, start, end); slice the doc to recover the text.
for _match_id, start, end in extended_matches:
    print(doc[start:end].text)


Good morning
good evening


## Phrase Matcher

In [21]:
import spacy
from spacy.matcher import PhraseMatcher

# Reuse the `nlp` pipeline that produced `doc` instead of loading a second
# copy: a matcher's vocab must be shared with the documents it is run on,
# and a freshly loaded pipeline carries its own Vocab.
phrase_matcher = PhraseMatcher(nlp.vocab)

# Phrases to look for.
people_list = ["Bill Gates", "John Smith", "Steve Jobs"]

# PhraseMatcher patterns are Doc objects. make_doc() only tokenizes, whereas
# nlp() runs the full pipeline, so make_doc() is the cheaper way to build them.
patterns = [nlp.make_doc(person) for person in people_list]

# Register all phrases under a single key.
phrase_matcher.add("PEOPLE", patterns)

# Match against the sample document from the first cell.
phrase_matches = phrase_matcher(doc)

for _match_id, start, end in phrase_matches:
    print(doc[start:end].text)

Bill Gates
John Smith


In [23]:
# PhraseMatcher with the LOWER attribute: phrases are compared on their
# lowercased form, so matching is case-insensitive.
phrase_matcher_extended = PhraseMatcher(nlp.vocab, attr="LOWER")

# Build the patterns from the lowercased names.
lower_patterns = [nlp.make_doc(person.lower()) for person in people_list]
phrase_matcher_extended.add("PEOPLE_LOWER", lower_patterns)

# Test sentence with mixed casing. It gets its own names (`mixed_doc`,
# `mixed_matches`) so the notebook-wide `doc` / `matches` that earlier cells
# rely on are not overwritten when this cell runs.
mixed_text = "I met bill gates and JOHN SMITH yesterday."
mixed_doc = nlp(mixed_text)
mixed_matches = phrase_matcher_extended(mixed_doc)

for _match_id, start, end in mixed_matches:
    print(mixed_doc[start:end].text)


bill gates
JOHN SMITH


In [None]:
# Phrase matching with Shape
import spacy
from spacy.matcher import PhraseMatcher

# Reuse the already-loaded `nlp` pipeline; loading the model again is not needed.
# With attr="SHAPE" the matcher compares token shapes rather than exact text.
phrase_matcher_shape = PhraseMatcher(nlp.vocab, attr="SHAPE")

# Example addresses whose shapes serve as the patterns.
ip_examples = ["192.168.1.1", "10.0.0.1"]
ip_patterns = [nlp.make_doc(ip) for ip in ip_examples]
phrase_matcher_shape.add("IP_ADDRESS", ip_patterns)

# Test sentence. Cell-specific names (`ip_text`, `ip_doc`, `ip_matches`) keep
# this cell from overwriting the `text` / `doc` / `matches` used by earlier cells.
ip_text = "Server IPs are 172.160.0.1 and 203.45.67.89 and 192.168.1.1"
ip_doc = nlp(ip_text)
ip_matches = phrase_matcher_shape(ip_doc)

for _match_id, start, end in ip_matches:
    print(ip_doc[start:end].text)

# Expected output:
#     172.160.0.1
#     192.168.1.1
# 203.45.67.89 is not matched: its digit grouping (3.2.2.2) matches neither
# example (3.3.1.1 and 2.1.1.1).


172.160.0.1
192.168.1.1
