<a href="https://colab.research.google.com/github/geersenthil/NLP-Learning/blob/main/Vocabulary_and_Matching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


1) Rule-Based Matching

In [1]:
import spacy
# Load the small English pipeline; the matcher below is built on its vocab.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Matcher is spaCy's rule-based, token-level matcher class.
from spacy.matcher import Matcher

# Bind the matcher to the vocab of the loaded pipeline.
matcher = Matcher(nlp.vocab)

In [3]:
# Token patterns: every dict describes one token of the match.
# pattern_1 -> "hello" directly followed by "world" (compared on lower-cased text)
pattern_1 = [
    {'LOWER': 'hello'},
    {'LOWER': 'world'},
]
# pattern_2 -> "hello", one punctuation token, then "world"
pattern_2 = [
    {'LOWER': 'hello'},
    {'IS_PUNCT': True},
    {'LOWER': 'world'},
]

In [4]:
# Register both patterns under one rule name.
# The patterns are passed as a single list: spaCy v3 requires this signature
# (the old `add(key, None, *patterns)` form raises an error there), and it is
# also accepted by spaCy >= 2.2.2.
matcher.add('Hello World', [pattern_1, pattern_2])

In [5]:
# Sample text containing quoted, upper-cased and hyphenated variants of the phrase.
doc = nlp(
    " 'Hello World' are the first two printed words for Hello WORLD most of "
    "the programmers, printing 'Hello-World' is most common for beginners"
)

In [7]:
# Calling the matcher on a doc returns a list of (match_id, start, end) tuples;
# start and end are used below as token indices into the doc.
find_matches = matcher(doc)
print(find_matches)

[(8585552006568828647, 2, 4), (8585552006568828647, 12, 14), (8585552006568828647, 21, 24)]


In [8]:
# Report every hit: hash id, rule name, token offsets and the matched text.
for match_id, start, end in find_matches:
    span = doc[start:end]                    # tokens covered by this match
    string_id = nlp.vocab.strings[match_id]  # look up the rule name for the hash
    print(f"{match_id} {string_id} {start} {end} {span.text}")

8585552006568828647 Hello World 2 4 Hello World
8585552006568828647 Hello World 12 14 Hello WORLD
8585552006568828647 Hello World 21 24 Hello-World


In [9]:
# Remove the 'Hello World' rule from the matcher (this drops the rule, not past results).
matcher.remove("Hello World")

In [10]:
# Redefine the patterns:
pattern_3 = [{'LOWER': 'hello'}, {'LOWER': 'world'}]
# 'OP': '*' lets the punctuation token occur zero or more times, so this
# pattern matches "hello world" with any amount of punctuation in between.
pattern_4 = [{'LOWER': 'hello'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'world'}]

# Patterns are passed as a single list: spaCy v3 requires this signature (the
# old `add(key, None, *patterns)` form raises an error there), and it is also
# accepted by spaCy >= 2.2.2.
matcher.add('Hello World', [pattern_3, pattern_4])

In [11]:
# Second sample: title-case, lower-case and hyphenated spellings.
doc_2 = nlp(
    "You can print Hello World or hello world or Hello-World"
)

In [12]:
# Run the matcher on the second doc; rebinds find_matches to the new result list.
find_matches = matcher(doc_2)
print(find_matches)

[(8585552006568828647, 3, 5), (8585552006568828647, 6, 8), (8585552006568828647, 9, 12)]


2) Phrase Matching

In [13]:
import spacy
# Rebind `nlp` to a freshly loaded small English pipeline for the phrase-matching section.
nlp = spacy.load("en_core_web_sm")

In [14]:
# PhraseMatcher matches whole Doc objects used as patterns.
from spacy.matcher import PhraseMatcher

# NOTE: this rebinds `matcher`, replacing the Matcher instance used earlier.
matcher = PhraseMatcher(nlp.vocab)

In [15]:
# Terminology to look for, as plain strings.
phrase_list = [
    "Barack Obama",
    "Angela Merkel",
    "Washington, D.C.",
]

In [16]:
# Convert each phrase to a Doc object.
# nlp.make_doc only tokenizes, which is all the PhraseMatcher needs when it
# matches on the default ORTH attribute; calling nlp(text) would run every
# pipeline component on each phrase for no benefit (spaCy warns about this).
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [17]:
# Display the list of pattern docs (a bare last expression is echoed by the notebook).
phrase_patterns

[Barack Obama, Angela Merkel, Washington, D.C.]

In [18]:
# Register all pattern docs under one rule name.
# The docs are passed as a single list: spaCy v3 requires this signature (the
# old `add(key, None, *docs)` form raises an error there), and it is also
# accepted by spaCy >= 2.2.2.
matcher.add("TerminologyList", phrase_patterns)

In [19]:
# Sample sentence that contains all three phrases from phrase_list.
doc_3 = nlp(
    "German Chancellor Angela Merkel and US President Barack Obama "
    "converse in the Oval Office inside the White House in Washington, D.C."
)

In [20]:
# Run the phrase matcher over the doc and keep the (match_id, start, end) hits.
find_matches = matcher(doc_3)
print(find_matches)

[(3766102292120407359, 2, 4), (3766102292120407359, 7, 9), (3766102292120407359, 19, 22)]


In [21]:
# Report every phrase hit: hash id, rule name, token offsets and the matched text.
for match_id, start, end in find_matches:
    span = doc_3[start:end]                  # tokens covered by this match
    string_id = nlp.vocab.strings[match_id]  # string representation of the match id
    print(f"{match_id} {string_id} {start} {end} {span.text}")

3766102292120407359 TerminologyList 2 4 Angela Merkel
3766102292120407359 TerminologyList 7 9 Barack Obama
3766102292120407359 TerminologyList 19 22 Washington, D.C.
