In [1]:
import spacy

### Rule-Based Matching

### Token-Based Matching

In [4]:
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [5]:
# Load the 'en_core_web_sm' pipeline once; `nlp` is reused by the later cells
# both to build docs and to supply `nlp.vocab` to the Matcher.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Run the pipeline on a short sample; the next cells inspect the resulting tokens.
doc = nlp('Hello World!')

In [7]:
doc

Hello World!

In [8]:
# Print each token's text on its own line; the output shows that '!' is a
# separate token from 'World'.
for token in doc:
    print(token.text)

Hello
World
!


In [31]:
# Token pattern, one dict per token slot:
#   1. optional token whose lower-cased text is "hello"
#   2. optional punctuation token
#   3. required token whose lower-cased text is "world"
pattern = [
    {"LOWER": "hello", "OP": "?"},
    {"IS_PUNCT": True, "OP": "?"},
    {"LOWER": "world"},
]

In [32]:
# Build a matcher bound to the pipeline's vocabulary and register the pattern
# under the key 'HelloWorld'.
# The patterns are passed as a list (signature supported since spaCy 2.2.2 and
# the only one accepted by spaCy v3); the legacy positional form
# `add(key, on_match, *patterns)` fails on v3.
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', [pattern])

In [33]:
# Re-parse with a comma added so the pattern's optional punctuation slot has a
# token to match. Note: this rebinds `doc`.
doc = nlp("Hello, World!")

In [34]:
# Calling the matcher on a doc returns the matches; the cells below show and
# unpack them as (match_id, start, end) tuples.
matches = matcher(doc)

In [35]:
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [36]:
# List the tokens so the start/end indices in `matches` can be read against
# token positions (the comma is its own token at index 1).
for token in doc:
    print(token)

Hello
,
World
!


In [37]:
# Report every match on one line: the numeric id, the rule name that id
# resolves to in the vocab's string store, the token bounds, and the matched text.
for match in matches:
    match_id, start, end = match
    span = doc[start:end]
    string_id = nlp.vocab.strings[match_id]
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 3 Hello, World
15578876784678163569 HelloWorld 1 3 , World
15578876784678163569 HelloWorld 2 3 World


In [39]:
# Sample sentence containing a 4-digit and a 10-digit number for the regex
# examples that follow.
text = (
    "My phone number is 1234. "
    "Ohh its wrong! "
    "Correct one is 1234567890. "
    "Give me a call"
)

In [40]:
import re

In [41]:
# `\d{10}` = ten digits in a row. re.search returns only the first hit, here
# the corrected number at span (55, 65).
re.search(r'\d{10}', text)

<re.Match object; span=(55, 65), match='1234567890'>

In [42]:
# `\d{4}` = four digits in a row. The first occurrence is the wrong number
# '1234' at span (19, 23).
re.search(r'\d{4}', text)

<re.Match object; span=(19, 23), match='1234'>

In [49]:
# `\d{4,10}` = between four and ten digits. findall collects every
# non-overlapping hit, so both numbers come back.
re.findall(r'\d{4,10}', text)

['1234', '1234567890']

In [51]:
# `\w{4,}` = four or more word characters (letters, digits, underscore), i.e.
# every word of length >= 4 plus both numbers.
re.findall(r'\w{4,}', text)

['phone', 'number', '1234', 'wrong', 'Correct', '1234567890', 'Give', 'call']

#### Wildcard Text

In [58]:
# Each `.` matches any single character except a newline, so `p....` is a 'p'
# followed by any four characters.
re.findall(r'p....', text)

['phone']

In [60]:
# Sample sentence with several three-character sequences centred on 'a', used
# by the wildcard example in the next cell.
text1 = "This is cat but not that. i want hat and cat both"

In [62]:
# `.a.` = any character, then 'a', then any character. Matches are substrings,
# not whole words, hence results such as 'wan' and ' an'.
re.findall(r'.a.', text1)

['cat', 'hat', 'wan', 'hat', ' an', 'cat']

In [63]:
# Sample whose last character is a digit, used for the end-of-string anchor example.
text3 = "Hi, Thanks for watching <3"

In [66]:
# `$` anchors the match to the end of the string, so `\d$` only finds a digit
# in the final position.
re.findall(r'\d$', text3)

['3']

In [68]:
# `^` anchors the match to the start of the string. `text` begins with 'M',
# not a digit, so the result is empty.
re.findall(r'^\d', text)

[]

In [69]:
# Same sentence as text3 but starting with a digit, for the start-anchor example.
text4 = "3 Hi, Thanks for watching <3"

In [71]:
# Same start-anchored pattern, now against a string whose first character is a digit.
re.findall(r'^\d', text4)

['3']

### Exclusion

In [72]:
text4

'3 Hi, Thanks for watching <3'

In [75]:
# Inside brackets a leading `^` negates the class: `[^\d]+` is a run of one or
# more non-digit characters.
re.findall(r'[^\d]+', text4)

[' Hi, Thanks for watching <']

In [81]:
# `\W` means "non-word character", so the negated class `[^\W]+` matches runs
# of word characters (the same set as `\w+`).
re.findall(r'[^\W]+', text4)

['3', 'Hi', 'Thanks', 'for', 'watching', '3']

In [82]:
# Double negative: `[^\D]+` is "not a non-digit", i.e. runs of digits (the same
# set as `\d+`).
re.findall(r'[^\D]+', text4)

['3', '3']

In [87]:
# Sample containing two hyphenated words for the next example.
text5 = "You can get free-cookies in your-school"

In [88]:
# One or more word characters, a literal hyphen, then one or more word
# characters: picks out hyphenated words. The brackets around `\w` are
# redundant here (`\w+-\w+` matches the same text) but harmless.
re.findall(r'[\w]+-[\w]+', text5)

['free-cookies', 'your-school']

In [99]:
# Sample with four mentions of "Google", two of them followed by "I/O".
# Note: this rebinds `text` from the earlier regex examples.
text = (
    "Google announced a new Pixel at Google I/O "
    "Google I/O is a great place to get all updates from Google."
)

In [100]:
text

'Google announced a new Pixel at Google I/O Google I/O is a great place to get all updates from Google.'

In [101]:
# Four consecutive tokens with exact text: "Google", "I", "/", "O".
pattern = [
    {'TEXT': 'Google'},
    {'TEXT': 'I'},
    {'TEXT': '/'},
    {'TEXT': 'O'},
]

In [102]:
pattern

[{'TEXT': 'Google'}, {'TEXT': 'I'}, {'TEXT': '/'}, {'TEXT': 'O'}]

In [103]:
def callback_method(matcher, doc, i, matches):
    """Match callback: print the text of the ``i``-th match.

    ``matches[i]`` is unpacked as ``(match_id, start, end)`` and the text of
    ``doc[start:end]`` is printed. ``matcher`` is accepted but not used.
    Returns ``None``.
    """
    _, first, stop = matches[i]
    print(doc[first:stop].text)

In [104]:
# Fresh matcher with the 'Google' rule; `callback_method` is invoked for each match.
# The patterns are passed as a list and the callback via the `on_match` keyword
# (signature supported since spaCy 2.2.2 and the only one accepted by spaCy v3);
# the legacy positional form `add(key, on_match, *patterns)` fails on v3.
matcher = Matcher(nlp.vocab)
matcher.add('Google', [pattern], on_match=callback_method)

In [105]:
# Process the Google I/O sentence; `doc` is what the matcher runs over in the next cell.
doc = nlp(text)

In [106]:
# Running the matcher fires `callback_method` once per match (the printed
# lines); the cell then displays the returned list of (match_id, start, end) tuples.
matcher(doc)

Google I/O
Google I/O


[(11578853341595296054, 6, 10), (11578853341595296054, 10, 14)]

### Find the word "Google"

In [108]:
# Same sentence as the earlier Google I/O example, repeated so that this cell
# can run on its own.
text = (
    "Google announced a new Pixel at Google I/O "
    "Google I/O is a great place to get all updates from Google."
)

#text

# "Google" is required; each of the following tokens "I", "/", "O" is optional,
# so a bare "Google" also matches.
pattern = [
    {'TEXT': 'Google'},
    {'TEXT': 'I', 'OP': '?'},
    {'TEXT': '/', 'OP': '?'},
    {'TEXT': 'O', 'OP': '?'},
]

#pattern

def callback_method(matcher, doc, i, matches):
    """Match callback: print the text of the ``i``-th match.

    Re-declared in this cell so it can run on its own; behaves the same as the
    earlier ``callback_method`` cell. ``matches[i]`` is unpacked as
    ``(match_id, start, end)`` and the text of ``doc[start:end]`` is printed.
    ``matcher`` is accepted but not used. Returns ``None``.
    """
    _, first, stop = matches[i]
    print(doc[first:stop].text)

# Fresh matcher with the optional-token 'Google' rule; `callback_method` is
# invoked for each match.
# The patterns are passed as a list and the callback via the `on_match` keyword
# (signature supported since spaCy 2.2.2 and the only one accepted by spaCy v3);
# the legacy positional form `add(key, on_match, *patterns)` fails on v3.
matcher = Matcher(nlp.vocab)
matcher.add('Google', [pattern], on_match=callback_method)

# Process the sentence; `doc` is what the matcher runs over below.
doc = nlp(text)

# Because the trailing tokens are optional, every prefix ('Google', 'Google I',
# 'Google I/', 'Google I/O') is reported as a separate match, as the printed
# callback output and the returned list show.
matcher(doc)

Google
Google
Google I
Google I/
Google I/O
Google
Google I
Google I/
Google I/O
Google


[(11578853341595296054, 0, 1),
 (11578853341595296054, 6, 7),
 (11578853341595296054, 6, 8),
 (11578853341595296054, 6, 9),
 (11578853341595296054, 6, 10),
 (11578853341595296054, 10, 11),
 (11578853341595296054, 10, 12),
 (11578853341595296054, 10, 13),
 (11578853341595296054, 10, 14),
 (11578853341595296054, 23, 24)]