# Vocabulary Matching

In [None]:
import spacy

In [None]:
# Load the "en_core_web_sm" pipeline once; `nlp` is reused by the cells below.
nlp = spacy.load("en_core_web_sm")

In [None]:
from spacy.matcher import Matcher

# Token-pattern matcher built on the loaded pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [None]:
# Three token patterns covering the spellings of the term that the sample
# text uses. Each dict describes one token.
# One token: "solarpower"
pattern1 = [{"LOWER": "solarpower"}]
# Two adjacent tokens: "solar" followed by "power"
pattern2 = [{"LOWER": word} for word in ("solar", "power")]
# "solar", exactly one punctuation token, then "power"
pattern3 = [
    {"LOWER": "solar"},
    {"IS_PUNCT": True},
    {"LOWER": "power"},
]

In [None]:
# Register all three patterns under a single match key, 'SolarPower'.
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

In [None]:
# Sample text containing the spaced, fused and hyphenated spellings.
doc = nlp("The Solar Power industry continues to grow a solarpower increase. Solar-power is amazing.")

In [None]:
# Run the matcher over the doc; each result unpacks as (match_id, start, end).
found_matches = matcher(doc)

In [None]:
# Report each hit on one line: the numeric match ID, the name it maps to in
# the vocab's string store, the start/end indices into `doc`, and the text.
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(f"{match_id} {string_id} {start} {end} {span.text}")

In [None]:
# Drop the 'SolarPower' rule so the key can be re-added with revised patterns.
matcher.remove('SolarPower')

In [None]:
# Revised pattern set for the 'SolarPower' key.
# Single token whose lowercase form is "solarpower" (solarpower, SolarPower).
pattern1 = [{'LOWER': 'solarpower'}]
# "solar" and "power" with zero or more punctuation tokens in between
# ('OP': '*' makes the punctuation token optional and repeatable).
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'power'},
]

In [None]:
# Register the two revised patterns under the 'SolarPower' key.
matcher.add('SolarPower', [pattern1, pattern2])

In [None]:
# Sample text with a doubled hyphen between the words and the fused spelling.
doc2 = nlp("Solar--power is solarpower yay!")

In [None]:
# Match against doc2; this rebinds found_matches from the earlier run.
found_matches = matcher(doc2)

In [None]:
# Show the raw match results for doc2.
print(found_matches)

# Phrase Matching

In [79]:
from spacy.matcher import PhraseMatcher

In [80]:
# NOTE: this rebinds `matcher`, replacing the token-pattern Matcher created
# earlier with a PhraseMatcher over the same vocabulary.
matcher = PhraseMatcher(nlp.vocab)

In [None]:
# Read the whole text file and run it through the pipeline. The encoding is
# passed explicitly so decoding does not depend on the platform default.
# NOTE(review): assumes the file is UTF-8 encoded; confirm against the data.
with open('../data/reaganomics.txt', encoding='utf-8') as f:
    doc3 = nlp(f.read())

In [82]:
# Phrases to search for. Each phrase is listed once; the original list
# repeated 'trickle-down economics', which added a redundant pattern.
phrase_list = [
    'voodoo economics',
    'supply-side economics',
    'trickle-down economics',
    'free-market economics',
]

In [83]:
# Turn each phrase into a Doc for use as a PhraseMatcher pattern. The matcher
# is created without an `attr` argument, so it compares token text only;
# nlp.make_doc runs just the tokenizer and skips the rest of the pipeline.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [84]:
# Sanity check: inspect the type of the first pattern object.
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [85]:
# Register every phrase Doc under the 'EconMatcher' key. The list is passed
# as-is; unpacking it into a new list first was an unnecessary copy.
matcher.add('EconMatcher', phrase_patterns)

In [86]:
# Search doc3 for every phrase registered with the matcher.
found_matches = matcher(doc3)

In [87]:
# Display the raw (match_id, start, end) tuples.
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2987, 2991)]

In [88]:
# Print one line per hit: the numeric match ID, its name from the vocab's
# string store, the start/end indices into `doc3`, and the matched text.
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc3[start:end]
    print(f"{match_id} {string_id} {start} {end} {span.text}")

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2987 2991 trickle-down economics
