In [21]:
"""
Stemming: the process whereby words are etymologically reduced to a root.
spaCy intentionally doesn't include a stemmer, instead relying entirely on lemmatization,
so we use nltk for stemming.
"""
import nltk
words=['swam','swimmer','swims','swim',
       'easily','easy','quickly','fairly', 'fairness',
       'caresses','cares','ponies',
       'relational','national','unconventional',
       'generous','generously','generate','generosity','general','generation']

In [22]:
#Porter's Algorithm (1980): common and effective stemming tool which uses 5-phase word reduction.
#1: suffix shortening, e.g. caresses->caress NOT cares, relational->relate NOT national->nate
from nltk.stem.porter import PorterStemmer
p_s=PorterStemmer()
for word in words:
    print(word + '---->' + p_s.stem(word))

swam---->swam
swimmer---->swimmer
swims---->swim
swim---->swim
easily---->easili
easy---->easi
quickly---->quickli
fairly---->fairli
fairness---->fair
caresses---->caress
cares---->care
ponies---->poni
relational---->relat
national---->nation
unconventional---->unconvent
generous---->gener
generously---->gener
generate---->gener
generosity---->generos
general---->gener
generation---->gener


In [23]:
#Snowball is a stemming language also developed by Martin Porter
#"English Stemmer" or "Porter 2 Stemmer" is below:
from nltk.stem.snowball import SnowballStemmer
s_s=SnowballStemmer(language='english')
for word in words:
    print(word + '----->' + s_s.stem(word))

swam----->swam
swimmer----->swimmer
swims----->swim
swim----->swim
easily----->easili
easy----->easi
quickly----->quick
fairly----->fair
fairness----->fair
caresses----->caress
cares----->care
ponies----->poni
relational----->relat
national----->nation
unconventional----->unconvent
generous----->generous
generously----->generous
generate----->generat
generosity----->generos
general----->general
generation----->generat


In [2]:
"""
Lemmatization: The process whereby words are broken into their etymological components.
In contrast to stemming, lemmatization looks at a language's full morphology in order 
to go beyond word reduction and achieve meaning reduction.
Spacy drops stemming in favor of lemmatization because lemmatization can yield much more 
informative semantic insights. Lemmatization looks at surrounding text to infer a word's
meaning, not just the word itself. 
was to be, mice to mouse, meeting to meet or maybe meeting
"""
import spacy
nlp=spacy.load('en_core_web_sm')

In [24]:
doc=nlp(u"I'm a swimmer swimming swimmingly; at a swim meet, I swim because I swam.")
def show_lemmas(text):
    for token in text: #lemma returns a lemma's hashcode; lemma_ returns the lemma itself
        print(f'{token.text:{12}}{token.pos_:{6}}{token.lemma_:{10}}{token.lemma:{10}}') 
show_lemmas(doc)

I           PRON  -PRON-    561228191312463089
'm          VERB  be        10382539506755952630
a           DET   a         11901859001352538922
swimmer     NOUN  swimmer   8984364056738817612
swimming    VERB  swim      13054409096476681252
swimmingly  ADV   swimmingly14521368103460307620
;           PUNCT ;         631425121691394544
at          ADP   at        11667289587015813222
a           DET   a         11901859001352538922
swim        NOUN  swim      13054409096476681252
meet        NOUN  meet      6880656908171229526
,           PUNCT ,         2593208677638477497
I           PRON  -PRON-    561228191312463089
swim        VERB  swim      13054409096476681252
because     ADP   because   16950148841647037698
I           PRON  -PRON-    561228191312463089
swam        VERB  swam      10694386587443064459
.           PUNCT .         12646065887601541794


In [37]:
#Stop-words: ultra-common, semantically unhelpful, and sometimes harmful to nlp results.
print(len(nlp.Defaults.stop_words))
print(nlp.vocab['is'].is_stop)
print(nlp.vocab['btw'].is_stop) 
nlp.Defaults.stop_words.add('btw') #add a new word to vocab
nlp.vocab['btw'].is_stop=True #add a word from vocab into stop_words; reverse to remove one
print(len(nlp.Defaults.stop_words))
print(nlp.vocab['btw'].is_stop)
nlp.vocab['btw'].is_stop=False #reset for next exercise

306
True
False
306
True


In [86]:
#spacy has a rule-matching tool called matcher, similar to regex but more powerful
from spacy.matcher import Matcher
m=Matcher(nlp.vocab)
pattern1=[{'LOWER':'solarpower'}] #detect SolarPower,  
pattern2=[{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}] #Solar-power,
pattern3=[{'LOWER':'solar'},{'LOWER':'power'}] #Solar power
m.add('SolarPower',None,pattern1,pattern2,pattern3)
doc=nlp(u"I'm so into SolarPower, I myself am Solar-power, I love Solar power. solar-----power.")
found=m(doc) #this will load the matches' hashes and indices into a data structure
for match_id, start, end in found:
    str_id=nlp.vocab.strings[match_id] #get string representation
    span = doc[start:end]   #get the matched span
    print(match_id, str_id, start, end, span.text)
m.remove('SolarPower') #remove the prior set of patterns added

"""
now let's make a regex. Quantifiers for the 'OP' key: 
\! requires zero matches
? requires zero or one matches
\+ requires one or more matches
\* allows zero or more matches
"""
pattern1=[{'LOWER':'solarpower'}] #detect SolarPower,  
pattern2=[{'LOWER':'solar'},{'IS_PUNCT':True,'OP':'*'},{'LOWER':'power'}] #Solar---------power
doc2=nlp("Solar---power is solarpower.")
m.add('SolarPower', None, pattern1, pattern2)
found=m(doc2)
print("\n\n after punctuation rules:",found)
m.remove('SolarPower')

8656102463236116519 SolarPower 4 5 SolarPower
8656102463236116519 SolarPower 9 12 Solar-power
8656102463236116519 SolarPower 15 17 Solar power


 after punctuation rules: [(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [8]:
from spacy.matcher import Matcher
with open('../TextFiles/reaganomics.txt',mode='rb') as f:
    byte_string=f.read()
    #replace any error characters in source bytes
    text = byte_string.decode('utf8', errors='replace') 
    doc3=nlp(text)
phrase_list=['voodoo economics','supply-side economics','trickle-down economics','free-market economics']
#create a document object for each individual phrase
patterns=[nlp(phrase) for phrase in phrase_list]
print(patterns)
pattern1=[{'LOWER':'voodoo economics'}]
pattern2=[{'LOWER':'supply-side economics'}]
pattern3=[{'LOWER':'trickle-down economics'}]


#pass each document object individually into the matcher
m2=Matcher(nlp.vocab)
m2.add('EconMatcher',*patterns) #keyword arguments or asterisks arg
found=m2(doc3)
print(found)
for match_id, start, end in found:
    str_id=nlp.vocab.strings[match_id] #get string representation
    span = doc3[start-5:end+5]   #get the matched span, plus five words on either side
    print(match_id, str_id, start, end, span.text)


[voodoo economics, supply-side economics, trickle-down economics, free-market economics]


AttributeError: 'spacy.tokens.token.Token' object has no attribute 'items'