In [26]:
import spacy

In [27]:
nlp = spacy.load('en_core_web_sm')

In [28]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [29]:
for token in doc:
    print(token)

Tesla
is
looking
at
buying
U.S.
startup
for
$
6
million


In [6]:
for token in doc:
    print(token.text)

Tesla
is
looking
at
buying
U.S.
startup
for
$
6
million


In [7]:
# One row per token: surface text, part-of-speech tag, dependency label.
for token in doc:
    row = (token.text, token.pos_, token.dep_)
    print(*row)

Tesla PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [8]:
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x25ca4e10808>),
 ('parser', <spacy.pipeline.DependencyParser at 0x25ca4df7d08>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x25ca4e132e8>)]

In [9]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [10]:
doc2 =nlp(u"Tesla isn't   looking into startups anymore.")

In [11]:
# Same three columns as before, this time for doc2 (text, POS tag, dependency label).
for token in doc2:
    row = (token.text, token.pos_, token.dep_)
    print(*row)

Tesla PROPN nsubj
is VERB aux
n't ADV neg
   SPACE 
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [12]:
doc2

Tesla isn't   looking into startups anymore.

In [13]:
doc2[0]

Tesla

In [14]:
doc[2].pos_

'VERB'

In [15]:
spacy.explain('PROPN')

'proper noun'

In [16]:
spacy.explain('advmod')

'adverbial modifier'

In [17]:
# Print a wider attribute table: text, lemma, POS, fine-grained tag, shape,
# and the is_alpha / is_stop flags for every token.
for token in doc:
    fields = (
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )
    print(*fields)

Tesla tesla PROPN NNP Xxxxx True False
is be VERB VBZ xx True True
looking look VERB VBG xxxx True False
at at ADP IN xx True True
buying buy VERB VBG xxxx True False
U.S. u.s. PROPN NNP X.X. False False
startup startup NOUN NN xxxx True False
for for ADP IN xxx True True
$ $ SYM $ $ False False
6 6 NUM CD d False False
million million NUM CD xxxx True False


In [18]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [19]:
life = doc3[0]
life

Although

In [20]:
life = doc3[16:30]
life

"Life is what happens to us while we are making other plans"

In [21]:
type(life)

spacy.tokens.span.Span

In [22]:
type(doc3)

spacy.tokens.doc.Doc

In [23]:
doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [24]:
for sent in doc4.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [25]:
doc4[0].is_sent_start

In [26]:
doc4[11].is_sent_start

True

### Tokenization

In [27]:
import spacy

In [28]:
nlp = spacy.load('en_core_web_sm')

In [29]:
mystring = '"We\'re moving to L.A.!"'

In [30]:
print(mystring)

"We're moving to L.A.!"


In [31]:
doc = nlp(mystring)

In [32]:
doc

"We're moving to L.A.!"

In [33]:
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


In [34]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")


In [35]:
for t in doc2:
    print(t.text)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [36]:
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')

In [37]:
for t in doc3:
    print(t.text)

A
5
km
NYC
cab
ride
costs
$
10.30


In [38]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

for t in doc4:
    print(t.text)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [39]:
len(doc4)

11

In [40]:
doc4.vocab

<spacy.vocab.Vocab at 0x25ca65ac9c8>

In [41]:
len(doc4.vocab)

57852

In [42]:
doc5 = nlp(u'It is better to give than to receive.')

# Retrieve the third token:
doc5[2]

better

In [43]:
doc5[2:5]

better to give

In [44]:
doc5[-4:]

than to receive.

In [45]:
#doc5[0]='gives' #gives error

In [46]:
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')

In [47]:
for token in doc8:
    print(token.text,end=' | ')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [48]:
for entity in doc8.ents:
    print(entity)

Apple
Hong Kong
$6 million


In [49]:
# for entity in doc8.sents:
#     print(entity)

In [50]:
# For each named entity print its text, its label and a plain-English gloss of
# that label, followed by blank lines as a separator.
for entity in doc8.ents:
    label = entity.label_
    print(entity, label, spacy.explain(label), '\n', sep='\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [51]:
entity.label_

'MONEY'

In [52]:
len(doc8.ents)

3

In [53]:
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")
for t in doc9.noun_chunks:
    print(t)

Autonomous cars
insurance liability
manufacturers


In [54]:
from spacy import displacy

In [55]:
doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')

In [56]:
displacy.render(doc,style='dep',jupyter=True,options={'distance':60})

In [57]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')

In [58]:
displacy.render(doc,style='ent',jupyter=True)

In [59]:
for t in doc.ents:
    print(t,t.label_)

the last quarter DATE
Apple ORG
nearly 20 thousand CARDINAL
iPods PRODUCT
$6 million MONEY


In [60]:
# displacy.serve(doc,style='dep')


To view the dependency parse in a browser, uncomment and run the `displacy.serve` cell above, then open the link below:
http://127.0.0.1:5000  

### Stemming

In [61]:
import nltk

In [62]:
from nltk.stem.porter import PorterStemmer

In [63]:
ps = PorterStemmer()

In [64]:
words =['run','runner','ran','running','runs','easily','fairly']

In [65]:
# Show each word next to its Porter stem.
for word in words:
    print(word, ps.stem(word), sep='...')

run...run
runner...runner
ran...ran
running...run
runs...run
easily...easili
fairly...fairli


In [66]:
from nltk.stem.snowball import SnowballStemmer

In [67]:
ss =SnowballStemmer(language='english')

In [68]:
# Show each word next to its Snowball (English) stem.
for word in words:
    print(word, ss.stem(word), sep='...')

run...run
runner...runner
ran...ran
running...run
runs...run
easily...easili
fairly...fair


### Lemmatization

In [69]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

In [70]:
# Aligned columns: token text (12 wide), POS tag (6 wide), numeric lemma ID
# (left-aligned, 22 wide) and the lemma string.
for token in doc1:
    print('{:12} {:6} {:<22} {}'.format(token.text, token.pos_, token.lemma, token.lemma_))

I            PRON   561228191312463089     -PRON-
am           VERB   10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
because      ADP    16950148841647037698   because
I            PRON   561228191312463089     -PRON-
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        ADP    10066841407251338481   since
I            PRON   561228191312463089     -PRON-
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today


In [71]:
def show_lemmas(text):
    """Print one aligned row per token in `text`.

    `text` is iterated directly; each item must expose `text`, `pos_`,
    `lemma` (numeric ID) and `lemma_` (string). Columns are padded to
    12, 6 and 22 characters respectively, with the lemma string last.
    """
    row_template = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [72]:
doc2 = nlp(u"I saw eighteen mice today!")

show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


### Stop Words

In [73]:
print(nlp.Defaults.stop_words)

{'any', 'she', 'or', 'thus', 'nevertheless', 'when', 'about', 'into', 'whither', 'above', 'sometime', 'this', 'will', 'might', 'hereupon', 'what', 'two', 'you', 'from', 'forty', 'here', 'seeming', 'us', 'empty', 'then', 'thence', 'indeed', 'herself', 'whence', 'whereby', 'wherein', 'which', 'so', 'whereas', 'through', 'except', 'his', 'really', 'each', 'he', 'hereafter', 'often', 'and', 'nothing', 'on', 'too', 'of', 'may', 'seems', 'already', 'thereupon', 'seem', 'some', 'twenty', 'otherwise', 'why', 'four', 'yet', 'itself', 'did', 'make', 'am', 'themselves', 'own', 'be', 'were', 'latter', 'himself', 'twelve', 'afterwards', 'during', 'under', 'whom', 'also', 'a', 'somehow', 'up', 'her', 'show', 'without', 'our', 'enough', 'former', 'part', 'top', 'out', 'bottom', 'one', 'rather', 'whatever', 'as', 'further', 'name', 'hence', 'move', 'amongst', 'all', 'by', 'had', 'become', 'most', 'made', 'once', 'are', 'beforehand', 'though', 'toward', 'call', 'very', 'perhaps', 'seemed', 'next', 'acr

In [74]:
len(nlp.Defaults.stop_words)

305

In [75]:
len(nlp.vocab)

57852

In [76]:
nlp.vocab['is'].is_stop

True

In [77]:
nlp.vocab['mystery'].is_stop

False

In [78]:
nlp.Defaults.stop_words.add('btw')

In [79]:
nlp.vocab['btw'].is_stop=True

In [80]:
len(nlp.Defaults.stop_words)

306

In [81]:
nlp.vocab['btw'].is_stop

True

In [82]:
# Drop 'beyond' from the default stop-word set. `discard` is used instead of
# `remove` so that re-running this cell does not raise KeyError once the word
# is already gone.
nlp.Defaults.stop_words.discard('beyond')

In [83]:
nlp.vocab['beyond'].is_stop=False

In [84]:
nlp.vocab['beyond'].is_stop

False

### Vocabulary and Matching

In [85]:
from spacy.matcher import Matcher

In [86]:
matcher = Matcher(nlp.vocab)

In [87]:
# SolarPower

pattern1 = [{'LOWER':'solarpower'}]

# solar-power
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]

# solar power
pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}]

In [88]:
matcher.add('SolarPower',None,pattern1,pattern2,pattern3)

In [89]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [90]:
found_matches = matcher(doc)

In [91]:
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [92]:
# Translate each match's numeric ID back to its rule name and show the
# matched token range together with its text.
for matchid,start,end in found_matches:
    rule_name = nlp.vocab.strings[matchid]
    matched_text = doc[start:end].text
    print(matchid, rule_name, start, end, matched_text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [93]:
matcher.remove('SolarPower')

In [94]:
pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True,'OP':'*'},{'LOWER':'power'}]

In [95]:
matcher.add('SolarPowerz',None,pattern1,pattern2)

In [96]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity. solar---power')

In [97]:
found_matches = matcher(doc)

In [98]:
print(found_matches)

[(13502650225408460481, 1, 3), (13502650225408460481, 10, 11), (13502650225408460481, 13, 16), (13502650225408460481, 21, 24)]


In [99]:
# Translate each match's numeric ID back to its rule name and show the
# matched token range together with its text.
for matchid,start,end in found_matches:
    rule_name = nlp.vocab.strings[matchid]
    matched_text = doc[start:end].text
    print(matchid, rule_name, start, end, matched_text)

13502650225408460481 SolarPowerz 1 3 Solar Power
13502650225408460481 SolarPowerz 10 11 solarpower
13502650225408460481 SolarPowerz 13 16 Solar-power
13502650225408460481 SolarPowerz 21 24 solar---power


In [100]:
pattern=[{'ORTH':'#'},{}]
matcher.add('hashtag',None,pattern)

In [101]:
doc = nlp(u'this is for freedom #Azaadi. Love #Kashmir # Peace #4. wowoz')
found_matches = matcher(doc)

In [102]:
# Raw (match_id, start, end) tuples first, then one readable line per match
# with the rule name looked up from the numeric ID.
print(found_matches)
for matchid,start,end in found_matches:
    rule_name = nlp.vocab.strings[matchid]
    matched_text = doc[start:end].text
    print(matchid, rule_name, start, end, matched_text)

[(4995106245934493313, 4, 6), (4995106245934493313, 8, 10), (4995106245934493313, 10, 12), (4995106245934493313, 12, 14)]
4995106245934493313 hashtag 4 6 #Azaadi
4995106245934493313 hashtag 8 10 #Kashmir
4995106245934493313 hashtag 10 12 # Peace
4995106245934493313 hashtag 12 14 #4


In [103]:
from spacy.matcher import PhraseMatcher

In [104]:
matcher = PhraseMatcher(nlp.vocab)

In [106]:
# Read the whole file, then run its contents through the pipeline.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used — confirm the file's encoding and consider stating it explicitly.
with open('reaganomics.txt') as f:
    raw_text = f.read()
doc3 = nlp(raw_text)

In [109]:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

In [110]:
# Build one Doc per phrase to use as PhraseMatcher patterns. Only tokenization
# is needed here, so use nlp.make_doc rather than nlp(...), which would also
# run the tagger, parser and NER on every phrase.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [111]:
phrase_patterns

[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [112]:
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [114]:
matcher.add('EconMatcher',None,*phrase_patterns)

In [115]:
found_matches = matcher(doc3)
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2985, 2989)]

In [118]:
#print(found_matches)
# Show every match with two tokens of surrounding context on each side.
for matchid,start,end in found_matches:
    stringid = nlp.vocab.strings[matchid]
    # Clamp the left edge at 0: for a match within the first two tokens,
    # start-2 would be negative and the slice would count from the end of
    # the document instead of its beginning.
    context_start = max(start - 2, 0)
    span = doc3[context_start:end+2]
    print(matchid, stringid, start, end, span.text)

3680293220734633682 EconMatcher 41 45 associated with supply-side economics, referred
3680293220734633682 EconMatcher 49 53 to as trickle-down economics or voodoo
3680293220734633682 EconMatcher 54 56 economics or voodoo economics by political
3680293220734633682 EconMatcher 61 65 , and free-market economics by political
3680293220734633682 EconMatcher 673 677 from the supply-side economics movement,
3680293220734633682 EconMatcher 2985 2989 as "trickle-down economics",


In [None]:
# (intentionally empty — a stray backtick here was not valid Python and would
# stop a top-to-bottom run with a SyntaxError)

In [33]:
# Re-create the Tesla example explicitly: `doc` was rebound several times in
# the matching section above, so on a fresh top-to-bottom run it would no
# longer hold this sentence.
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')
doc

Tesla is looking at buying U.S. startup for $6 million

In [38]:
doc.count_by(spacy.attrs.TAG)

{1534113631682161808: 2,
 15794550382381185553: 2,
 13927759927860985106: 1,
 15308085513773655218: 1,
 11283501755624150392: 1,
 8427216679587749980: 2,
 1292078113972184607: 2}

In [40]:
# For each token: its text, its dependency label, and spaCy's description of
# that label (None when spacy.explain has no entry for it).
for t in doc:
    dep_label = t.dep_
    print(t.text, dep_label, spacy.explain(dep_label))

Tesla nsubj nominal subject
is aux auxiliary
looking ROOT None
at prep prepositional modifier
buying pcomp complement of preposition
U.S. compound None
startup dobj direct object
for prep prepositional modifier
$ quantmod modifier of quantifier
6 compound None
million pobj object of preposition


In [53]:
# List the pipeline component names. The recorded output below is that list;
# a bare `nlp.add_pipe` would only display the bound method itself.
nlp.pipe_names

['tagger', 'parser', 'ner']