In [1]:
import spacy

In [2]:
# Load the 'en_core_web_sm' pipeline; `nlp` is the callable used to process every text below.
nlp = spacy.load("en_core_web_sm")

In [10]:
# Run the pipeline on a sample sentence (the Python 2 style `u` prefix is not needed).
doc = nlp("Tesla is looking at buying U.S. startup for $6 million")

In [11]:
# One line per token: its text, part-of-speech tag and dependency label.
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_}")

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [12]:
# Show the processing components of the loaded pipeline as (name, component) pairs.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x19ec7a88ae0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x19ec7a5ef40>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x19ec7a85280>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x19ec78be760>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x19ec7a3d040>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x19ec77e9100>)]

In [14]:
# Just the component names, in the same order as nlp.pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [15]:
# The first step is tokenization: splitting the text into individual tokens.
# The tokens and their annotations are stored in the Doc object.

In [16]:
# A second sample containing a contraction ("isn't") and a trailing period.
doc2 = nlp("Tesla isn't looking into startups anymore.")

In [17]:
# Text, part-of-speech tag and dependency label for every token of doc2.
for token in doc2:
    print(" ".join((token.text, token.pos_, token.dep_)))

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [25]:
# A Doc can also be indexed to get a single token; here, the POS tag of the first token.
doc2[0].pos_

'PROPN'

In [26]:
# A longer text; the literal is split with implicit string concatenation rather than
# backslash continuations, producing exactly the same single-line string.
doc3 = nlp(
    'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    "cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [27]:
# Slice tokens 16 through 29 of doc3; as the printed output shows, they cover the quoted sentence.
life_quote = doc3[16:30]

In [28]:
# Printing the slice shows the text it covers.
print(life_quote)

"Life is what happens to us while we are making other plans"


In [29]:
# Slicing a Doc does not return a plain string or list: spaCy returns a Span,
# i.e. a view onto part of the larger document.
type(life_quote)

spacy.tokens.span.Span

In [30]:
# doc3 itself is a Doc, the container that life_quote was sliced from.
type(doc3)

spacy.tokens.doc.Doc

In [31]:
# Sentence segmentation: a short text with three sentences to split.

doc4 = nlp("This is the first sentence. This is the second sentence. This is the last sentence.")

In [32]:
# doc4.sents yields one span per detected sentence; print each on its own line.
print(*doc4.sents, sep="\n")

This is the first sentence.
This is the second sentence.
This is the last sentence.


In [39]:
# Check whether a token is the start of a sentence.
# Token 6 is the first "This" after the first sentence's period.

doc4[6].is_sent_start

True

In [40]:
# The next token (index 7) is inside the sentence, not at its start.
doc4[7].is_sent_start

False

In [41]:
# tokens

In [42]:
# Sample text whose content includes its own surrounding double quotes and an apostrophe.
mystring = "\"We're moving to L.A.!\""

In [43]:
# The printed string includes the literal double quotes around the sentence.
print(mystring)

"We're moving to L.A.!"


In [44]:
# Process the string. Note that this rebinds `doc`, replacing the document created earlier.
doc = nlp(mystring)

In [47]:
# One token per line: quotes, the contraction suffix and punctuation become separate
# tokens, while "L.A." stays whole.
print(*(token.text for token in doc), sep="\n")

"
We
're
moving
to
L.A.
!
"


In [48]:
# len() of a Doc is its number of tokens, including the quote marks and punctuation.
len(doc)

8

In [50]:
# Tokens cannot be reassigned: a Doc does not support item assignment.
# The expected TypeError is caught and printed so that the demonstration stays visible
# without aborting a Restart & Run All of the notebook.
try:
    doc[0] = "test"
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [51]:
# Show all tokens on a single line, each followed by " | " (no trailing newline).
print("".join(f"{token.text} | " for token in doc), end="")

" | We | 're | moving | to | L.A. | ! | " | 

In [53]:
# doc.ents holds the named entities spaCy detected.
# Print each entity followed by its label on the next line.
for entity in doc.ents:
    print(entity, entity.label_, sep="\n")

L.A.
GPE


In [54]:
# Noun chunks: a sentence with several noun phrases to extract.
doc9 = nlp("Autonomous cars shift insurance liability toward manufacturers.")

In [56]:
# doc9.noun_chunks yields the noun phrases found in the sentence; print one per line.
print(*doc9.noun_chunks, sep="\n")

Autonomous cars
insurance liability
manufacturers


In [57]:
# display visualization

from spacy import displacy

In [58]:
# Sentence to visualize; this rebinds `doc` once more.
doc = nlp("Apple is going to build a U.K. factory for $6 million.")

In [59]:
# Draw the dependency parse inline in the notebook, passing a custom 'distance' option.
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={"distance": 110},
)

In [60]:
# A sentence with several entities (organization, quantities, money) for the entity view.
doc = nlp("Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")

In [61]:
# Highlight the named entities of `doc` inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [63]:
# Serve the dependency visualization from a local web server instead of rendering it inline.
# NOTE(review): judging by the log below, this call keeps running until the server is shut
# down, which would stall a Restart & Run All. Confirm before keeping it in the notebook.
doc = nlp(u"This is a sentence.")
displacy.serve(doc,style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [31/Oct/2021 07:12:31] "GET / HTTP/1.1" 200 3394
127.0.0.1 - - [31/Oct/2021 07:12:31] "GET /favicon.ico HTTP/1.1" 200 3394


Shutting down server on port 5000.


In [67]:
# Stemming
# Stemming groups related word forms under a common stem.
# It helps to understand stemming before moving on to lemmatization.
import nltk

In [68]:
# Explicit import instead of `import *`: only PorterStemmer is used below, and naming it
# keeps the notebook namespace clean and shows where the name comes from.
from nltk.stem.porter import PorterStemmer

In [69]:
# Create the Porter stemmer that is applied to the word list below.
p_stemmer = PorterStemmer()

In [70]:
# Word forms around "run", plus two adverbs, used to try out the stemmer.
words = [
    "run",
    "runner",
    "running",
    "ran",
    "runs",
    "easily",
    "fairly",
]

In [71]:
# Show each word next to its Porter stem.
for word in words:
    print(f"{word} --> {p_stemmer.stem(word)}")

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


In [73]:
# Snowball Stemmer
from nltk.stem.snowball import SnowballStemmer

In [74]:
# SnowballStemmer must be told which language to stem; English is used here.
s_stemmer = SnowballStemmer(language="english")

In [77]:
# A different word list for the Snowball stemmer (this rebinds `words`, replacing the
# "run" list used with the Porter stemmer).
words = ["generous", "generation", "generously", "generate"]

In [78]:
# Show each word next to its Snowball stem.
for word in words:
    print(word, s_stemmer.stem(word), sep=" --> ")

generous --> generous
generation --> generat
generously --> generous
generate --> generat


In [79]:
# Lemmatization
# NOTE(review): this loads the same 'en_core_web_sm' model that the notebook already loaded
# into `nlp` at the top and rebinds the name; the reload looks redundant. Confirm and drop it.
nlp = spacy.load('en_core_web_sm')

In [81]:
# A sentence containing several forms of "run" (runner, running, run, ran) to lemmatize.
doc1 = nlp("I am a runner running in a race because I love to run since I ran today")

In [82]:
# For every token print: text, POS tag, lemma as an integer ID (token.lemma) and
# lemma as readable text (token.lemma_), separated by " <tab> ".
for token in doc1:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=" \t ")

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today
