# spaCy Basics


In [None]:
!pip install spacy



In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
# Run the pipeline on a short sentence, then list every token with its
# part-of-speech tag and its dependency label.
doc = nlp("I want c.a.t.s and $100 million cash")

for tok in doc:
    print(' '.join((tok.text, tok.pos_, tok.dep_)))

I PRON nsubj
want VERB ROOT
c.a.t.s PROPN dobj
and CCONJ cc
$ SYM quantmod
100 NUM compound
million NUM nummod
cash NOUN conj


In [None]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7cba0d83f8f0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7cba0d83e3f0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7cbaa95b71b0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7cba0d633b50>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7cba0d63a250>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7cbacf5989e0>)]

In [None]:
# The run of extra spaces inside the sentence is kept as a token of its own
# (tagged SPACE in the output below).
doc2 = nlp("Tesla isn't looking into       startups anymore")
for tok in doc2:
    print(f'{tok.text} {tok.pos_} {tok.dep_}')

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
       SPACE dep
startups NOUN pobj
anymore ADV advmod


In [None]:
print(doc2[0])
print(doc2[-1])

Tesla
anymore


In [None]:
doc2[0].pos_

'PROPN'

In [None]:
# A longer passage, written as adjacent string literals that Python joins into
# one string before it is handed to the pipeline.
doc3 = nlp(
    'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    "cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [None]:
life_quote = doc3[16:30]
print(life_quote) #this is called a span

"Life is what happens to us while we are making other plans"


In [None]:
type(life_quote) #spacy knows that it's a span

spacy.tokens.span.Span

In [None]:
type(doc)

spacy.tokens.doc.Doc

In [None]:
# spaCy can separate a text into sentences too.
doc4 = nlp("This is the 1st sentence. This is 2nd. And this is 3rd")

# Iterating over the `sents` attribute yields one sentence at a time.
for sentence in doc4.sents:
    print(sentence)

This is the 1st sentence.
This is 2nd.
And this is 3rd


In [None]:
# spaCy can also tell whether a token starts or ends a sentence; these attributes return a boolean.

doc4[6].is_sent_start
# doc4[7].is_sent_end   <- uncomment to try the end-of-sentence counterpart on another token


True

# Tokenization
##### You already have some idea of what tokens are. Now let’s dig deeper.

In [None]:
myString  = '"We\'re moving to L.A.!"'
myString

'"We\'re moving to L.A.!"'

In [None]:
# The surrounding quotation marks come out as tokens of their own.
doc5 = nlp(myString)
for tok in doc5:
    print(tok)

"
We
're
moving
to
L.A.
!
"


In [None]:
# Punctuation that exists as part of an email address, website, or numerical
# value is kept as part of the token; other punctuation is split off.
doc6 = nlp("We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")

for tok in doc6:
    print(tok)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [None]:
# checking how many tokens a doc has
len(doc6)

19

In [None]:
# Tokens of a Doc cannot be reassigned: item assignment raises a TypeError.
# The expected error is caught and displayed so the notebook still runs from
# top to bottom (Restart & Run All) instead of stopping at this cell.
try:
    doc6[0] = "change text"
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [None]:
# Named entities

doc7 = nlp("Apple to build a Hong Kong factory for $6 million")

# For every entity print, each on its own line: the entity text, its label,
# an empty line, and spaCy's explanation of that label.
for ent in doc7.ents:
    label = ent.label_
    print(ent, label, '', spacy.explain(label), sep='\n')

Apple
ORG

Companies, agencies, institutions, etc.
Hong Kong
GPE

Countries, cities, states
$6 million
MONEY

Monetary values, including unit


In [None]:
# Noun chunks: iterate over the noun phrases spaCy finds in the sentence.
doc8 = nlp("Autonomous cars shift insurance liability toward manufacturers")

for chunk in doc8.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


In [None]:
from spacy import displacy

doc9 = nlp(u'Apple is going to build a U.K. factory for $6 million.')
# style='dep' requests the dependency-parse view; `options` passes display settings to the renderer.
displacy.render(doc9, style='dep', jupyter=True, options={'distance': 110}) # explicit parameters

# The same call with every optional argument left at its default.
displacy.render(doc9) # it works just fine too

In [None]:
from spacy import displacy

doc9 = nlp(u'Apple is going to build a U.K. factory for $6 million.')
# style='ent' requests the named-entity view instead of the dependency view.
displacy.render(doc9, style='ent', jupyter=True) # explicit parameters


# Stemming
The idea of stemming is to reduce a word to its main (root) form.

In [None]:
import nltk

In [None]:
# Stem a handful of words with NLTK's Porter stemmer.

from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()

words = ['run', 'runner', 'ran', 'easily', 'fairly', 'fairness', 'frog', 'soup']

# Print each word next to the stem the Porter stemmer produces for it.
for original in words:
    print(f'{original}-------> {p_stemmer.stem(original)}')

run-------> run
runner-------> runner
ran-------> ran
easily-------> easili
fairly-------> fairli
fairness-------> fair
frog-------> frog
soup-------> soup


In [None]:
# Stem the same words with the Snowball stemmer (a better version).

from nltk.stem.snowball import SnowballStemmer

# The language argument is required; omitting it raises an error.
s_stemmer = SnowballStemmer(language='english')

words = ['run', 'runner', 'ran', 'easily', 'fairly', 'fairness','frog', 'soup']

# Print each word next to the stem the Snowball stemmer produces for it.
for original in words:
    print(f'{original}------> {s_stemmer.stem(original)}')

run------> run
runner------> runner
ran------> ran
easily------> easili
fairly------> fair
fairness------> fair
frog------> frog
soup------> soup


# Lemmatization

In [None]:
# For every token show its text, its POS tag, the integer stored in `lemma`
# and the readable lemma string stored in `lemma_`.
doc10 = nlp("I'm a runner running a race brcause i love to run since i ran today")

for tok in doc10:
    print(tok.text, tok.pos_, tok.lemma, tok.lemma_, sep=' \t ')

I 	 PRON 	 4690420944186131903 	 I
'm 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
brcause 	 NOUN 	 17161821237301587057 	 brcause
i 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
i 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [None]:
def show_lemma(text):
    """Print one aligned row per token: text, POS tag, lemma integer, lemma string.

    `text` is any iterable of token-like objects exposing `text`, `pos_`,
    `lemma` and `lemma_` (in this notebook, a spaCy Doc). The first three
    columns are left-aligned in widths of 12, 6 and 22 characters.
    """
    row_layout = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_layout.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

show_lemma(doc10)

I            PRON   4690420944186131903    I
'm           AUX    10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
brcause      NOUN   17161821237301587057   brcause
i            PRON   4690420944186131903    I
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        SCONJ  10066841407251338481   since
i            PRON   4690420944186131903    I
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today


# Stop words

In [None]:
print(nlp.Defaults.stop_words)

{'around', 'me', 'quite', 'six', 'else', 'down', 'eleven', 'various', 'he', 'however', 'either', 'cannot', 'before', 'two', 'anyhow', 'full', "n't", 'beyond', 'own', 'in', 'hereby', 'only', 'therefore', "'ve", 'become', 'also', 'us', 'sometime', 'why', 'seeming', '‘s', 'though', 'whenever', 'call', 'there', 'until', 'alone', 'whither', 're', 'more', 'through', 'be', 'move', '‘re', 'not', 'your', 'everywhere', 'anyone', 'see', 'among', 'himself', 'any', 'which', 'indeed', 'herself', 'seems', 'then', 'enough', 'when', 'most', 'front', 'part', 'has', 'someone', 'this', 'always', 'across', 'used', 'nowhere', 'hereafter', 'what', 'thru', 'by', 'without', 'will', 'could', 'something', "'re", 'almost', 'serious', 'off', 'sixty', 'we', 'using', 'along', 'doing', 'already', 'whence', 'really', 'against', 'further', 'few', 'to', '’s', "'d", 'show', 'its', 'eight', 'no', 'she', 'together', 'latterly', 'take', 'first', 'n‘t', 'n’t', 'thus', 'again', 'i', 'about', 'an', 'ca', 'beforehand', 'one', "

In [None]:

len(nlp.Defaults.stop_words)

326

In [None]:
nlp.vocab['is'].is_stop #is_stop is an attribute

True

In [None]:
nlp.vocab['fiction'].is_stop #is_stop is an attribute

False

In [None]:
# Add 'btw' to the set of default stop words.
nlp.Defaults.stop_words.add('btw')
# The vocabulary entry carries its own flag, so set `is_stop` on it explicitly as well.
nlp.vocab['btw'].is_stop = True

In [None]:
len(nlp.Defaults.stop_words)

327

In [None]:
nlp.vocab['btw'].is_stop

True

In [None]:
# Take 'btw' out of the default stop words again. `discard` (unlike `remove`)
# does not raise KeyError when the word is already absent, so this cell can be
# re-run safely.
nlp.Defaults.stop_words.discard('btw')
# Clear the flag on the vocabulary entry as well.
nlp.vocab['btw'].is_stop = False

In [None]:
len(nlp.Defaults.stop_words)

326

In [None]:
nlp.vocab['btw'].is_stop

False

# Phrase matching and vocabulary

In [None]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab) #passing nlp.vocab

In [None]:
# Creating patterns: each pattern is a list with one dict per token to match.

# "solarpower" as a single token, compared in lowercase
pattern1 = [{'LOWER': 'solarpower'}]

# "solar power" as two consecutive tokens, compared in lowercase
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]

# "solar-power": the two words with one punctuation token between them
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]

In [None]:
matcher.add('SolarPower', [pattern1,pattern2,pattern3])

In [None]:
doc11 = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [None]:
found_matches = matcher(doc11)

In [None]:
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [None]:
# Each match is a (match_id, start, end) triple: look up the rule name for the
# id and slice the document to recover the matched text.
for rule_hash, first, stop in found_matches:
    rule_name = nlp.vocab.strings[rule_hash]
    matched = doc11[first:stop]
    print(rule_hash, rule_name, first, stop, matched.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [None]:
# Redefine the patterns. 'OP': '*' lets the punctuation token occur zero or
# more times, so pattern2 alone covers "solar power", "solar-power" and
# "solar--power".
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]

# Matcher.add() extends the patterns already registered under an existing key,
# so remove the old 'SolarPower' rule first; otherwise the earlier patterns
# would stay active alongside the new ones.
if 'SolarPower' in matcher:
    matcher.remove('SolarPower')
matcher.add('SolarPower', [pattern1, pattern2])

In [None]:
doc12 = nlp('Solar--power is solarpower yay!')

In [None]:
found_matches = matcher(doc12)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]
