In [1]:
import spacy

from spacy import displacy
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

In [2]:
text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)

# Print the named entities
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [3]:
for token in doc:
    print(token.text, token.pos, token.pos_, token.dep_)

Apple 96 PROPN nsubj
is 87 AUX aux
looking 100 VERB ROOT
at 85 ADP prep
buying 100 VERB pcomp
U.K. 96 PROPN nsubj
startup 100 VERB ccomp
for 85 ADP prep
$ 99 SYM quantmod
1 93 NUM compound
billion 93 NUM pobj


In [4]:
print(doc)

Apple is looking at buying U.K. startup for $1 billion


In [5]:
print(spacy.explain('PROPN'))
print(spacy.explain('AUX'))


proper noun
auxiliary


In [6]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

life_quote = doc3[16:30]

print(life_quote)
print()
print(type(life_quote))

"Life is what happens to us while we are making other plans"

<class 'spacy.tokens.span.Span'>


In [7]:
doc4 = nlp('This is the first sentence. This is another sentence. This is the last sentence.')
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [8]:
print(f"word: {doc4[6]}, start a sentence: {doc4[6].is_sent_start}")
print(f"word: {doc4[8]}, start a sentence: {doc4[8].is_sent_start}")

word: This, start a sentence: True
word: another, start a sentence: False


In [9]:
mystring = '"We\'re moving to L.A.!"'

# Print the string
print(mystring)
print()

# Process the string with spaCy
doc = nlp(mystring)

# Tokenize and print each token
for token in doc:
    print(token.text)

"We're moving to L.A.!"

"
We
're
moving
to
L.A.
!
"


In [10]:
# Input text for NLP processing
doc8 = nlp("Apple to build a Hong Kong factory for $6 million")

# Tokenization: print each token separated by '|'
for token in doc8:
    print(token.text, end=' | ')
print('\n')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 



In [11]:
for entity in doc8.ents:
    print(entity)
    print(entity.label_)
    print(spacy.explain(entity.label_))
    print()

Apple
ORG
Companies, agencies, institutions, etc.

Hong Kong
GPE
Countries, cities, states

$6 million
MONEY
Monetary values, including unit



In [12]:
doc9 = nlp("The quick brown fox jumps over the lazy dog.")

print("Noun chunks in first sentence:")
for chunk in doc9.noun_chunks:
    print(chunk.text)

print()

# Example 2: sentence with technical nouns
doc10 = nlp("Autonomous cars shift insurance liability toward manufacturers.")

print("Noun chunks in second sentence:")
for chunk in doc10.noun_chunks:
    print(chunk.text)

Noun chunks in first sentence:
The quick brown fox
the lazy dog

Noun chunks in second sentence:
Autonomous cars
insurance liability
manufacturers


In [13]:
# SpaCy - Display

In [14]:
text = "The quick brown fox jumps over the lazy dog."

nlp = spacy.load("en_core_web_sm")

doc = nlp(text)

displacy.render(doc, style="dep", jupyter=True, options={"distance": 100})

In [15]:
# End of class 20/4

Visualizing Named Entities

In [16]:
doc = nlp('Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(doc, style='ent', jupyter=True)

In [17]:
import nltk
from nltk.stem.porter import PorterStemmer

p_stemmer = PorterStemmer()

words = ['run', 'runner', 'running', 'ran', 'runs', 'easily', 'fairly', 'fairness']

for word in words:
    print(word + ' --> ' + p_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli
fairness --> fair


We can also use the second, more advanced porter algorithm called ‘Snowball’, when using ‘snowball’
stemmer we also need to provide the language because the algorithm support multiple languages

In [18]:
from nltk.stem.snowball import SnowballStemmer

s_stemmer = SnowballStemmer(language='english')

words = ['run', 'runner', 'running', 'ran', 'runs', 'easily', 'fairly', 'fairness']

for word in words:
    print(word + ' --> ' + p_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli
fairness --> fair


Let’s use the Lemmatization functionality built in SpaCy library, for this example we will provide SpaCy a
sentence with different prons of the word ‘run’ and see what is the lemma of each word.

In [19]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


We can see that running, run and
ran got the same lemma ‘run’

A lemma hash value → each
different lemma has it’s own hash
value. Lemmas that have the same
hash are the same lemma

SpaCy - Stop Words:

In [20]:
print(nlp.Defaults.stop_words)

{'whither', '‘ve', 'whole', 'same', 'without', 'may', 'forty', 'towards', 'itself', 'thru', 'out', 'make', '’d', 'with', 'whereafter', 'amount', "'ll", 'though', 'into', 'while', 'last', 'own', 'seemed', 'so', 'often', 'first', 'wherein', 'becomes', 'more', 'whenever', 'down', 'due', 'should', 'below', 'from', 'anywhere', 'five', 'until', 'hereafter', 'keep', 'up', 'we', 'perhaps', 'a', 'top', 'full', 'cannot', 'his', 'yet', 'he', 'the', 'whether', 'she', 'was', "'ve", 'who', 'hence', 'every', 'any', 'whence', 'else', "'s", 'twenty', 'each', 'whoever', 'does', 'sometimes', 'therefore', 'quite', 'latter', 'further', 'another', 'indeed', 'but', 'much', 'me', 'everything', 'hundred', 'mine', 'before', 'thence', 'eleven', 'as', 'no', 'above', 'then', 'next', 'doing', 'seem', 'nine', 'anyhow', 'were', 'now', 'onto', 'whom', 'noone', 'really', 'either', 'too', 'toward', 'behind', 'others', 'in', 'her', 'move', 'beyond', 'least', 'n’t', 'anyone', 'i', 'among', 'four', 'an', 'themselves', 'my'

We can also check if a specific word consider a stop word by calling the ‘vocab’ functionally: 

In [21]:
print(nlp.vocab['myself'].is_stop)
print(nlp.vocab['mystery'].is_stop)

True
False


We add the new word to the stop_words
dictionary, make sure to use only
lowercase letters

We mark the new word as stop word by
passing True to the word is_stop property

In [22]:
nlp.Defaults.stop_words.add('btw')

nlp.vocab['btw'].is_stop = True
nlp.vocab['btw'].is_stop

True

Same way we can remove existing stop words from the list:

In [23]:
nlp.Defaults.stop_words.remove('beyond')

nlp.vocab['beyond'].is_stop = False
nlp.vocab['beyond'].is_stop

False

SpaCy - Phrase Matching Python Example

In [24]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

pattern1 = [{'LOWER': 'solarpower'}] ## - SolarPower
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}] ## - Solar Power
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}] ## - Solar-Power

matcher.add('SolarPower', [pattern1, pattern2, pattern3])

Matcher name and
list of phrases

In [25]:
doc = nlp("The Solar Power industry continues to grow as demand for solarpower increases. Solar-power cars are gaining popularity.")
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


The matcher provide a list of all the
matches. Each match contain the matcher
id, the first phrase index and the last
phrase index (not included)

We can also use the following code to make the matcher response be more readable:


In [26]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


Let’s see another example with ‘SolarPower’ matcher, this time we will use different match phrases. 

In [27]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LEMMA': 'power'}]

matcher.remove('SolarPower')
matcher.add('SolarPower', [pattern1, pattern2])

Pattern1 → Designed to match a single token that is the word "solarpower" in any case
("solarpower", "SolarPower", "SOLARPOWER", etc…)
● Pattern2 → Designed to match sequences where "solar" is followed by zero or more punctuation
marks and then a token with the lemma "power" ("solar-power", "solar--power", "solar.power",
“solar_ powered”, etc…)

In [28]:
doc2 = nlp('Solarpower energy runs solar-powered cars.')

found_matches = matcher(doc2)

for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc2[start:end]
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 0 1 Solarpower
8656102463236116519 SolarPower 3 6 solar-powered


The matcher successfully
identified ‘Solarpower’ and
‘solar-powered

End of slide 65