<a href="https://colab.research.google.com/github/gc2321/3235-Machine-Learning/blob/main/XTRA_NLP/1_tokens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy
# Load the `en_core_web_sm` English pipeline once; every later cell reuses this
# `nlp` object to turn raw strings into Doc objects.
nlp = spacy.load('en_core_web_sm')

In [2]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [3]:
# One line per token: its text, part-of-speech tag and dependency label
for token in doc:
    print(f'{token.text} {token.pos_} {token.dep_}')

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [4]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7c940d3796c0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7c940d379cc0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7c941171bed0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7c940d0fdec0>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7c940d53af40>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7c94113cbb50>)]

In [5]:
doc2 = nlp(u"Tesla isn't looking into startups anymore.")

In [6]:
# Same listing as before (text, POS tag, dependency label) for the negated sentence
for token in doc2:
    print(' '.join((token.text, token.pos_, token.dep_)))

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [7]:
# Sample sentence used for the Span slicing example below.
# Adjacent string literals are concatenated by Python, so the text can span several
# source lines without backslash continuations inside the literal.
# The "commmonly" typo in the sample text is corrected; one word replaces one word,
# so the token indices used by later slices should not shift.
doc3 = nlp(
    'Although commonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    'cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.'
)

In [8]:
# Slicing a Doc by token index gives a Span (confirmed by the type() check in the next cell).
# Tokens 16 through 29 cover the quoted sentence, including both quotation marks.
life_quote = doc3[16:30]
print(life_quote)

"Life is what happens to us while we are making other plans"


In [9]:
type(life_quote)

spacy.tokens.span.Span

In [10]:
doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [11]:
# doc4.sents yields the sentences spaCy detected in the text, one per iteration
for sent in doc4.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


## Tokenization

In [12]:
doc5 = nlp(u'It is better to give than to receive.')

# Retrieve the third token:
doc5[2]

better

In [13]:
# Retrieve three tokens from the middle:
doc5[2:5]

better to give

In [14]:
# Retrieve the last four tokens:
doc5[-4:]

than to receive.

In [15]:
doc5.vocab

<spacy.vocab.Vocab at 0x7c940d46bd00>

In [16]:
len(doc5.vocab)

816

In [17]:
doc8 = nlp('Apple to build a Hong Kong factory for $6 million')

# All tokens on one line, each followed by a ' | ' separator
for token in doc8:
    print(f'{token.text} | ', end='')

print('\n----')

# Named entities: entity text, its label, and spaCy's description of that label
for ent in doc8.ents:
    print(f'{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 
----
Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [18]:
# Noun chunks: a noun together with the words that describe it
# (e.g. "Autonomous cars" rather than just "cars").
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

# Each chunk is printed as plain text via .text
for chunk in doc9.noun_chunks:
    print(chunk.text)

Autonomous cars
insurance liability
manufacturers


## Visualizer

In [19]:
from spacy import displacy

doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})

In [20]:
displacy.render(doc, style='ent', jupyter=True)

## NLTK

In [23]:
import nltk

# Import the one name the notebook uses instead of `import *`, so it is clear where
# PorterStemmer comes from and no unrelated names leak into the notebook namespace.
from nltk.stem.porter import PorterStemmer

In [24]:
p_stemmer = PorterStemmer()

In [25]:
words = ['run','runner','running','ran','runs','easily','fairly']

In [26]:
# Each word next to the stem the Porter stemmer produces for it
for word in words:
    print(f'{word} --> {p_stemmer.stem(word)}')

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


In [27]:
from nltk.stem.snowball import SnowballStemmer

# The Snowball Stemmer requires that you pass a language parameter
s_stemmer = SnowballStemmer(language='english')

In [32]:
# Same words as the Porter example plus 'fairness', so the two stemmers can be compared
words = ['run','runner','running','ran','runs','easily','fairly', 'fairness']

In [33]:
# Each word next to the stem the Snowball stemmer produces for it
for word in words:
    print(word, '-->', s_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair
fairness --> fair


In [34]:
words = ['generous','generation','generously','generate']

In [35]:
# Snowball stems for the 'generous' / 'generate' word family
for word in words:
    print(' --> '.join((word, s_stemmer.stem(word))))

generous --> generous
generation --> generat
generously --> generous
generate --> generat


## Lemmatization

In [36]:
doc1 = nlp("I am a runner running in a race because I love to run since I ran today")

# Columns: token text, POS tag, lemma hash, lemma string, joined by ' \t '
for token in doc1:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=' \t ')

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [37]:
def show_lemmas(text):
    """Print an aligned lemma table, one row per token.

    Args:
        text: An iterable of token-like objects exposing ``text``, ``pos_``,
            ``lemma`` and ``lemma_`` (the notebook passes a spaCy Doc).

    Each row holds the token text (padded to 12 characters), the POS tag
    (padded to 6), the numeric lemma hash (left-aligned in 22 characters)
    and the lemma string, separated by single spaces.
    """
    row_template = '{0.text:12} {0.pos_:6} {0.lemma:<22} {0.lemma_}'
    for tok in text:
        print(row_template.format(tok))

In [38]:
doc2 = nlp(u"I saw eighteen mice today!")

show_lemmas(doc2)

I            PRON   4690420944186131903    I
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


## Stop Words

In [39]:
print(nlp.Defaults.stop_words)

{'or', 'cannot', "'s", 'on', 'onto', 'under', '’s', "'d", 'you', 'side', 'yourself', 'there', 'anything', 'part', 'name', 'several', 'somewhere', 'moreover', 'hundred', 'whence', 'alone', 'whole', 'hereby', 'together', 'thence', 'became', 'be', 'itself', 'off', 'perhaps', 'once', 'ourselves', 'across', 'unless', 'please', 'thus', 'empty', 'already', 'for', 'up', 'third', 'a', 'another', 'here', 'whither', 'besides', 'would', 'who', 'seems', 'hers', 'to', 'bottom', 'else', 'because', 'two', 'after', 'than', 'only', 'are', 'formerly', 'somehow', 'forty', 'out', 'does', 'via', '’re', 'was', 'beyond', 'mine', 'wherein', 'i', 'least', 'sixty', 'my', 'every', 'twelve', '‘s', 'become', 'first', 'behind', 'therein', 'as', 'more', 'them', 'has', 'seeming', 'amount', 'everyone', 'whereupon', 'into', 'that', 'keep', 'when', 'such', 'anyone', 'whoever', 'well', 'hereupon', 'using', 'ten', 'quite', 'our', 'both', 'were', 'myself', "'ve", 'afterwards', 'nowhere', 'should', 'we', 'not', 'being', 'tow

In [40]:
len(nlp.Defaults.stop_words)

326

In [41]:
nlp.vocab['myself'].is_stop

True

In [42]:
nlp.vocab['mystery'].is_stop

False

In [43]:
# Add the word to the set of stop words. Use lowercase!
nlp.Defaults.stop_words.add('btw')

# Set the stop_word tag on the lexeme
nlp.vocab['btw'].is_stop = True

In [46]:
len(nlp.Defaults.stop_words)

327

In [49]:
## remove a stop word
# Remove the word from the set of stop words.
# discard() is used instead of remove() so the cell can be re-run safely:
# remove() raises KeyError once 'beyond' is already gone from the set.
nlp.Defaults.stop_words.discard('beyond')

# Remove the stop_word tag from the lexeme
nlp.vocab['beyond'].is_stop = False

In [50]:
nlp.vocab['beyond'].is_stop

False

## Vocabulary and Matching

In [51]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [54]:
# One token: "SolarPower" / "solarpower" (compared in lowercase)
pattern1 = [{'LOWER': 'solarpower'}]
# Two adjacent tokens: "Solar power"
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
# Two tokens with a punctuation token in between: "Solar-power"
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]


In [56]:
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

In [57]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [58]:
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [59]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [60]:
# Redefine the patterns:
# pattern2 now carries 'OP': '*' on the punctuation token. As the matches printed
# below show, this one pattern still finds both "Solar Power" and "Solar-power",
# so it replaces the previous pattern2 and pattern3.
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]

# Remove the old patterns to avoid duplication:
matcher.remove('SolarPower')

# Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower', [pattern1, pattern2])

In [61]:
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [62]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solarpowered'}]
pattern4 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'powered'}]

# Remove the old patterns to avoid duplication:
matcher.remove('SolarPower')

# Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower', [pattern1, pattern2, pattern3, pattern4])

In [63]:
doc2 = nlp(u'Solar-powered energy runs solar-powered cars.')

In [64]:
found_matches = matcher(doc2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]


## PhraseMatcher

In [65]:
from spacy.matcher import PhraseMatcher
# NOTE(review): this rebinds `matcher`, which until now held the token-pattern Matcher
# with the 'SolarPower' rules; re-running earlier Matcher cells after this one will
# operate on the PhraseMatcher instead. A distinct name would avoid that.
matcher = PhraseMatcher(nlp.vocab)