In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u"The quick brown fox jumped over the dog's back.")

In [4]:
print(doc.text)

The quick brown fox jumped over the dog's back.


In [5]:
print(doc[4].tag_)

VBD


In [6]:
print(doc[4].pos_)

VERB


In [7]:
# Print one aligned row per token: text (width 10), coarse POS (width 8),
# fine-grained tag (width 6), then spacy.explain()'s description of that tag.
for token in doc:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

The        DET      DT     determiner
quick      ADJ      JJ     adjective
brown      ADJ      JJ     adjective
fox        NOUN     NN     noun, singular or mass
jumped     VERB     VBD    verb, past tense
over       ADP      IN     conjunction, subordinating or preposition
the        DET      DT     determiner
dog        NOUN     NN     noun, singular or mass
's         PART     POS    possessive ending
back       NOUN     NN     noun, singular or mass
.          PUNCT    .      punctuation mark, sentence closer


In [8]:
doc = nlp(u"I read books on NLP.")

In [9]:
word = doc[1]

In [10]:
word.text

'read'

In [11]:
token = word
print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

read       VERB     VBP    verb, non-3rd person singular present


In [12]:
doc = nlp(u"I read a book on NLP.")

In [13]:
word = doc[1]

token = word
print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

read       VERB     VBD    verb, past tense


In [14]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [15]:
POS_counts = doc.count_by(spacy.attrs.POS)

In [16]:
POS_counts

{96: 1, 83: 3, 99: 1, 84: 1, 89: 2, 91: 3, 93: 1}

In [17]:
doc.vocab[83].text

'ADJ'

In [18]:
doc[2].pos_

'ADJ'

In [19]:
# Frequency of each fine-grained tag in doc, keyed by the tag's integer ID.
TAG_counts = doc.count_by(spacy.attrs.TAG)

# The keys are integer IDs, so look each one up in the vocab to recover the tag
# string, then print ID, tag, the tag's description (padded to 80) and its count.
for k,v in sorted(TAG_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {spacy.explain(doc.vocab[k].text):{80}} {v}")

74. POS   possessive ending                                                                1
1292078113972184607. IN    conjunction, subordinating or preposition                                        1
10554686591937588953. JJ    adjective                                                                        3
12646065887601541794. .     punctuation mark, sentence closer                                                1
15267657372422890137. DT    determiner                                                                       2
15308085513773655218. NN    noun, singular or mass                                                           3
17109001835818727656. VBD   verb, past tense                                                                 1


In [20]:
doc = nlp(u"Colorless green ideas sleep furiously.")

for token in doc:
    print(f"{token.text:{10}} {spacy.explain(token.tag_)}")

Colorless  adjective
green      adjective
ideas      noun, plural
sleep      verb, non-3rd person singular present
furiously  adverb
.          punctuation mark, sentence closer


## Visualizing parts of speech with displaCy

In [21]:
doc = nlp(u"The quick brown fox jumped over the lazy dog")

In [22]:
from spacy import displacy

In [23]:
displacy.render(doc,style='dep',jupyter=True)

In [24]:
options = {'compact':False,'color':'yellow','bg':'#09a3d5','font':'Times'}
displacy.render(doc,style='dep',options=options,jupyter=True)

In [28]:
def show_ents(doc):
    """Print the named entities of ``doc``, one per line.

    Each line has the form ``text - label - explanation``, where the
    explanation is whatever ``spacy.explain`` returns for the entity label
    (converted with ``str``, so a missing explanation prints as ``None``).
    If ``doc.ents`` is empty, the single line 'No entities found.' is printed.
    """
    # Guard clause: nothing to list, report that and stop.
    if not doc.ents:
        print('No entities found.')
        return

    for ent in doc.ents:
        print(" - ".join([ent.text, ent.label_, str(spacy.explain(ent.label_))]))
        

In [29]:
doc = nlp(u'Hi, how are you?')
show_ents(doc)

No entities found.


In [30]:
doc = nlp(u'May I go to Washington, DC next May to see the Washington Monument?')
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [33]:
doc = nlp(u'Can I please have 500 dollars of Microsoft stock?')
show_ents(doc)

500 dollars - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.


In [34]:
doc = nlp(u'Tesla to build a U.K. factory for $6 million')
show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [35]:
from spacy.tokens import Span

In [36]:
ORG = doc.vocab.strings[u"ORG"]

In [37]:
ORG

381

In [38]:
new_entity = Span(doc,0,1,label=ORG)

In [39]:
doc.ents = list(doc.ents) + [new_entity]

In [40]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [41]:
# Adjacent string literals are joined by Python with nothing in between, so the
# first literal ends with a space to keep "cleaner." and "This" as separate
# sentences instead of the fused text "cleaner.This".
doc = nlp(u'Our company created a brand new vacuum cleaner. '
          u'This new vacuum-cleaner is the best in show.')

In [42]:
show_ents(doc)

No entities found.


In [43]:
from spacy.matcher import PhraseMatcher

In [44]:
matcher = PhraseMatcher(nlp.vocab)

In [45]:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [48]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [49]:
matcher.add('newproduct',None,*phrase_patterns)

In [50]:
found_matches = matcher(doc)

In [51]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [52]:
from spacy.tokens import Span

In [53]:
PROD = doc.vocab.strings[u'PRODUCT']

In [54]:
# Each match is a (match_id, start, end) triple; only the token offsets are
# needed to build a PRODUCT-labelled Span over the matched tokens.
new_ents = [
    Span(doc, start, end, label=PROD)
    for _match_id, start, end in found_matches
]

In [55]:
doc.ents = list(doc.ents) + new_ents

In [56]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [57]:
doc = nlp(u'Originally I paid $29.95 for this car toy, but now it is marked down by 10 dollars.')
[ent for ent in doc.ents if ent.label_ == 'MONEY']

[29.95, 10 dollars]

In [63]:
# Adjacent string literals are joined by Python with nothing in between, so the
# first literal ends with a space; otherwise the text reads "million.By contrast"
# and the two sentences run together.
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
          u'By contrast, Sony only sold 8 thousand Walkman music players.')

In [64]:
displacy.render(doc,style='ent',jupyter=True)

In [65]:
# Render the entity view one sentence at a time. Each sentence's text is passed
# through nlp again, so every render receives a new Doc built from that sentence alone.
for sent in doc.sents:
    displacy.render(nlp(sent.text),style='ent',jupyter=True)

In [84]:
# Per-label colour override: ORG entities get a CSS gradient as their background.
colors = {'ORG': 'linear-gradient(yellow,orange,pink)'}
# 'ents' limits highlighting to the listed labels; 'colors' supplies the overrides above.
# Note: this rebinds the `options` name used earlier for the dependency view.
options = {'ents': ['PRODUCT', 'ORG'], 'colors': colors}


In [85]:
displacy.render(doc,style='ent',jupyter=True, options=options)

In [87]:
displacy.serve(doc,style='ent',options=options,port=5001)


[93m    Serving on port 5001...[0m
    Using the 'ent' visualizer



127.0.0.1 - - [12/May/2023 15:56:40] "GET / HTTP/1.1" 200 2153
127.0.0.1 - - [12/May/2023 15:56:40] "GET /favicon.ico HTTP/1.1" 200 2153



    Shutting down server on port 5001.



In [88]:
doc = nlp(u'This is the first sentence. This is another. This is the final one.')

In [89]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another.
This is the final one.


In [91]:
list(doc.sents)[1]

This is another.

In [99]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [100]:
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [101]:
for sent in doc.sents:
    print(sent)
    print("--")

"Management is doing the right things; leadership is doing the right things."
--
-Peter Drucker
--


In [103]:
# ADD A SEGMENTATION RULE
def set_custom_boundaries(doc):
    """Print the index of every token in ``doc`` and return the same ``doc``.

    NOTE(review): despite the name, this does not set any sentence boundaries
    yet; it only prints ``token.i`` for each token. The boundary rule itself
    still has to be written.

    Args:
        doc: An iterable of tokens exposing an ``i`` attribute (a spaCy Doc).

    Returns:
        The ``doc`` object that was passed in, unchanged.
    """
    for token in doc:
        print(token.i)
    # spaCy calls pipeline components as doc = component(doc); returning the Doc
    # keeps this function usable as a component instead of yielding None.
    return doc

# CHANGE SEGMENTATION RULES ENTIRELY