In [1]:
# POS and visualization
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [4]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [5]:
print(doc[4].tag_)

VBD


In [6]:
print(doc[4].pos_)

VERB


In [10]:
# Print one row per token: its text, coarse-grained POS (pos_), fine-grained
# tag (tag_), and spaCy's plain-English explanation of that tag.
# The nested `:{10}` format spec pads each of the first three columns to width 10.
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective
brown      ADJ        JJ         adjective
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


In [18]:
doc = nlp(u"I will read books on NLP.")

In [22]:
word = doc[2]

In [23]:
word.text

'read'

In [24]:
# Alias `word` as `token` so the same format line as the per-token table can be reused.
# After the modal "will", "read" is tagged VB (base form) - see the output below.
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VB         verb, base form


In [15]:
doc = nlp(u"I read a book on NLP.")

In [16]:
word = doc[1]

In [17]:
# Alias `word` as `token` so the same format line as the per-token table can be reused.
# In "I read a book", the same surface form "read" is tagged VBD (past tense) -
# see the output below.
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VBD        verb, past tense


In [26]:
# Count how many tokens carry each coarse-grained POS tag (keys are integer POS ids).
# The sentence is parsed here instead of relying on whichever `doc` assignment
# happened to run last: read top to bottom, `doc` would still hold
# "I read a book on NLP.", which does not produce the counts recorded below
# (those include an AUX token, i.e. "will").
doc = nlp(u"I will read books on NLP.")
POS_counts = doc.count_by(spacy.attrs.POS)

In [27]:
# List the counts in ascending order of POS id. doc.vocab[k].text turns the
# integer id back into its string name, padded to width 5 by the `:{5}` spec.
for k,v in sorted(POS_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

85. ADP   1
87. AUX   1
92. NOUN  1
95. PRON  1
96. PROPN 1
97. PUNCT 1
100. VERB  1


In [28]:
POS_counts

{95: 1, 87: 1, 100: 1, 92: 1, 85: 1, 96: 1, 97: 1}

In [29]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [30]:
from spacy import displacy

In [32]:
displacy.render(doc,style='dep',jupyter=True)

In [38]:
# Named Entity Recognition

def show_ents(doc):
    """Print the named entities of ``doc``, one per line.

    Each line has the form ``text - label - explanation``, where the
    explanation is whatever ``spacy.explain`` returns for the label
    (converted with ``str``, so an unknown label prints as ``None``).
    If ``doc.ents`` is empty, a single 'No entities found' line is printed.

    Args:
        doc: An object exposing ``ents``, an iterable of entities that each
            have ``text`` and ``label_`` attributes.

    Returns:
        None. Output goes to stdout.
    """
    # Guard clause: nothing to list.
    if not doc.ents:
        print('No entities found')
        return

    for ent in doc.ents:
        description = str(spacy.explain(ent.label_))
        print(' - '.join((ent.text, ent.label_, description)))

In [39]:
doc = nlp(u'Hi how are you?')

In [40]:
show_ents(doc)

No entities found


In [41]:
doc = nlp(u'May I go to Washington, DC next May to see the Washington Monument?')

In [42]:
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [43]:
# spaCy's model does not recognize "Tesla" as an entity in this sentence;
# only "U.K." and "$6 million" are reported (see the output below).

doc = nlp(u'Tesla to build a U.K. factory for $6 million')

show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [44]:
from spacy.tokens import Span

# Get the hash value of the ORG entity label
ORG = doc.vocab.strings[u'ORG']

# Create a Span for the new entity: tokens [0, 1) of the doc, i.e. "Tesla"
new_ent = Span(doc, 0, 1, label=ORG)

# Add the entity to the existing Doc object. doc.ents rejects overlapping spans,
# so skip the assignment when those tokens are already covered by an entity;
# this keeps the cell safe to re-run.
already_covered = any(
    ent.start < new_ent.end and new_ent.start < ent.end for ent in doc.ents
)
if not already_covered:
    doc.ents = list(doc.ents) + [new_ent]

In [45]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [46]:
# add multiple named entities

# Adjacent string literals are concatenated as-is, so the first one needs a
# trailing space to keep the two sentences from running together ("cleaner.This").
doc = nlp(u'Our company created a brand new vacuum cleaner. '
          u'This new vacuum-cleaner is the best in show.')

In [47]:
show_ents(doc)

No entities found


In [48]:
from spacy.matcher import PhraseMatcher

In [49]:
matcher = PhraseMatcher(nlp.vocab)

In [50]:
phrase_list = ['vacuum cleaner','vacuum-cleaner']

In [51]:
# Build the pattern Docs with the tokenizer only (nlp.make_doc). The matcher was
# created with its default settings, which compare token text, so running the
# tagger/parser/NER on every phrase would be wasted work.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [52]:
# Register every pattern Doc under the single key 'newproduct'. The second
# positional argument is the on_match callback slot; None means no callback.
# NOTE(review): this is the older add(key, on_match, *docs) call form - confirm
# it is still accepted by the installed spaCy version.
matcher.add('newproduct',None,*phrase_patterns)

In [53]:
found_matches = matcher(doc)

In [54]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [55]:
from spacy.tokens import Span

In [56]:
PROD = doc.vocab.strings[u"PRODUCT"]

In [57]:
# Wrap each (match_id, start, end) hit from the matcher in a PRODUCT-labelled Span.
new_ents = [
    Span(doc, start, end, label=PROD)
    for _match_id, start, end in found_matches
]

In [58]:
# Merge the matched spans into doc.ents. doc.ents rejects overlapping spans, so
# drop any new span whose tokens are already covered by an existing entity;
# this keeps the cell safe to re-run.
existing_ents = list(doc.ents)
doc.ents = existing_ents + [
    span for span in new_ents
    if not any(span.start < ent.end and ent.start < span.end for ent in existing_ents)
]

In [59]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [60]:
# visualizing named entity recognition

from spacy import displacy

In [61]:
doc = nlp(u"Over the last quarter Apple sold 20 thousand iPods for a profit of $6 million.")

In [62]:
displacy.render(doc,style='ent',jupyter=True)