# POS Tagging

In [1]:
import spacy

# Load the small English pipeline and parse a sample sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp("The quick brown fox jumped over the lazy dog")

# One row per token: text, POS label, tag ID, tag label, explanation of the tag.
for token in doc:
  tag_label = token.tag_
  print(f"{token.text:10} {token.pos_:10} {token.tag:10} {tag_label:10} {spacy.explain(tag_label):10}")

The        DET        15267657372422890137 DT         determiner
quick      ADJ        10554686591937588953 JJ         adjective (English), other noun-modifier (Chinese)
brown      ADJ        10554686591937588953 JJ         adjective (English), other noun-modifier (Chinese)
fox        NOUN       15308085513773655218 NN         noun, singular or mass
jumped     VERB       17109001835818727656 VBD        verb, past tense
over       ADP        1292078113972184607 IN         conjunction, subordinating or preposition
the        DET        15267657372422890137 DT         determiner
lazy       ADJ        10554686591937588953 JJ         adjective (English), other noun-modifier (Chinese)
dog        NOUN       15308085513773655218 NN         noun, singular or mass


In [2]:
# "read" in this sentence: see the tag printed below.
doc1 = nlp("I read books on NLP")

# Pick the second token and print the same columns as the table above.
token = word = doc1[1]
tag_label = token.tag_
print(f"{token.text:10} {token.pos_:10} {token.tag:10} {tag_label:10} {spacy.explain(tag_label):10}")

read       VERB       9188597074677201817 VBP        verb, non-3rd person singular present


In [3]:
# Same verb, different object ("a book"): compare the tag with the previous cell.
doc1 = nlp("I read a book on NLP")

# Pick the second token and print the same columns as before.
token = word = doc1[1]
tag_label = token.tag_
print(f"{token.text:10} {token.pos_:10} {token.tag:10} {tag_label:10} {spacy.explain(tag_label):10}")

read       VERB       17109001835818727656 VBD        verb, past tense


In [4]:
# Count how many tokens of doc carry each POS attribute value.
# The resulting dict is keyed by integer IDs, not by label strings.
pos_count = doc.count_by(spacy.attrs.POS)

pos_count

{90: 2, 84: 3, 92: 2, 100: 1, 85: 1}

In [5]:
# Look up the string that the vocab stores for integer ID 84.
doc1.vocab[84].text

'ADJ'

In [6]:
# List every POS ID with its label and count, lowest ID first.
for pos_id in sorted(pos_count):
  print(f"{pos_id}. {doc.vocab[pos_id].text:{5}} {pos_count[pos_id]}")

84. ADJ   3
85. ADP   1
90. DET   2
92. NOUN  2
100. VERB  1


In [7]:
# Count tokens per TAG attribute value (keys are integer IDs).
tag_count = doc.count_by(spacy.attrs.TAG)

# Print ID, label and frequency in ascending ID order.
for tag_id in sorted(tag_count):
  print(f"{tag_id}. {doc.vocab[tag_id].text:{5}} {tag_count[tag_id]}")

1292078113972184607. IN    1
10554686591937588953. JJ    3
15267657372422890137. DT    2
15308085513773655218. NN    2
17109001835818727656. VBD   1


In [8]:
# Count tokens per DEP attribute value (keys are integer IDs).
dep_count = doc.count_by(spacy.attrs.DEP)

# Print ID, label and frequency in ascending ID order.
for dep_id in sorted(dep_count):
  print(f"{dep_id}. {doc.vocab[dep_id].text:{5}} {dep_count[dep_id]}")

402. amod  3
415. det   2
429. nsubj 1
439. pobj  1
443. prep  1
8206900633647566924. ROOT  1


# Visualization

In [9]:
from spacy import displacy

# Parse the sample sentence again for the visualizer.
doc2 = nlp("The quick brown fox jumped over the lazy dog")

# Render with displaCy; no style or options are passed here.
displacy.render(doc2)

In [10]:
# Rendering options for the dependency visualizer.
# 'compact' is an on/off flag, so pass a real bool: a string value is always
# truthy, which means even the string 'False' would switch compact mode on.
options = {'distance': 110, 'compact': True, 'color': 'yellow', 'bg': '#09a3d5', 'font': 'Times'}

displacy.render(doc2, style='dep', options=options)


# Named Entity Recognition (NER)

## Adding Named Entities to a Span

In [11]:
# displaying basic entity information

def show_entity(doc):
  """Print one line per entity in ``doc.ents``: text, label and label explanation.

  Prints "No entities found" when ``doc.ents`` is empty.
  """
  if not doc.ents:
    print("No entities found")
    return
  for entity in doc.ents:
    print(f"{entity.text} -- {entity.label_} -- {spacy.explain(entity.label_)}")

In [12]:
# Demo 1: a greeting in which no entities are detected.
doc = nlp('Hi, how are you')
show_entity(doc)

No entities found


In [13]:
# Demo 2: a person name and a country.
doc = nlp("Hi,I'm Fatima. I live in Bangladesh")
show_entity(doc)

Fatima -- PERSON -- People, including fictional
Bangladesh -- GPE -- Countries, cities, states


In [14]:
# Demo 3: a money amount and an organisation.
doc = nlp("Can I have  500 dollars of Microsoft stock?")
show_entity(doc)

500 dollars -- MONEY -- Monetary values, including unit
Microsoft -- ORG -- Companies, agencies, institutions, etc.


In [15]:
# Demo 4: "Tesla" is not picked up as an entity here (only UK and the amount are),
# which motivates adding it by hand in the following cells.
doc = nlp("Tesla to build a UK factory for $6 million")
show_entity(doc)

UK -- GPE -- Countries, cities, states
$6 million -- MONEY -- Monetary values, including unit


In [16]:
# Setting up a custom entity

from spacy.tokens import Span

# Look up the integer ID that the vocab's string store holds for the "ORG" label
ORG = doc.vocab.strings["ORG"]

In [17]:
# Display the integer ID obtained for "ORG".
ORG

383

In [18]:
# Create a Span covering tokens [0, 1) of doc (the first token, "Tesla") and label it ORG

new_entity = Span(doc, 0, 1, label = ORG) # the label is passed as its integer ID

In [19]:
# Add the new entity: build a new list from the existing entities and assign it back to doc.ents

doc.ents = list(doc.ents) + [new_entity] # doc.ents is copied into a list first, then re-assigned as a whole

In [20]:
# Re-use show_entity (defined earlier) to confirm the new entity is now listed
show_entity(doc)

Tesla -- ORG -- Companies, agencies, institutions, etc.
UK -- GPE -- Countries, cities, states
$6 million -- MONEY -- Monetary values, including unit


## Adding Named Entities to All Matching Spans

In [21]:
# Create a document made of two sentences.
# The trailing space in the first literal matters: adjacent string literals are
# concatenated as-is, so without it the text would read "cleaner.This".

doc = nlp("Our company created a brand new vacuum cleaner. "
            "This new vacuum-cleaner is the best in the show")

In [22]:
# Check whether "vacuum cleaner" is already recognised as an entity
show_entity(doc)

No entities found


In [23]:
# Import PhraseMatcher
from spacy.matcher import PhraseMatcher

In [24]:
# Create a PhraseMatcher instance,
# linking the matcher to the pipeline's vocabulary

matcher = PhraseMatcher(nlp.vocab)

In [25]:
# Phrases that should be recognised as the new product:

pharse_list = ['vacuum cleaner', 'vacuum-cleaner']

# Turn each phrase into a Doc by passing it through nlp, giving the phrase patterns

phrase_pattern = list(map(nlp, pharse_list))

In [26]:
# Register the phrase patterns with the matcher under a key of your choice.
# spaCy v3 call form: add(key, docs), with the patterns passed as one list.
# The older add(key, None, *docs) form (callback as second argument) is the
# v2 signature; no callback is needed here, so on_match is simply omitted.
matcher.add('newproduct', phrase_pattern)

# Apply the matcher to our Doc object:
found_matches = matcher(doc)

In [27]:
# Show the matches: each one is a 3-tuple whose 2nd and 3rd items are used as Span boundaries below.
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [28]:
# Build a Span for every match and register those Spans as named entities.
from spacy.tokens import Span

# Integer ID of the "PRODUCT" label in the vocab's string store.
PROD = doc.vocab.strings["PRODUCT"]

# A match looks like (2689272359382549672, 6, 8); only the 2nd and 3rd values
# (the span boundaries) are needed, so the first one is discarded.
new_ents = [Span(doc, start, end, label=PROD) for _, start, end in found_matches]

doc.ents = list(doc.ents) + new_ents

In [29]:
# Confirm that the matched phrases are now reported as entities.
show_entity(doc)

vacuum cleaner -- PRODUCT -- Objects, vehicles, foods, etc. (not services)
vacuum-cleaner -- PRODUCT -- Objects, vehicles, foods, etc. (not services)


# Visualizing Named Entity Recognition

In [30]:
# Two-sentence sample text containing companies, quantities and a money amount.
doc = nlp(
    'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
    'By contrast, Sony sold only 7 thousand Walkman music players.'
)

# With style='ent', displacy highlights the entities in the text
displacy.render(doc, style='ent', jupyter=True)

In [31]:
# To render sentence by sentence, loop over the sentence segmentation (doc.sents).
for sent in doc.sents:
    # Run the pipeline on the text of the individual sentence and render it;
    # make sure to pass style='ent'.
    displacy.render(nlp(sent.text), style='ent', jupyter=True)

In [32]:
# Same sample text, stored under doc2 this time.
doc2 = nlp(
    'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
    'By contrast, Sony sold only 7 thousand Walkman music players.'
)

# Render the entities of each sentence separately.
for sent in doc2.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)

In [33]:
# Render sentences that contain entities; print the plain text of those that do not.
for sent in doc2.sents:
    docx = nlp(sent.text)
    if not docx.ents:
        print(docx.text)
        continue
    displacy.render(docx, style='ent', jupyter=True)

In [38]:
# Additionally, you can choose which entity types to display.
# Store the choice in an options dict:
# under the 'ents' key, pass a list of the labels you are interested in.

options = {'ents': ['ORG', 'MONEY']}

# Then render the whole document with those options
displacy.render(doc, style='ent', jupyter=True, options=options)

In [39]:
# You can customize colours for different entities.
# Create another dict that maps a label to a colour
colors = {'ORG': 'pink'}

# Inside your options, add a 'colors' key and set it to the colour dictionary
options = {'ents': ['ORG', 'MONEY'], 'colors': colors}
displacy.render(doc, style='ent', jupyter=True, options=options)

In [40]:
# Hex colour codes work too
colors = {'ORG': '#aa9cfc'}

# Same 'ents' filter, with the hex colour applied to ORG
options = {'ents': ['ORG', 'MONEY'], 'colors': colors}
displacy.render(doc, style='ent', jupyter=True, options=options)

In [42]:
# Colour values can also be gradient strings:
# a linear gradient for ORG and a radial gradient for MONEY.

colors = {
    'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)',
    'MONEY': 'radial-gradient(yellow, green)',
}

options = dict(ents=['ORG', 'MONEY'], colors=colors)

displacy.render(doc, style='ent', jupyter=True, options=options)

# Sentence Segmentation

In [43]:
# From spaCy Basics: iterate over doc.sents to get one sentence at a time.
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [44]:
# doc.sents is a generator, so indexing it raises TypeError.
# Catch the error and print it, so the demonstration stays visible without
# aborting a "Restart & Run All" of the notebook.
try:
    doc.sents[0]
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: 'generator' object is not subscriptable

In [45]:
# Materialise doc.sents into a list first; then a sentence can be picked by index.
list(doc.sents)[1]

This is another sentence.

In [46]:
# SPACY'S DEFAULT BEHAVIOR
# With the unmodified pipeline the quote is kept as one sentence: the ';' does not start a new one.
doc3 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc3.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter Drucker


In [62]:
# ADD A NEW RULE TO THE PIPELINE

from spacy.language import Language

# 1. Register your component with a decorator (the string is its pipeline name)
@Language.component("custom_boundaries")
def set_custom_boundaries(doc):
    """Mark the token that follows every ';' as the start of a new sentence.

    The last token is skipped (``doc[:-1]``) so that ``token.i + 1`` always
    refers to an existing token. Returns the same ``doc`` it was given.
    """
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

# 2. Add using the REGISTERED NAME (as string).
# Remove a previously added copy only if there is one: on a fresh kernel the
# component is not in the pipeline yet, so an unconditional remove_pipe() fails.
if "custom_boundaries" in nlp.pipe_names:
    nlp.remove_pipe("custom_boundaries")
nlp.add_pipe("custom_boundaries", before="parser")

# 3. Verify
print(nlp.pipe_names)  # Should show your component

['tok2vec', 'tagger', 'custom_boundaries', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [64]:
# Re-run the Doc object creation with the custom component in the pipeline:
# the text after the ';' is now printed as a sentence of its own.
doc4 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc4.sents:
    print(sent)

"Management is doing things right;
leadership is doing the right things."
-Peter Drucker
