In [1]:
#Tokenization
#The first step in creating a Doc object is to break down the incoming text into component pieces or "tokens".

In [2]:
# Import spaCy, the NLP library used throughout this notebook
import spacy
# Load the "en_core_web_sm" English pipeline. This will take a while to load initially.
# The resulting `nlp` object is called on text in the cells below to parse it.
nlp = spacy.load("en_core_web_sm")

In [3]:
# The text itself is wrapped in double quotes that we want to display, so the Python string
# is delimited with single quotes. The \ character escapes each apostrophe inside the text
# so that it is not mistaken for the end of the string.

# Note: "sentence" is a plain Python string here. It only becomes a spaCy Doc object
# once it is passed to nlp() in the next cell.
sentence = '"Mr. O\'Neill thinks that the boys\' stories about Chile\'s capital aren\'t amusing."'
print (sentence)

"Mr. O'Neill thinks that the boys' stories about Chile's capital aren't amusing."


In [4]:
# Run the string through the spaCy pipeline to get a Doc, then print each token's text
# on its own line. Quotation marks, the possessive 's and the contraction n't come out
# as separate tokens (see the output below).
nlp_sentence = nlp(sentence)
for token in nlp_sentence:
    print (token.text)

"
Mr.
O'Neill
thinks
that
the
boys
'
stories
about
Chile
's
capital
are
n't
amusing
.
"


In [5]:
nlp_sentence = nlp(sentence)

# Show each token's text together with its part-of-speech tag (token.pos_),
# printing " | " after each token as a visual separator instead of a newline
for token in nlp_sentence:
    print (token.text, token.pos_, end = " | ")

" PUNCT | Mr. PROPN | O'Neill PROPN | thinks VERB | that SCONJ | the DET | boys NOUN | ' PART | stories NOUN | about ADP | Chile PROPN | 's PART | capital NOUN | are AUX | n't PART | amusing ADJ | . PUNCT | " PUNCT | 

In [6]:
sentence = "It is best to access our website from 9 a.m. to 1 p.m. every weekend. The address is www.mywebsite.ie."

In [7]:
doc_object = nlp(sentence)

In [8]:
for token in doc_object:
    print (token.text)

It
is
best
to
access
our
website
from
9
a.m.
to
1
p.m.
every
weekend
.
The
address
is
www.mywebsite.ie
.


In [9]:
# spaCy separates numbers from the units and currency symbols attached to them:
# "20km" and "£50" are each split into two tokens.
sentence = "I live about 20km from here. A taxi will cost around £50."
doc_object = nlp(sentence)

# Print every token on its own line
for token in doc_object:
    print(token)

I
live
about
20
km
from
here
.
A
taxi
will
cost
around
£
50
.


In [10]:
#Counting Tokens Doc objects have a set number of tokens:

In [11]:
# Number of tokens in our sentence
len (doc_object)

16

In [12]:
# Count the entries currently held in the vocabulary attached to this Doc (doc_object.vocab).
# NOTE(review): this appears to be the number of lexemes cached in the vocab so far, which can
# grow as more text is processed, rather than the full word list of the model - confirm
# against the spaCy Vocab documentation.
# The pipeline loaded here is en_core_web_sm; en_core_web_lg is the larger alternative.
len (doc_object.vocab)

511

In [13]:
#Retrieve token by index position and slice
#Doc objects can be thought of as lists of token objects. 
#As such, individual tokens can be retrieved by index position, 
#and spans of tokens can be retrieved through slicing, just as shown in the previous notebook.
#Let's enter the text into a doc object and then show the contents of the sentence

In [14]:
doc = nlp("I really like working with words!")

# Display the tokens of the Doc, one per line
for token in doc:
    print(token)

I
really
like
working
with
words
!


In [15]:
#Now I'll extract some tokens from the sentence.
#Note that indexing starts at 0, and every token - including suffixes and punctuation such as the final "!" - occupies its own index position.

In [16]:
# Retrieve the first token
doc[0]

I

In [17]:
# Retrieve the 4th to 6th tokens (index positions 3, 4 and 5 - the end index of a slice is excluded)
doc[3:6]

working with words

In [18]:
# Retrieve the last 2 tokens
doc[-2:]

words!

In [19]:
doc[2]

like

In [20]:
#Named Entity Recognition (NER)

In [21]:
doc_object = nlp(u"Samsung in Ireland are pleased with their new folding screen that they released after a large $9 million investment.")

for token in doc_object:
    # show the token followed by a separator
    print (token, end = " | ")

Samsung | in | Ireland | are | pleased | with | their | new | folding | screen | that | they | released | after | a | large | $ | 9 | million | investment | . | 

In [22]:
for entity in doc_object.ents:
    print (entity)

Samsung
Ireland
$9 million


In [23]:
for entity in doc_object.ents:
    # Show the entity and its general label
    print (entity, entity.label_)

Samsung ORG
Ireland GPE
$9 million MONEY


In [24]:
for entity in doc_object.ents:
    # Show the entity and its general label
    # and show a full description on each named entity
    # using the spacy.explain command
    print (entity, entity.label_, spacy.explain(entity.label_))

Samsung ORG Companies, agencies, institutions, etc.
Ireland GPE Countries, cities, states
$9 million MONEY Monetary values, including unit


In [25]:
#If the text contains no named entities, doc_object.ents is empty and nothing is displayed.
#For example, the word car is not automatically recognised as a named entity.

In [26]:
doc_object = nlp(u"I like my car")
for entity in doc_object.ents:
    print (entity, entity.label_)

In [27]:
# Create a function to display entity information from a doc_object
def show_entity_info(doc_object):
    """Print one line per named entity found in ``doc_object``.

    Each line holds the entity text, its label left-aligned in a 20-character
    column, and the description returned by ``spacy.explain`` for that label.
    When there are no entities, a single "No entities found in text." line is
    printed instead.

    Args:
        doc_object: A parsed document exposing an ``ents`` sequence whose items
            have ``text`` and ``label_`` attributes. A falsy value (for
            example ``None``) is treated as having no entities.

    Returns:
        None. All output is written with ``print``.
    """
    # Test the entity collection rather than the document itself: a non-empty
    # document with no entities is still truthy, so checking only the document
    # would never reach the "no entities" message.
    entities = doc_object.ents if doc_object else ()
    if entities:
        for entity in entities:
            print(f"{entity.text} {entity.label_:{20}} {spacy.explain(entity.label_)}")
    else:
        print("No entities found in text.")

In [28]:
doc_object = nlp(u"I like my lyit")
show_entity_info(doc_object)

In [29]:
#Noun Chunks

In [30]:
doc_object = nlp("Autonomous cars shift insurance liability toward manufacturers")

# Create header text for table output
column1 = "Text"
column2 = "Root text"
column3 = "Root dependency"
column4 = "Root head text"
# Show the header for the table output
print (f"{column1:25} {column2:20} {column3:25} {column4:20}")
# Show relevant detail for each noun chunk in the text:
# the chunk itself, its root token, the root's dependency relation and the root's head token
for chunk in doc_object.noun_chunks:
    # spacy.explain() returns None for a label it has no description for, and formatting
    # None with a width raises TypeError - fall back to the raw dependency label instead.
    dependency = spacy.explain(chunk.root.dep_) or chunk.root.dep_
    print(f"{chunk.text:{25}} {chunk.root.text:{20}} {dependency:{25}} {chunk.root.head.text:{20}}")

Text                      Root text            Root dependency           Root head text      
Autonomous cars           cars                 nominal subject           shift               
insurance liability       liability            direct object             shift               
manufacturers             manufacturers        object of preposition     toward              


In [31]:
#displaCy Built-in Visualiser

In [32]:

from spacy import displacy

In [33]:
doc_object = nlp(u"Autonomous cars shift insurance liability toward manufacturers")

In [34]:
# Render the parsed sentence with displaCy.
# Be careful of the case of the word "True": jupyter=True must be the Python boolean.
# Style set to "dep" means display dependencies; "distance" is passed as a display option.
displacy.render(doc_object, style="dep", jupyter=True, options={"distance":100} )

In [35]:
# Dependency rendering of the same sentence, this time with customised display options
displacy.render(
    doc_object,
    style="dep",
    jupyter=True,
    options={
        "distance": 130,
        "color": "Blue",
        "arrow_stroke": 4,
        "arrow_spacing": 20,
        "word_spacing": 50,
        "compact": True,
    },
)