# Tokenization


In [1]:
# Import spaCy (the language library itself is loaded in the next cell)
import spacy

ModuleNotFoundError: No module named 'click._bashcomplete'

In [None]:
# Load the 'en_core_web_sm' pipeline by name; the resulting `nlp` callable is
# applied to raw text in the cells that follow.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Sample text wrapped in literal double quotes, with a contraction and an
# abbreviation inside, so the tokenizer has several kinds of punctuation to handle.
mystring = "\"We're moving to L.A.!\""
print(mystring)

"We're moving to L.A.!"


In [4]:
# Run the pipeline on mystring to create a Doc object; its tokens are
# explored in the next cell.
doc = nlp(mystring)

In [5]:
# Write every token's text followed by ' | ' on a single line so the token
# boundaries are easy to see.
for tok in doc:
    print(tok.text + ' | ', end='')

" | We | 're | moving | to | L.A. | ! | " | 

## Prefixes, Suffixes and Infixes


In [5]:
# Sample containing a contraction, a hyphenated word, an e-mail address and a URL.
doc2 = nlp(
    "We're here to help! Send snail-mail, email support@oursite.com "
    "or visit us at http://www.oursite.com!"
)

# One token per output line
for token in doc2:
    print(token)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [7]:
# Sample with a unit attached to a number ("5km") and a currency amount ("$10.30").
doc3 = nlp('A 5km NYC cab ride costs $10.30')

# One token per output line
for token in doc3:
    print(token)

A
5
km
NYC
cab
ride
costs
$
10.30


## Exceptions
Punctuation that exists as part of a known abbreviation will be kept as part of the token.

In [8]:
# Sample with abbreviations that end in a period ("St.", "U.S.") plus a
# sentence-final period.
doc4 = nlp("Let's visit St. Louis in the U.S. next year.")

# One token per output line
for token in doc4:
    print(token)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


## Counting Tokens


In [9]:
# len() on a Doc gives its number of tokens
len(doc)

8

## Tokens can be retrieved by index position and slice


In [10]:
# Short sentence used for the indexing and slicing examples
doc5 = nlp('It is better to give than to receive.')

# Indexing is zero-based, so position 2 is the third token:
doc5[2]

better

In [11]:
# Retrieve three tokens from the middle (slice 2:5 covers positions 2, 3 and 4):
doc5[2:5]

better to give

In [12]:
# Retrieve the last four tokens (a negative start counts from the end of the Doc):
doc5[-4:]

than to receive.

## Tokens cannot be reassigned


In [13]:
# Two separate Doc objects used to attempt a token swap
doc6 = nlp('My dinner was horrible.')
doc7 = nlp('Your dinner was delicious.')

In [14]:
# Try to change "My dinner was horrible" to "My dinner was delicious".
# Item assignment on a Doc raises TypeError. The error is the point of this
# demonstration, so catch it and print the message instead of letting it stop
# a "Restart & Run All" of the notebook.
try:
    doc6[3] = doc7[3]
except TypeError as err:
    print('TypeError:', err)

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

___
# Named Entities


In [7]:
# Sentence used for the named-entity examples below
doc8 = nlp('Apple to build a Hong Kong factory for $6 million')

In [8]:
# Show the individual tokens on one line, each followed by ' | '
print(''.join(tok.text + ' | ' for tok in doc8), end='')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [9]:
# doc8.ents holds the named entities; print one per line
for entity in doc8.ents:
    print(entity)

Apple
Hong Kong
$6 million


In [18]:
    
# For every entity print its text, its label and spaCy's explanation of that
# label, each on its own line, followed by two blank lines as a separator.
for entity in doc8.ents:
    print(entity.text, entity.label_, spacy.explain(entity.label_), '\n', sep='\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [19]:
# Count the named entities found in doc8
len(doc8.ents)

3


# Built-in Visualizers


For more info visit https://spacy.io/usage/visualizers

## Visualizing the dependency parse
Run the cells below to import displacy and render the dependency graph.

In [20]:
from spacy import displacy

In [21]:
# Rebind `doc` to the sentence used for the dependency visualization
doc = nlp('Apple is going to build a U.K. factory for $6 million.')

In [22]:
# Render `doc` inline in the notebook as a dependency diagram.
displacy.render(
    doc,
    style='dep',                # 'dep' selects the syntactic dependency view
    jupyter=True,
    options={'distance': 80},   # 'distance' is passed through as a rendering option
)


## Visualizing the entity recognizer

In [23]:
# Rebind `doc` to the sentence used for the entity visualization
doc = nlp('Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')

In [24]:
# Render `doc` inline in the notebook with its entities highlighted.
displacy.render(
    doc,
    style='ent',    # 'ent' selects the named-entity view
    jupyter=True,
)