In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp("I love coffee")
print("hash value:", nlp.vocab.strings["coffee"])
print("string value:", nlp.vocab.strings[3197928453018144401])

hash value: 3197928453018144401
string value: coffee


In [2]:
doc = nlp("I love coffee")
print("hash value:", doc.vocab.strings["coffee"])

hash value: 3197928453018144401


### Lexemes: entries in the vocabulary

In [3]:
doc = nlp("I love coffee")
lexeme = nlp.vocab["coffee"]

# Print the lexical attributes
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

coffee 3197928453018144401 True


Contains the context-independent information about a word

* Word text: lexeme.text and lexeme.orth (the hash)
* Lexical attributes like lexeme.is_alpha
* Not context-dependent part-of-speech tags, dependencies or entity labels

### Exercise

#### Number 1

In [4]:
import spacy

nlp = spacy.blank("en")
doc = nlp("I have a cat")

# Look up the hash for the word "cat"
cat_hash = doc.vocab.strings["cat"]
print(cat_hash)

# Look up the cat_hash to get the string
cat_string = doc.vocab.strings[cat_hash]
print(cat_string)

5439657043933447811
cat


#### Number 2

In [5]:
import spacy

nlp = spacy.blank("en")
doc = nlp("David Bowie is a PERSON")

# Look up the hash for the string label "PERSON"
person_hash = doc.vocab.strings["PERSON"]
print(person_hash)

# Look up the person_hash to get the string
person_string = doc.vocab.strings[person_hash]
print(person_string)

380
PERSON


### The Doc Object

In [6]:
import spacy
nlp = spacy.blank("en")

# Import the Doc class
from spacy.tokens import Doc

# The words and spaces to create the doc from
words = ["Hello", "world", "!"]
spaces = [True, False, False]

# Create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)
doc

Hello world!

#### Import the Doc and Span classes

In [11]:
from spacy.tokens import Doc, Span

# The words and spaces to create the doc from
words = ["Hello", "world", "!"]
spaces = [True, False, False]

# Create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)

# Create a span manually
span = (doc, 0, 2)

# Create a span with a label
span_with_label = Span(doc, 0, 1, label="GREETING")

# Add span to the doc.ents
doc.ents = [span_with_label]
doc

Hello world!

In [12]:
from spacy import displacy
displacy.serve(doc, style="ent")


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


Best practices

* Doc and Span are very powerful and hold references and relationships of words and sentences
  - Convert result to strings as late as possible
  - Use token attributes if available â€“ for example, token.i for the token index
* Don't forget to pass in the shared vocab

### Exercise

#### Number 1:

In [13]:
nlp = spacy.blank("en")

# Import the Doc class
from spacy.tokens import Doc, Span

# Desired text: "spaCy is cool!"
words = ["spaCy", "is", "cool", "!"]
spaces = [True, True, False, False]

# Create a Doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

spaCy is cool!


#### Number 2:

In [14]:
import spacy

nlp = spacy.blank("en")

# Import the Doc class
from spacy.tokens import Doc

# Desired text: "Go, get started!"
words = ["Go", ",", "get", "started", "!"]
spaces = [False, True, True, False, False]

# Create a Doc from the words and spaces
doc =Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

Go, get started!


#### Number 3:

In [15]:
import spacy

nlp = spacy.blank("en")

# Import the Doc class
from spacy.tokens import Doc

# Desired text: "Oh, really?!"
words = ["Oh", ",", "really", "?", "!"]
spaces =  [False, True, False, False, False]


# Create a Doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

Oh, really?!


#### Number 4:

In [16]:
import spacy

nlp = spacy.blank("en")

# Import the Doc and Span classes
from spacy.tokens import Doc, Span

words = ["I", "like", "David", "Bowie"]
spaces = [True, True, True, False]

# Create a doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# Create a span for "David Bowie" from the doc and assign it the label "PERSON"
span = Span(doc, 2, 4, label="PERSON")
print(span.text, span.label_)

# Add the span to the doc's entities
doc.ents = [span]

# Print entities' text and labels
print([(ent.text, ent.label_) for ent in doc.ents])

I like David Bowie
David Bowie PERSON
[('David Bowie', 'PERSON')]


In [None]:
from spacy import displacy
displacy.serve(doc, style="ent")


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

