# Processing Text with spaCy

In [6]:
!pip install spacy



In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")


## Tokenize a Doc

In [8]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_)

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


## Visualize a dependency parse and named entities in your browser

In [9]:
from spacy import displacy

doc_dep = nlp("This is a sentence.")
#displacy.serve(doc_dep, style="dep")

doc_ent = nlp("When Sebastian Thrun started working on self-driving cars at Google "
              "in 2007, few people outside of the company took him seriously.")
#displacy.serve(doc_ent, style="ent")

from IPython.core.display import display, HTML

html = displacy.render(doc_ent, style="dep")
display(HTML(html))

## Word Vectors and Word Similarity

In [None]:
#!python -m spacy download en_core_web_lg


In [10]:
import en_core_web_sm
nlp = en_core_web_sm.load()
mango = nlp(u'mango')
print(mango.vector.shape)
print(mango.vector) # 96 dimensional word vector for a word

(96,)
[ 0.20538223 -1.6033714   0.27122334  0.4102599   3.2985601   3.4889512
  1.8090308  -2.1398475   2.31565     1.5809067   4.1519527  -1.0185633
 -0.0325011  -2.7471437  -0.4177467  -2.4292274  -0.6153387   2.4422317
  0.8078671  -2.4846377   2.0988142   1.4448209  -0.552992   -1.3411183
 -0.69847786 -0.45548356  3.8267968  -4.0225782   0.81215733  0.3766132
  0.15751392 -1.1428392  -1.3328214   0.7187766   2.1567593  -3.018766
  3.4919028   0.6938907  -1.1943094  -0.10796624  4.7029977   3.551554
 -0.71505725 -4.4580555  -0.26480573  0.6314918  -0.538128   -1.1131921
 -1.1251849   0.5740081  -1.1976193  -3.5157654   0.425157   -1.7545594
 -3.058784    0.01680815  0.97784567  1.7633746   0.4561966   2.5090182
  0.35267782  0.8351371  -1.394351    0.5082075   0.75960976 -3.3654122
  2.3440146  -2.4311178   1.2401564  -1.4498216  -2.3708577   1.274456
  2.6584334   2.505236    0.24999112  0.45838034  0.7396465  -3.0134087
 -1.1449497   2.441533    0.58746856 -0.47240722 -0.99527466 

In [11]:
import spacy

doc = nlp("Apple and banana are similar. Pasta and hippo aren't.")

apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]

print("apple <-> banana", apple.similarity(banana))
print("pasta <-> hippo", pasta.similarity(hippo))
print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector)

apple <-> banana 0.47310075
pasta <-> hippo 0.36954373
True True True True


  "__main__", mod_spec)
  "__main__", mod_spec)


## Simple and Efficient Serialization

In [None]:
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab

nlp = spacy.load("en_core_web_sm")
customer_feedback = open("Test.txt").read()
doc = nlp(customer_feedback)
doc.to_disk("/tmp/feedback.bin")

new_doc = Doc(Vocab()).from_disk("/tmp/feedback.bin")

## Match Text with Token Rules

In [14]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

def set_sentiment(matcher, doc, i, matches):
    doc.sentiment += 0.1

pattern1 = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
matcher.add("GoogleIO", None, pattern1)  # Match "Google I/O" or "Google i/o"
matcher.add("HAPPY", set_sentiment, *pattern2)  # Match one or more happy emoji

doc = nlp("A text about Google I/O 😀😀")
matches = matcher(doc)

for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(string_id, span.text)
print("Sentiment", doc.sentiment)

GoogleIO Google I/O
HAPPY 😀
HAPPY 😀😀
HAPPY 😀
Sentiment 0.30000001192092896


## Get Syntactic Dependencies

In [15]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("When Sebastian Thrun started working on self-driving cars at Google "
          "in 2007, few people outside of the company took him seriously.")

dep_labels = []
for token in doc:
    while token.head != token:
        dep_labels.append(token.dep_)
        token = token.head
print(dep_labels)

['advmod', 'advcl', 'compound', 'nsubj', 'advcl', 'nsubj', 'advcl', 'advcl', 'xcomp', 'advcl', 'prep', 'xcomp', 'advcl', 'npadvmod', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'punct', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'pobj', 'prep', 'xcomp', 'advcl', 'prep', 'xcomp', 'advcl', 'pobj', 'prep', 'xcomp', 'advcl', 'prep', 'advcl', 'pobj', 'prep', 'advcl', 'punct', 'amod', 'nsubj', 'nsubj', 'prep', 'nsubj', 'prep', 'prep', 'nsubj', 'det', 'pobj', 'prep', 'prep', 'nsubj', 'pobj', 'prep', 'prep', 'nsubj', 'dobj', 'advmod', 'punct']


## Linguistic Annotations

In [16]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_)

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


## Tokenization

In [17]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


## Part of Speech Tagging

In [18]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


## Named Entity Recognition (NER)

In [19]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [20]:
from IPython.core.display import display, HTML

html = displacy.render(doc.ents, style="dep")
display(HTML(html))

In [21]:
displacy.render(doc, style = "ent",jupyter = True)

## Visualizing Named Entities

In [22]:
import spacy
from spacy import displacy

text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
#displacy.serve(doc, style="ent")

from IPython.core.display import display, HTML

html = displacy.render(doc, style="dep")
display(HTML(html))

## Visualizing Dependency Parse

In [23]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence.")
#displacy.serve(doc, style="dep")

from IPython.core.display import display, HTML

html = displacy.render(doc, style="dep")
display(HTML(html))


## Visualizing Long Text

In [24]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
text = """In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin)."""
doc = nlp(text)
sentence_spans = list(doc.sents)
#displacy.serve(sentence_spans, style="dep")

from IPython.core.display import display, HTML

html = displacy.render(sentence_spans, style="dep")
display(HTML(html))

## Using displaCy in Jupyter notebooks

displaCy is able to detect whether you’re working in a Jupyter notebook, and will return markup that can be rendered in a cell straight away. When you export your notebook, the visualizations will be included as HTML.



In [25]:
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [26]:


import spacy
from spacy import displacy

doc = nlp("Rats are various medium-sized, long-tailed rodents.")
displacy.render(doc, style="dep")


displacy.render(doc, style="ent")

  "__main__", mod_spec)


'<div class="entities" style="line-height: 2.5; direction: ltr">Rats are various medium-sized, long-tailed rodents.</div>'

In [27]:
from IPython.core.display import display, HTML

html = displacy.render(doc, style="dep")
display(HTML(html))

## Adding Title to a Document

In [None]:
doc = nlp("This is a sentence about Google.")
doc.user_data["title"] = "This is a title"
