# SET UP

In [2]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy

# spaCy 101:
https://spacy.io/usage/spacy-101

## Linguistic annotations

In [6]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, spacy.explain(token.pos_), token.dep_, spacy.explain(token.dep_))

Apple PROPN proper noun nsubj nominal subject
is AUX auxiliary aux auxiliary
looking VERB verb ROOT None
at ADP adposition prep prepositional modifier
buying VERB verb pcomp complement of preposition
U.K. PROPN proper noun dobj direct object
startup VERB verb dep unclassified dependent
for ADP adposition prep prepositional modifier
$ SYM symbol quantmod modifier of quantifier
1 NUM numeral compound compound
billion NUM numeral pobj object of preposition


In [7]:
doc

Apple is looking at buying U.K. startup for $1 billion

## Tokenization

In [9]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion.")
for token in doc:
    print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion
.


## Part-of-speech tags and dependencies

In [11]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, spacy.explain(token.pos_), token.tag_,
          spacy.explain(token.tag_), token.dep_, spacy.explain(token.dep_),
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN proper noun NNP noun, proper singular nsubj nominal subject Xxxxx True False
is be AUX auxiliary VBZ verb, 3rd person singular present aux auxiliary xx True True
looking look VERB verb VBG verb, gerund or present participle ROOT None xxxx True False
at at ADP adposition IN conjunction, subordinating or preposition prep prepositional modifier xx True True
buying buy VERB verb VBG verb, gerund or present participle pcomp complement of preposition xxxx True False
U.K. U.K. PROPN proper noun NNP noun, proper singular dobj direct object X.X. False False
startup startup VERB verb VBD verb, past tense dep unclassified dependent xxxx True False
for for ADP adposition IN conjunction, subordinating or preposition prep prepositional modifier xxx True True
$ $ SYM symbol $ symbol, currency quantmod modifier of quantifier $ False False
1 1 NUM numeral CD cardinal number compound compound d False False
billion billion NUM numeral CD cardinal number pobj object of preposition xxxx T

## Named Entities

In [13]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_, spacy.explain(ent.label_))

Apple 0 5 ORG Companies, agencies, institutions, etc.
U.K. 27 31 GPE Countries, cities, states
$1 billion 44 54 MONEY Monetary values, including unit


In [17]:
displacy.serve(doc, style="dep")


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


## Word vectors and similarity

In [5]:
nlp = spacy.load("en_core_web_md")
tokens = nlp("dog cat banana afskfsd")

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
banana True 6.700014 False
afskfsd False 0.0 True


In [6]:
nlp = spacy.load("en_core_web_lg")
tokens = nlp("dog cat banana afskfsd")

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
banana True 6.700014 False
afskfsd False 0.0 True


In [7]:
nlp = spacy.load("en_core_web_md")  # make sure to use larger package!
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

# Similarity of two documents
print(doc1, "<->", doc2, doc1.similarity(doc2))
# Similarity of tokens and spans
french_fries = doc1[2:4]
burgers = doc1[5]
print(french_fries, "<->", burgers, french_fries.similarity(burgers))

I like salty fries and hamburgers. <-> Fast food tastes very good. 0.77994864211694
salty fries <-> hamburgers 0.730462372303009


## Vocab, hashes and lexemes

In [8]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("I love coffee")
print(doc.vocab.strings["coffee"])  # 3197928453018144401
print(doc.vocab.strings[3197928453018144401])  # 'coffee'

3197928453018144401
coffee


In [9]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("I love coffee")
for word in doc:
    lexeme = doc.vocab[word.text]
    print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
            lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)

I 4690420944186131903 X I I True False True en
love 3702023516439754181 xxxx l ove True False False en
coffee 3197928453018144401 xxxx c fee True False False en
