In [None]:
# Most words are rare, and it is common for different words to mean the same
# thing (and vice versa). Identical words can also mean something different in
# a different order, so interpreting text needs knowledge of linguistic context.
# That is what spaCy provides: raw text goes in, a Doc with a variety of
# annotations comes out.
#   coarse-grained POS tags: token.pos_
#   fine-grained POS tags:   token.tag_
import sys

import spacy

nlp = spacy.load('en_core_web_sm')
# Print the path of the Python interpreter this kernel is running on.
print(sys.executable)


/Users/cooperreed/anaconda3/envs/nlp_ipynb/bin/python


In [9]:
# token.pos_ is the coarse-grained POS tag: a rough sort into adjective, verb, preposition, noun, etc.
# token.tag_ is the fine-grained POS tag; spacy.explain() turns each tag into a readable description.
doc = nlp(u"Sphinx of black quartz, judge my vow.")
for token in doc:
    # The {10} width specs pad text, pos_ and tag_ to 10 characters so the columns line up.
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

Sphinx     PROPN      NNP        noun, proper singular
of         ADP        IN         conjunction, subordinating or preposition
black      ADJ        JJ         adjective (English), other noun-modifier (Chinese)
quartz     NOUN       NN         noun, singular or mass
,          PUNCT      ,          punctuation mark, comma
judge      VERB       VBP        verb, non-3rd person singular present
my         PRON       PRP$       pronoun, possessive
vow        NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


In [12]:
# spaCy infers the POS and the tag from all the context available. Here the same
# word, "read", receives a present-tense tag in one sentence and a past-tense
# tag in the other.
doc1 = nlp(u"I read books.")
doc2 = nlp(u"I read a book.")

# Index 1 is the word "read" in both sentences.
token = doc1[1]
print(f"first doc: {token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")
token = doc2[1]
print(f"second doc: {token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")


first doc: read       VERB       VBP        verb, non-3rd person singular present
second doc: read       VERB       VBD        verb, past tense


In [16]:
# Count how often each coarse-grained POS tag occurs in the sentence.
doc = nlp(u"Sphinx of black quartz, judge my vow.")
# count_by maps each integer attribute ID to the number of tokens carrying it.
POS_counts = doc.count_by(spacy.attrs.POS)
print(POS_counts) # keys are integer part-of-speech codes, not names
# Decode one code back into its label through the vocab. The code is taken from
# the first token ("Sphinx") instead of being hard-coded as 96: the integer IDs
# are an implementation detail and may differ between spaCy releases.
print(doc.vocab[doc[0].pos].text)
# Print every code with its label and frequency, ordered by code.
for k, v in sorted(POS_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

{96: 1, 85: 1, 84: 1, 92: 2, 97: 2, 100: 1, 95: 1}
PROPN
84. ADJ   1
85. ADP   1
92. NOUN  2
95. PRON  1
96. PROPN 1
97. PUNCT 2
100. VERB  1


In [20]:

# Frequency tables for two more token attributes of the same doc:
#   TAG - fine-grained POS tags
#   DEP - syntactic dependency tags
TAG_counts = doc.count_by(spacy.attrs.TAG)
DEP_counts = doc.count_by(spacy.attrs.DEP)

# Both tables are printed the same way: ID, label looked up in the vocab, count.
for counts in (TAG_counts, DEP_counts):
    for k, v in sorted(counts.items()):
        print(f"{k}. {doc.vocab[k].text:{5}} {v}")

1292078113972184607. IN    1
2593208677638477497. ,     1
4062917326063685704. PRP$  1
9188597074677201817. VBP   1
10554686591937588953. JJ    1
12646065887601541794. .     1
15308085513773655218. NN    2
15794550382381185553. NNP   1
402. amod  1
416. dobj  1
429. nsubj 1
439. pobj  1
440. poss  1
443. prep  1
445. punct 2
8206900633647566924. ROOT  1


In [21]:
# Size of the vocabulary object attached to `doc`; displayed as the cell's output.
len(doc.vocab)


796

In [None]:
# Visualizing POS structure: each render call below should display a displaCy
# syntactic dependency graph inline in the notebook.
from spacy import displacy

# `nlp` is the pipeline already loaded in the first cell; importing spacy and
# loading the model a second time here would only repeat that work.
doc1 = nlp(u"The quick brown fox jumped over the lazy dog.")
doc2 = nlp(u"Sphinx of black quartz, judge my vow.")

# 'compact' flattens words; we also set colors, font, and horizontal distance.
# 'compact' is a flag, so pass the boolean True. The string 'True' only worked
# because any non-empty string is truthy (the string 'False' is truthy as well).
options = {'distance':50,'compact':True,'color':'yellow','bg':'#09a3d5','font':'Times'}
displacy.render(doc1,style='dep',jupyter=True,options=options)
displacy.render(doc2,style='dep',jupyter=True,options=options)
# If you want to set just a single display option, pass the dict inline.
displacy.render(doc1,style='dep',jupyter=True,options={'distance':50})


In [None]:
# displacy.serve starts a web server for the visualization instead of rendering
# inline; open the address it prints in a browser tab to view the graphs.
# Uses `displacy` and `options` from the previous cell.
doc3 = nlp(u"This is a sentence. This is a second longer sentence.")
# Split the Doc into its sentence spans and serve those.
spans = [sentence for sentence in doc3.sents]
displacy.serve(spans, style='dep', options=options)




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [03/Aug/2023 13:29:09] "GET / HTTP/1.1" 200 8248
