In [1]:
import spacy

In [2]:
nlp =spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [4]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [5]:
print(doc[4])

jumped


In [6]:
print(doc[4].text,doc[4].pos_,doc[4].tag_)

jumped VERB VBD


In [7]:
# One row per token: surface text, coarse POS, fine-grained tag (each padded
# to 10 columns) and the human-readable description of the tag.
for token in doc:
    columns = (
        f'{token.text:10}',
        f'{token.pos_:10}',
        f'{token.tag_:10}',
        f'{spacy.explain(token.tag_)}',
    )
    print(','.join(columns))

The       ,DET       ,DT        ,determiner
quick     ,ADJ       ,JJ        ,adjective
brown     ,ADJ       ,JJ        ,adjective
fox       ,NOUN      ,NN        ,noun, singular or mass
jumped    ,VERB      ,VBD       ,verb, past tense
over      ,ADP       ,IN        ,conjunction, subordinating or preposition
the       ,DET       ,DT        ,determiner
lazy      ,ADJ       ,JJ        ,adjective
dog       ,NOUN      ,NN        ,noun, singular or mass
's        ,PART      ,POS       ,possessive ending
back      ,NOUN      ,NN        ,noun, singular or mass
.         ,PUNCT     ,.         ,punctuation mark, sentence closer


In [8]:
doc = nlp(u'I read books on NLP.')

In [9]:
word = doc[1]
word

read

In [10]:
token = word
print(f'{token.text:10},{token.pos_:10},{token.tag_:10},{spacy.explain(token.tag_)}')

read      ,VERB      ,VBP       ,verb, non-3rd person singular present


In [11]:
doc = nlp(u'I read a book on NLP.')
word = doc[1]
token = word
print(f'{token.text:10},{token.pos_:10},{token.tag_:10},{spacy.explain(token.tag_)}')

read      ,VERB      ,VBD       ,verb, past tense


In [12]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [13]:
POS_counts = doc.count_by(spacy.attrs.POS)

In [14]:
POS_counts

{96: 1, 83: 3, 99: 1, 84: 1, 89: 2, 91: 3, 93: 1}

In [15]:
doc.vocab[83].text

'ADJ'

In [16]:
doc[4].pos

99

In [17]:
print(sorted(POS_counts.items()))


[(83, 3), (84, 1), (89, 2), (91, 3), (93, 1), (96, 1), (99, 1)]


In [18]:
# Walk the POS ids in ascending order and print id, POS name (padded to
# 5 columns) and how often it occurs in the document.
for k in sorted(POS_counts):
    v = POS_counts[k]
    print(f"{k},{doc.vocab[k].text:5},{v}")

83,ADJ  ,3
84,ADP  ,1
89,DET  ,2
91,NOUN ,3
93,PART ,1
96,PUNCT,1
99,VERB ,1


In [19]:
# Frequency of each fine-grained tag, keyed by the tag's integer id.
TAG_counts = doc.count_by(spacy.attrs.TAG)
# Print id (left-aligned, 25 columns), tag name (15 columns) and count.
for k in sorted(TAG_counts):
    v = TAG_counts[k]
    print(f"{k:<25},{doc.vocab[k].text:15},{v}")

74                       ,POS            ,1
1292078113972184607      ,IN             ,1
10554686591937588953     ,JJ             ,3
12646065887601541794     ,.              ,1
15267657372422890137     ,DT             ,2
15308085513773655218     ,NN             ,3
17109001835818727656     ,VBD            ,1


In [20]:
len(doc.vocab)

57863

In [21]:
# Frequency of each dependency label, keyed by the label's integer id.
DEP_counts = doc.count_by(spacy.attrs.DEP)
# Print id (left-aligned, 25 columns), label name (15 columns) and count.
for k in sorted(DEP_counts):
    v = DEP_counts[k]
    print(f"{k:<25},{doc.vocab[k].text:15},{v}")

399                      ,amod           ,3
412                      ,det            ,2
426                      ,nsubj          ,1
436                      ,pobj           ,1
437                      ,poss           ,1
440                      ,prep           ,1
442                      ,punct          ,1
8110129090154140942      ,case           ,1
8206900633647566924      ,ROOT           ,1


## Visualizing the Dependency Parse

In [22]:
from spacy import displacy

In [23]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [24]:
# Render the dependency parse inline. 'compact' is a boolean displaCy option,
# so pass True rather than the string 'True' (any non-empty string, including
# 'False', would be truthy).
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance': 110, 'compact': True, 'color': 'yellow', 'bg': 'black', 'font': 'Times'},
)

In [25]:
# displaCy rendering options. 'compact' is a boolean flag, so use True rather
# than the string 'True' (any non-empty string, including 'False', would be
# truthy and silently enable compact mode).
options = {'distance': 110, 'compact': True, 'color': 'yellow', 'bg': '#09a3d5', 'font': 'Times'}

displacy.render(doc, style='dep', options=options, jupyter=True)

In [26]:
doc2 = nlp(u"This is a sentence. This is another, possibly longer sentence.")

In [27]:
spans = list(doc2.sents)
print(spans)

[This is a sentence., This is another, possibly longer sentence.]


In [28]:
displacy.render(spans, style='dep', options=options,jupyter=True)

In [29]:

#displacy.serve(doc2, style='dep', options=options)

### Named Entity Recognition

In [30]:
def show_ents(doc):
    """Print every named entity found in ``doc``, followed by the document itself.

    For each entity in ``doc.ents`` this prints the entity text, its label and
    the label description from ``spacy.explain``, then the token span
    (``start``, ``end``) and character span (``start_char``, ``end_char``).
    When ``doc.ents`` is empty a short notice is printed instead. In both
    cases the call ends by printing blank lines and ``doc`` followed by 'yes'.

    Args:
        doc: An object exposing ``ents``; presumably a spaCy ``Doc`` as
            produced by ``nlp(...)`` in this notebook.

    Returns:
        None. All output goes to stdout.
    """
    if doc.ents:
        for ent in doc.ents:
            # Formatting through an f-string renders a non-string result of
            # spacy.explain (e.g. None for an unknown label -- TODO confirm)
            # the same way the previous str() call did.
            print(f'{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}')
            print(ent.start, ent.end, ent.start_char, ent.end_char)
            print('\n')
    else:
        print('No entities found')

    # Trailing echo of the document. It is unconditional: ``doc.ents`` has
    # already been read above, so a hasattr(doc, 'ents') check here could
    # never be false. Kept so the recorded cell outputs stay valid.
    print('\n\n')
    print(doc, 'yes')

In [31]:
doc = nlp(u'May I go to Washington, DC next May to see the Washington Monument?')

In [32]:
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
4 7 12 26


next May - DATE - Absolute or relative dates or periods
7 9 27 35


the Washington Monument - ORG - Companies, agencies, institutions, etc.
11 14 43 66





May I go to Washington, DC next May to see the Washington Monument? yes


In [33]:
doc = nlp(u'Can I please borrow 500 dollars from you to buy some Microsoft stock? yen 567  for the books')
show_ents(doc)

500 dollars - MONEY - Monetary values, including unit
4 6 20 31


Microsoft - ORG - Companies, agencies, institutions, etc.
11 12 53 62


567 - MONEY - Monetary values, including unit
15 16 74 77





Can I please borrow 500 dollars from you to buy some Microsoft stock? yen 567  for the books yes


In [34]:
doc = nlp(u'Tesla to build a U.K. factory for $6 million')
show_ents(doc)

U.K. - GPE - Countries, cities, states
4 5 17 21


$6 million - MONEY - Monetary values, including unit
7 10 34 44





Tesla to build a U.K. factory for $6 million yes


In [35]:
from spacy.tokens import Span

In [36]:
ORG = doc.vocab.strings[u'ORG']
ORG


381

In [37]:
new_entity = Span(doc,0,1,label=ORG)

In [38]:
new_entity

Tesla

In [39]:
doc.ents = list(doc.ents)+[new_entity]

In [40]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
0 1 0 5


U.K. - GPE - Countries, cities, states
4 5 17 21


$6 million - MONEY - Monetary values, including unit
7 10 34 44





Tesla to build a U.K. factory for $6 million yes


In [41]:
doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '
          u'If successful, the vacuum cleaner will be our first product.')

In [42]:
show_ents(doc)

first - ORDINAL - "first", "second", etc.
19 20 99 104





Our company plans to introduce a new vacuum cleaner. If successful, the vacuum cleaner will be our first product. yes


In [43]:
from spacy.matcher import PhraseMatcher

In [44]:
matcher = PhraseMatcher(nlp.vocab)

In [45]:
# Surface forms to look for; each one is run through the pipeline so the
# matcher receives processed documents rather than raw strings.
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]

In [46]:
phrase_patterns

[vacuum cleaner, vacuum-cleaner]

In [47]:
matcher.add('newproduct',None,*phrase_patterns)

In [48]:
found_matches = matcher(doc)

In [49]:
found_matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [50]:
from spacy.tokens import Span

In [51]:
PROD = doc.vocab.strings[u'PRODUCT']

In [52]:
PROD

384

In [53]:
new_ents=[Span(doc,match[1],match[2],label=PROD) for match in found_matches]

In [54]:
doc.ents = list(doc.ents)+new_ents

In [55]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
7 9 37 51


vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
14 16 72 86


first - ORDINAL - "first", "second", etc.
19 20 99 104





Our company plans to introduce a new vacuum cleaner. If successful, the vacuum cleaner will be our first product. yes


In [56]:
doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')

In [57]:
show_ents(doc)

29.50 - MONEY - Monetary values, including unit
4 5 22 27


five dollars - MONEY - Monetary values, including unit
12 14 60 72





Originally priced at $29.50, the sweater was marked down to five dollars. yes


In [58]:
[ent for ent in doc.ents]

[29.50, five dollars]

In [59]:
[ent for ent in doc.ents if ent.label_=="MONEY"]

[29.50, five dollars]

In [60]:
len([ent for ent in doc.ents if ent.label_=="MONEY"])

2

In [61]:
spacy.__version__

'2.0.16'

In [62]:
# Same sentence as before but with a line break after the comma: the model
# reports the line break itself as a GPE entity (see the whitespace-only
# entity in the output below).
doc = nlp(u'Originally priced at $29.50,\nthe sweater was marked down to five dollars.')

show_ents(doc)

29.50 - MONEY - Monetary values, including unit
4 5 22 27



 - GPE - Countries, cities, states
6 7 28 29


five dollars - MONEY - Monetary values, including unit
13 15 60 72





Originally priced at $29.50,
the sweater was marked down to five dollars. yes


In [63]:
def remove_white_spaces(doc):
    """Pipeline component that drops entities consisting solely of whitespace.

    Reassigns ``doc.ents`` to a list of the entities whose text is not
    all-whitespace and returns the same ``doc`` object.
    """
    kept = []
    for entity in doc.ents:
        if entity.text.isspace():
            continue
        kept.append(entity)
    doc.ents = kept
    return doc

In [64]:
# Register the clean-up step right after NER. add_pipe raises when a component
# with this name is already in the pipeline, so only add it once -- this keeps
# the cell safe to re-run.
if 'remove_white_spaces' not in nlp.pipe_names:
    nlp.add_pipe(remove_white_spaces,after='ner')

In [65]:
doc = nlp(u'Originally priced at $29.50,\nthe sweater was marked down to five dollars.')

show_ents(doc) # removed the linebreaks

29.50 - MONEY - Monetary values, including unit
4 5 22 27


five dollars - MONEY - Monetary values, including unit
13 15 60 72





Originally priced at $29.50,
the sweater was marked down to five dollars. yes


#### Noun Chunks

In [66]:
doc = nlp(u"Autonomous cars shift insurance liability toward spunky manufacturers.")
for chunk in doc.noun_chunks:
    print(chunk.text+' - '+chunk.root.text+' - '+chunk.root.dep_+' - '+chunk.root.head.text)

Autonomous cars - cars - nsubj - shift
insurance liability - liability - dobj - shift
spunky manufacturers - manufacturers - pobj - toward


In [67]:
#len(doc.noun_chunks)

In [68]:
len(list(doc.noun_chunks))

3

### Visualizing Entities with displaCy

In [69]:
from spacy import displacy

In [70]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, Sony sold only 7 thousand Walkman music players.')

In [71]:
displacy.render(doc,style='ent',jupyter=True)

In [72]:
displacy.render(doc,style='dep',jupyter=True)

In [73]:
for sent in doc.sents:
    displacy.render(nlp(sent.text),style='ent',jupyter=True) #or only sent

In [74]:
for sent in doc.sents:
    displacy.render(sent,style='ent',jupyter=True) #or only sent

In [75]:
sent.ents

[Sony, only 7 thousand, Walkman]

In [76]:
doc2 = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, my kids sold a lot of lemonade.')
for sent in doc2.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)

  "__main__", mod_spec)


In [77]:
for sent in doc2.sents:
    docx = nlp(sent.text)
    if docx.ents:
        displacy.render(docx, style='ent', jupyter=True)
    else:
        print(docx.text)

By contrast, my kids sold a lot of lemonade.


In [78]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, Sony sold only 7 thousand Walkman music players.')

In [79]:
options ={'ents':['PRODUCT']}

In [80]:
displacy.render(doc, style='ent', jupyter=True,options=options)

In [81]:
options ={'ents':['PRODUCT','ORG']}

In [82]:
displacy.render(doc, style='ent', jupyter=True,options=options)

In [83]:
colors={'ORG':'red'}

options ={'ents':['PRODUCT','ORG'],'colors':colors}

displacy.render(doc, style='ent', jupyter=True,options=options)

In [84]:
colors={'ORG':'radial-gradient(yellow,green)','PRODUCT':'linear-gradient(45deg, red, #fc9ce7)'}

options ={'ents':['PRODUCT','ORG'],'colors':colors}

displacy.render(doc, style='ent', jupyter=True,options=options)

### Sentence Segmentation

In [85]:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [86]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [87]:
# doc.sents[0]

In [88]:
print(doc[1])

is


In [89]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [90]:
print(doc[0].is_sent_start)

None


In [91]:
doc2 = nlp(u'This is a sentence. This is a sentence. This is a sentence.')

In [92]:
for token in doc2:
    print(token.text,token.is_sent_start)

This None
is None
a None
sentence None
. None
This True
is None
a None
sentence None
. None
This True
is None
a None
sentence None
. None


In [93]:
doc3 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

In [94]:
doc3.text

'"Management is doing things right; leadership is doing the right things." -Peter Drucker'

In [95]:
for sent in doc3.sents:
    print(sent)
    print('\n')

"Management is doing things right; leadership is doing the right things."


-Peter Drucker




In [122]:
## ADD a segmentation rule
def set_custom_boundaries(doc):
    """Pipeline component that starts a new sentence after every semicolon.

    Scans every token except the last one; whenever a token's text is ';',
    the token right after it gets ``is_sent_start = True``. Returns the same
    ``doc`` object.
    """
    for current in doc[:-1]:
        if current.text != ';':
            continue
        # The last token is excluded from the scan, so a following token
        # always exists here.
        following = doc[current.i + 1]
        following.is_sent_start = True
    return doc

In [123]:
# Insert the custom boundary rule ahead of the parser. add_pipe raises when a
# component with this name is already registered, so only add it once -- this
# keeps the cell safe to re-run.
if 'set_custom_boundaries' not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundaries,before='parser')

In [124]:
# nlp.remove_pipe('set_custom_boundaries')

In [125]:
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner', 'remove_white_spaces']

In [126]:
doc4 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

In [128]:
for sents in doc4.sents:
    print(sents)

"Management is doing things right;
leadership is doing the right things."
-Peter Drucker


In [97]:
## CHANGE SEGMENTATION RULE


In [130]:
nlp = spacy.load('en_core_web_sm')

In [131]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [132]:
mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

In [135]:
print(mystring)

This is a sentence. This is another.

This is a 
third sentence.


In [136]:
doc = nlp(mystring)

In [140]:
for token in doc:
    print(token.text,token.tag_)

This DT
is VBZ
a DT
sentence NN
. .
This DT
is VBZ
another DT
. .


 _SP
This DT
is VBZ
a DT

 
third JJ
sentence NN
. .


In [137]:
for sent in doc.sents:
    print(sent)

This is a sentence.
This is another.


This is a 
third sentence.


In [138]:
from spacy.pipeline import SentenceSegmenter

In [141]:
def split_on_newlines(doc):
    """Yield sentence spans of ``doc``, starting a new span after line breaks.

    A token whose text starts with a newline marks the end of the current
    sentence. The sentence is closed at the next token that is not pure
    whitespace, so consecutive whitespace tokens stay attached to the
    sentence they follow instead of opening the next one.

    Args:
        doc: A sliceable sequence of tokens exposing ``text`` and ``i``
            (presumably a spaCy ``Doc``; this notebook passes the function
            as the ``strategy`` of ``SentenceSegmenter``).

    Yields:
        Slices ``doc[start:end]``, one per sentence. Nothing is yielded for
        an empty ``doc``.
    """
    start = 0
    seen_newline = False

    for word in doc:
        # Only a non-whitespace token may open the next sentence; a further
        # whitespace token after the newline falls through to the elif.
        if seen_newline and not word.text.isspace():
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'):
            seen_newline = True

    # Emit the remainder only when there is one, so an empty doc produces no
    # empty span.
    if start < len(doc):
        yield doc[start:]
    

In [142]:
sbd = SentenceSegmenter(nlp.vocab,strategy=split_on_newlines)

In [143]:
nlp.add_pipe(sbd)

In [144]:
doc = nlp(mystring)

In [147]:
for sent in doc.sents:
    print(sent)

This is a sentence. This is another.


This is a 

third sentence.
