In [1]:
"""
Named Entity Recognition (NER) is the process of locating & classifying named entity mentions
in unstructured text. Categories usually include person names, organizations, locations,
medical codes, time expressions, quantities, monetary values, percentages, etc.
"""
import spacy 
nlp = spacy.load('en_core_web_sm')
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' // ' 
                  + ent.label_ + ' // ' 
                  + str(spacy.explain(ent.label_)))
            print(ent.start, ent.end,     #the token span's "start" & "end" index position in doc
                 ent.start_char, ent.end_char) #The entity text's "start" and "end" index in doc
    else:
        print('No entities in document')
doc=nlp(u'Hello where are we?')
print("doc:")
show_ents(doc) #no entities in doc
doc1=nlp(u"In May I paid 500 dollars to see The Mona Lisa at the Louvre in Paris.")
print("\n doc1:")
show_ents(doc1)


doc:
No entities in document

 doc1:
May // DATE // Absolute or relative dates or periods
1 2 3 6
500 dollars // MONEY // Monetary values, including unit
4 6 14 25
The Mona Lisa // WORK_OF_ART // Titles of books, songs, etc.
8 11 33 46
Louvre // LOC // Non-GPE locations, mountain ranges, bodies of water
13 14 54 60
Paris // GPE // Countries, cities, states
15 16 64 69


In [2]:
#Create an unknown entity and you'll need to manually set its recognition
doc2=nlp(u"Grubbo will sell $300m in stock.")
print("Before manually recognizing:")
show_ents(doc2)
from spacy.tokens import Span
ORG=doc.vocab.strings[u"ORG"] #this returns 381 if called, the hash of ORG
new_ent=Span(doc2,0,1,label=ORG) #from startindex up to but not including endindex
doc2.ents=list(doc2.ents) + [new_ent]
print("\n After manually recognizing:")
show_ents(doc2)

Before manually recognizing:
300 // MONEY // Monetary values, including unit
4 5 18 21

 After manually recognizing:
Grubbo // ORG // Companies, agencies, institutions, etc.
0 1 0 6
300 // MONEY // Monetary values, including unit
4 5 18 21


In [3]:
#Add a new named entity for non-proper nouns that you're searching for in the text
from spacy.matcher import PhraseMatcher
doc3=nlp(u"I got a new vacuum cleaner." u"I love my vacuum-cleaner.")

print("before custom PhraseMatcher:")
show_ents(doc3)

print("\n after custom PhraseMatcher:")
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]
matcher.add('newproduct', None, *phrase_patterns)
matches = matcher(doc3)
print(matches)

print("\n after new named entities added to show_ents function:")
from spacy.tokens import Span
PROD = doc.vocab.strings[u"PRODUCT"]
new_ents = [Span(doc3,match[1],match[2],label=PROD) for match in matches]
doc3.ents = list(doc3.ents) + new_ents
show_ents(doc3)

before custom PhraseMatcher:
No entities in document

 after custom PhraseMatcher:
[(2689272359382549672, 4, 6), (2689272359382549672, 10, 13)]

 after new named entities added to show_ents function:
vacuum cleaner // PRODUCT // Objects, vehicles, foods, etc. (not services)
4 6 12 26
vacuum-cleaner // PRODUCT // Objects, vehicles, foods, etc. (not services)
10 13 37 51


In [4]:
#How to catch multiple different named entities of the same type
doc4 = nlp(u"I bought for $500 and sold for 600 dollars")
len([ent for ent in doc4.ents if ent.label_ == "MONEY"])   #2


2

In [5]:
from spacy import displacy
print("before options specified:")
doc5=nlp(u"In May I paid 500 dollars to see The Mona Lisa at the Louvre in Paris."
        u"Sony sold almost 10,000 Walkman Music Players.")
for sent in doc5.sents: #better for printing each sentence on a new line
    displacy.render(nlp(sent.text),style='ent',jupyter=True)
print("\n after options specified:")
colors={'ORG':'green',
        'WORK_OF_ART':'radial-gradient(orange,purple)',
        'MONEY':'linear-gradient(90deg,green,yellow)'}
options = {'ents':['WORK_OF_ART','ORG','MONEY'],'colors':colors}
displacy.render(doc5,style='ent',jupyter=True,options=options)
#If you'd like to serve outside of Jupyter, uncomment this line
#displacy.serve(doc5,style='ent',options=options)

before options specified:



 after options specified:


In [9]:
import spacy 
nlp = spacy.load('en_core_web_sm')
doc6=nlp(u'This is a sentence. This is also a sentence. Surprisingly, this too is a sentence.')
print("Single word return:")
print(doc6[0]) # very easy to grab tokens by index from the doc itself, but not from doc.sents
print("\neach sentence return:")
for sent in doc6.sents:
    print(sent)

#this is the best way to return a sentence object, but it returns Span not String
print("\n Single sentence return:")
print(list(doc6.sents)[0])
print(type(list(doc6.sents)[0])) 

Single word return:
This

each sentence return:
This is a sentence.
This is also a sentence.
Surprisingly, this too is a sentence.

 Single sentence return:
This is a sentence.
<class 'spacy.tokens.span.Span'>


In [2]:
#add or change SENTENCE SEGMENTATION rules to better split text
import spacy
nlp = spacy.load('en_core_web_sm')
doc7 = nlp(u' "Management is bad; leadership is bad." -Gandhi')

print("1. default segmentation rules:")
for sent in doc7.sents:
    print(sent)
    print('\n')
    
print("\n 2. After adding segmentation rules:")
#we create a new component for our pipeline with additional sentence segmentation rules
#need to register the custom component with a name before adding to pipeline
@spacy.Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    for token in doc[:-1]:  #we iterate from the start to the end -1...
        if token.text == ';':
            doc[token.i+1].is_sent_start = True  #to prevent a segmentation fault here
    return doc

# nlp.add_pipe("set_custom_boundaries",before="parser")
print("new pipeline:")
print(nlp.pipe_names)# print(set_custom_boundaries(doc7))
print("new output:")
for sent in doc7.sents:
    print(sent)

print("\n 3. changing segmentation rules:")
mystring=u"This is a sentence. This is another. \n\n This is a \nthird."
print("Source string:")
print(mystring)
doc8=nlp(mystring)
print("Separated by sentence default:")
for sentence in doc8.sents:
    print(sentence)



1. default segmentation rules:
 "Management is bad; leadership is bad."


-Gandhi



 2. After adding segmentation rules:
new pipeline:
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
new output:
 "Management is bad; leadership is bad."
-Gandhi

 3. changing segmentation rules:
Source string:
This is a sentence. This is another. 

 This is a 
third.
Separated by sentence default:
This is a sentence.
This is another. 

 
This is a 
third.


In [5]:
#This code is outdated. Currently working to update this with 2023 spacy.
print("\n 4. Separated by custom SentenceSegmenter")
from spacy.pipeline import SentenceSegmenter
def split_on_newline(doc):
    start = 0
    seen_newline=False
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline=False
        elif word.text.startswith('\n'):
            seen_newline=True
    yield doc[start:]

segmented = SentenceSegmenter(nlp.vocab,strategy=split_on_newline)
nlp.add_pipe(segmented)
print("new pipeline:")
print(nlp.pipe_names)# print(set_custom_boundaries(doc7))
print("new output:")
doc9=nlp(mystring)
for sent in doc9.sents:
    print(sent)



 4. Separated by custom SentenceSegmenter


AttributeError: module 'spacy' has no attribute 'SentenceSegmenter'