In [3]:
# Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
# From Spacy Basics:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [5]:
print(doc[1])

is


In [6]:
print(doc.sents[1])

TypeError: 'generator' object is not subscriptable

In [9]:
doc_sents = [sent for sent in doc.sents]

doc_sen = list(doc.sents)
doc_sents, doc_sen

([This is the first sentence.,
  This is another sentence.,
  This is the last sentence.],
 [This is the first sentence.,
  This is another sentence.,
  This is the last sentence.])

In [10]:
print(doc_sents[1])

This is another sentence.


In [11]:
type(doc_sents[1])

spacy.tokens.span.Span

In [12]:
print(doc_sents[1].start, doc_sents[1].end)

6 11


In [13]:
# Parsing the segmentation start tokens happens during the nlp pipeline
doc2 = nlp(u'This is a sentence. This is a sentence. This is a sentence.')

for token in doc2:
    print(token.is_sent_start, ' '+token.text)

True  This
False  is
False  a
False  sentence
False  .
True  This
False  is
False  a
False  sentence
False  .
True  This
False  is
False  a
False  sentence
False  .


In [14]:
# SPACY'S DEFAULT BEHAVIOR
doc3 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc3.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter Drucker


In [17]:
# ADD A NEW RULE TO THE PIPELINE
from spacy.language import Language
@Language.component("custom_boundaries")
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

nlp.add_pipe("custom_boundaries", before='parser')

nlp.pipe_names

['tok2vec',
 'tagger',
 'custom_boundaries',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [18]:
# Re-run the Doc object creation:
doc4 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc4.sents:
    print(sent)

"Management is doing things right;
leadership is doing the right things."
-Peter Drucker


In [19]:
# And yet the new rule doesn't apply to the older Doc object:
for sent in doc3.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter Drucker


In [20]:
doc3[7]

leadership

In [21]:
# Try to change the .is_sent_start attribute:
doc3[7].is_sent_start = True

ValueError: [E043] Refusing to write to token.sent_start if its document is parsed, because this may cause inconsistent state.

In [22]:
nlp = spacy.load('en_core_web_sm')  # reset to the original

mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

# SPACY DEFAULT BEHAVIOR:
doc = nlp(mystring)

for sent in doc.sents:
    print([token.text for token in sent])

['This', 'is', 'a', 'sentence', '.']
['This', 'is', 'another', '.', '\n\n']
['This', 'is', 'a', '\n', 'third', 'sentence', '.']


In [25]:
# CHANGING THE RULES

from spacy.language import Language
@Language.component("custom_segmenter")
def split_on_newlines(doc):
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'): # handles multiple occurrences
            seen_newline = True
    yield doc[start:]      # handles the last group of tokens



nlp.add_pipe("custom_segmenter")

<function __main__.split_on_newlines(doc)>

In [None]:
from spacy.lang.en import English
nlp = English()
nlp.add_pipe("rest_countries", config={"label": "GPE"})

In [26]:
doc = nlp(mystring)
for sent in doc.sents:
    print([token.text for token in sent])
    

ValueError: [E005] Pipeline component 'custom_segmenter' returned <class 'generator'> instead of a Doc. If you're using a custom component, maybe you forgot to return the processed Doc?