In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
doc = nlp(u'This is the firs sentece. This is another sentence. This is the last sentence')

In [4]:
for sent in doc.sents:
    print(sent)

This is the firs sentece.
This is another sentence.
This is the last sentence


In [7]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [8]:
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [9]:
for sent in doc.sents:
    print(sent)

"Management is doing the right things; leadership is doing the right things."
-Peter Drucker


In [10]:
# Add segementation rule
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i + 1].is_sent_start = True
    return doc

In [11]:
nlp.add_pipe(set_custom_boundaries, before='parser')

In [12]:
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [13]:
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [14]:
for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things."
-Peter Drucker


In [62]:
# Change segmentation rules

In [63]:
nlp = spacy.load('en_core_web_sm')

In [64]:
mystring = u"This is a sentence. This is another.\n\nThis is a  \nthird sentence"

In [65]:
print(mystring)

This is a sentence. This is another.

This is a  
third sentence


In [66]:
doc = nlp(mystring)

In [67]:
for sent in doc.sents:
    print(sent)

This is a sentence.
This is another.


This is a  
third sentence


In [68]:
def split_on_newline(doc):
    seen_newline = False
    start = 0
    for word in doc:
        if seen_newline == True:
            yield doc[start:word.i]
            start = word.i
            seen_newline=False
        elif word.text.startswith('\n'):
            seen_newline = True
    yield doc[start:]
            

In [69]:
from spacy.pipeline import SentenceSegmenter

In [70]:
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newline)

In [71]:
nlp.add_pipe(sbd)

In [72]:
doc = nlp(mystring)

In [73]:
for sent in doc.sents:
    print(sent)

This is a sentence. This is another.


This is a  
third sentence
