In [32]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [33]:
doc = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [34]:
for sent in doc.sents:
  print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [35]:
doc.sents[0]

TypeError: 'generator' object is not subscriptable

In [None]:
list(doc.sents)[0]

In [None]:
type(list(doc.sents)[0])

In [37]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [38]:
for sent in doc.sents:
  print(sent)
  print('\n')

"Management is doing the right things; leadership is doing the right things."


- Peter Drucker




In [39]:
# Add a segmentation rule
from spacy.language import Language

@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
  for token in doc[:-1]:
    if token.text == ';':
      doc[token.i+1].is_sent_start = True
  return doc

In [None]:
doc[:-1]

In [None]:
nlp.add_pipe("set_custom_boundaries", before="parser")

print(nlp.pipe_names)

['tok2vec', 'tagger', 'set_custom_boundaries', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [42]:
doc4 = nlp(
    '"Management is doing the right things; leadership is doing the right things." - Peter Drucker'
)

In [43]:
for sent in doc4.sents:
  print(sent)

"Management is doing the right things;
leadership is doing the right things."
- Peter Drucker


In [None]:
# Change segmentation rules

In [44]:
nlp = spacy.load('en_core_web_sm')

In [45]:
mystring = u"This is a sentence. This is another.\n\nThis is \nthird sentence."

In [46]:
print(mystring)

This is a sentence. This is another.

This is 
third sentence.


In [47]:
doc = nlp(mystring)

In [54]:
sentences = [sent.text.strip() for sent in doc.sents]

# Display the sentences
for sentence in sentences:
    print(sentence)

This is a sentence.
This is another.
This is 
third sentence.


In [51]:
@Language.component("split_on_newlines")
def split_on_newlines(doc):
  start = 0
  seen_newline = False

  for token in doc:
    if seen_newline:
        doc[start].is_sent_start = True
        start = token.i
        seen_newline = False
    elif token.text.startswith("\n"):
        seen_newline = True

  return doc

nlp.add_pipe("split_on_newlines", before="parser")

<function __main__.split_on_newlines(doc)>

In [53]:
doc = nlp(mystring)

for sent in doc.sents:
  print(sent)

This is a sentence.
This is another.


This is 
third sentence.
