In [1]:
import spacy 
nlp = spacy.load('en_core_web_sm')

In [2]:
# From Spacy Basics:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [3]:
doc2 = nlp(u'"In the comming hours; the legendary manchester United will not be legendary anymore against Arsenal Fc HAHa" -> Peter Drury')

In [5]:
for sent in doc2.sents:
    print(sent)
    print('\n')

"In the comming hours the legendary manchester United will not be legendary anymore against Arsenal Fc HAHa" -> Peter Drury




# Adding a new rule to the Pipelines 

In [16]:
from spacy.language import Language

# ADD A NEW RULE TO THE PIPELINE
@Language.component("set_custom_boundaries")  # Assign a name to the custom component
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i].is_sent_start = True
    return doc

nlp.add_pipe("set_custom_boundaries", before="parser")  # Add the custom component by its name

print(nlp.pipe_names)


['tok2vec', 'tagger', 'set_custom_boundaries', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [17]:
doc3 = nlp(u'"In the comming hours; the legendary manchester United will not be legendary anymore against Arsenal Fc HAHa" -> Peter Drury')

In [18]:
for sent in doc3.sents:
    print(sent)

"In the comming hours
; the legendary manchester United will not be legendary anymore against Arsenal Fc HAHa" -> Peter Drury


In [19]:
# Changing the segmentation rules ie to break on line breaks etc

In [20]:
# Reload the spacy lib
nlp = spacy.load('en_core_web_sm')

In [21]:
mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."
print(mystring)

This is a sentence. This is another.

This is a 
third sentence.


In [22]:
doc = nlp(mystring)

In [23]:
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another.


This is a 
third sentence.


In [27]:
import spacy
from spacy.language import Language

@Language.component("custom_sentencizer")
def custom_sentencizer(doc):
    for i, token in enumerate(doc[:-2]):
        # Define sentence start if pipe + titlecase token
        if token.text == "|" and doc[i + 1].is_title:
            doc[i + 1].is_sent_start = True
        else:
            # Explicitly set sentence start to False otherwise, to tell
            # the parser to leave those tokens alone
            doc[i + 1].is_sent_start = False
    return doc

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("custom_sentencizer", before="parser")  # Insert before the parser
doc = nlp(mystring)
for sent in doc.sents:
    print(sent.text)


This is a sentence. This is another.

This is a 
third sentence.


In [28]:
doc = nlp(mystring)
for sent in doc.sents:
    print([token.text for token in sent])

['This', 'is', 'a', 'sentence', '.', 'This', 'is', 'another', '.', '\n\n', 'This', 'is', 'a', '\n', 'third', 'sentence', '.']
