In [78]:
import attr
import os
import spacy
import re

from cached_property import cached_property
from glob import glob

In [79]:
# Shared spaCy pipeline used by Segment.doc below.
# A sentencizer pipe is appended here; Segment.doc passes 'parser' in its
# `disable` list and Segment.sentences reads `doc.sents`, so presumably the
# sentencizer is what supplies the sentence boundaries.
# NOTE(review): the 'en' shortcut and `nlp.create_pipe` look like spaCy v2-style
# calls -- confirm the installed spaCy version before upgrading.
nlp = spacy.load('en')
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [101]:
def clean_text(text):
    """Normalize whitespace in ``text`` so it reads as a single line.

    Leading and trailing whitespace is stripped first. After that, every run
    of two or more whitespace characters, and every newline, is replaced by
    one space. A lone non-newline whitespace character (for example a single
    tab between two words) is left unchanged.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    # Raw string literal: in a plain literal '\s' is an invalid string escape,
    # which Python reports as a DeprecationWarning/SyntaxWarning.
    return re.sub(r'\s{2,}|\n', ' ', text.strip())

In [109]:
@attr.s
class Sentence:
    """A single sentence, held as its tokens."""
    # Tokens of the sentence; Segment.sentences passes a list of token text strings.
    tokens = attr.ib()

In [110]:
@attr.s
class Segment:
    """One text file of the corpus, identified by its filesystem path."""

    # Path of the text file backing this segment.
    path = attr.ib()

    def text(self):
        """Read the file at ``path`` and return its contents passed through ``clean_text``."""
        with open(self.path) as source:
            raw = source.read()
        return clean_text(raw)

    @cached_property
    def doc(self):
        """Return the result of running ``nlp`` over the segment text.

        The value is computed on first access and cached on the instance.
        'parser', 'tagger' and 'ner' are passed in the ``disable`` list.
        """
        skipped = ['parser', 'tagger', 'ner']
        return nlp(self.text(), disable=skipped)

    def sentences(self):
        """Yield a ``Sentence`` of token texts for each entry in ``doc.sents``."""
        for span in self.doc.sents:
            words = [token.text for token in span]
            yield Sentence(words)

In [111]:
@attr.s
class NewspaperCorpus:
    """A collection of ``.txt`` files found beneath a root directory."""

    # Directory that is searched recursively for .txt files.
    root = attr.ib()

    def paths(self):
        """Return a list of every ``.txt`` path under ``root``, at any depth."""
        return glob(os.path.join(self.root, '**/*.txt'), recursive=True)

    def segments(self):
        """Yield one ``Segment`` per path returned by ``paths``."""
        for path in self.paths():
            yield Segment(path)

    def sentences(self):
        """Yield every sentence of every segment, segment by segment."""
        for segment in self.segments():
            yield from segment.sentences()

    # Backward-compatible alias: the method was originally published under this
    # misspelled name, and existing callers still use it.
    sentenecs = sentences

In [112]:
# Corpus rooted at the 2012 newspapers directory (relative path).
c = NewspaperCorpus('../data/kathy2012/newspapers2012/')

In [113]:
# Generator over every sentence in the corpus; nothing is read until it is iterated.
sents = c.sentenecs()

In [114]:
# First segment yielded by the corpus (order follows the glob result order).
seg = next(c.segments())

In [115]:
# Materialize this segment's sentences and display the one at index 50.
list(seg.sentences())[50]

Sentence(tokens=['The', 'sprawling', 'complex', 'has', 'two', 'minarets', 'rising', 'over', 'an', 'expressway', 'that', 'leads', 'to', 'the', 'John', 'F.', 'Kennedy', 'International', 'Airport', '.'])