In [166]:
import attr
import os
import spacy
import re

import numpy as np

from cached_property import cached_property
from glob import glob
from torchtext.vocab import Vectors
from tqdm import tqdm_notebook

In [79]:
# Shared spaCy pipeline, loaded via the 'en' shortcut name.
nlp = spacy.load('en')
# Append a 'sentencizer' component; Segment.doc disables the parser but still
# iterates `.sents`, so sentence boundaries presumably come from this component.
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [117]:
# Token -> vector lookup used by Sentence.vector_dan.
# Presumably 300-dimensional GloVe vectors, judging by the filename — confirm.
vectors = Vectors('glove.840B.300d.txt')

In [101]:
def clean_text(text):
    """Collapse whitespace in ``text`` so that it reads as a single line.

    Leading and trailing whitespace is stripped first. After that, every run
    of two or more whitespace characters and every newline is replaced by a
    single space. A lone tab (or any other single non-newline whitespace
    character) is left untouched.

    Args:
        text (str): Raw text to normalise.

    Returns:
        str: The cleaned text.
    """
    # Raw string literal: '\s' inside a plain string is an invalid escape
    # sequence (DeprecationWarning, SyntaxWarning on newer Pythons).
    return re.sub(r'\s{2,}|\n', ' ', text.strip())

In [155]:
@attr.s
class Sentence:
    """A single sentence of raw text that is tokenized lazily."""

    # Raw sentence string; the only constructor argument.
    text = attr.ib()

    @cached_property
    def doc(self):
        """Run the module-level ``nlp`` pipeline over ``self.text``.

        The 'parser', 'tagger' and 'ner' components are disabled for this
        call. The result is cached on the instance after the first access.
        """
        skipped = ['parser', 'tagger', 'ner']
        return nlp(self.text, disable=skipped)

    def tokens(self):
        """Return the text of every token in ``self.doc`` as a list."""
        words = []
        for token in self.doc:
            words.append(token.text)
        return words

    def vector_dan(self):
        """Average the per-token vectors into one sentence-level vector.

        Each token is looked up in the module-level ``vectors`` table, and
        the resulting rows are averaged along axis 0.
        """
        rows = [vectors[token].tolist() for token in self.tokens()]
        return np.mean(rows, axis=0)

In [171]:
@attr.s
class Segment:
    """One text file of the corpus, addressed by its path."""

    # Location of the segment's text file.
    path = attr.ib()

    def text(self):
        """Read the file at ``self.path`` and return it run through ``clean_text``.

        The file is re-read on every call and is opened with the platform's
        default text encoding.
        """
        with open(self.path) as handle:
            raw = handle.read()
        return clean_text(raw)

    @cached_property
    def doc(self):
        """Run the module-level ``nlp`` pipeline over the segment text.

        The result is cached on the instance after the first access.
        """
        # NOTE(review): 'tokenizer' is probably not a pipeline component name,
        # so listing it here likely has no effect — confirm against the spaCy
        # version in use.
        skipped = ['tokenizer', 'parser', 'tagger', 'ner']
        return nlp(self.text(), disable=skipped)

    def sentences(self):
        """Lazily yield a ``Sentence`` for each sentence span in ``self.doc``."""
        yield from (Sentence(span.text) for span in self.doc.sents)

In [172]:
@attr.s
class NewspaperCorpus:
    """A directory tree of ``.txt`` files treated as a corpus of segments."""

    # Root directory that is searched recursively for ``.txt`` files.
    root = attr.ib()

    def paths(self):
        """Return every ``.txt`` path under ``self.root``, sorted.

        ``glob`` yields matches in an arbitrary, filesystem-dependent order.
        Sorting keeps the iteration order (and therefore any index assigned
        by enumerating the sentences) stable across runs and machines.

        Returns:
            list: Matching paths in lexicographic order.
        """
        pattern = os.path.join(self.root, '**/*.txt')
        return sorted(glob(pattern, recursive=True))

    def segments(self):
        """Yield a ``Segment`` per path, showing a notebook progress bar."""
        for path in tqdm_notebook(self.paths()):
            yield Segment(path)

    def sentences(self):
        """Yield every sentence of every segment, in path order."""
        for segment in self.segments():
            yield from segment.sentences()

In [173]:
# Corpus rooted at the newspapers2012 directory (relative path).
c = NewspaperCorpus('../data/kathy2012/newspapers2012/')

In [174]:
# Materialise every sentence in the corpus, keyed by its running index
# (0, 1, 2, ...) in iteration order. This walks the whole corpus.
sents = dict(enumerate(c.sentences()))




In [175]:
# Total number of sentences collected from the corpus.
len(sents)

785611