In [120]:
import attr
import os
import spacy
import re

import numpy as np

from cached_property import cached_property
from glob import glob
from torchtext.vocab import Vectors

In [79]:
# Load the spaCy model registered as 'en' and add a 'sentencizer' component to
# its pipeline. `Segment.sentences` iterates `doc.sents` on documents produced
# by this `nlp` object.
nlp = spacy.load('en')
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [117]:
# Word-vector table built from 'glove.840B.300d.txt'; `Sentence.vector_dan`
# indexes it by token text.
vectors = Vectors('glove.840B.300d.txt')

In [101]:
def clean_text(text):
    """Trim `text` and flatten whitespace runs and line breaks to single spaces.

    Leading and trailing whitespace is stripped first. Then every run of two
    or more whitespace characters, and every single newline, is replaced by
    one space. A lone non-newline whitespace character (for example a single
    tab between two words) is left unchanged.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Raw string literal: '\s' inside a plain literal is an invalid string
    # escape and triggers a warning at compile time on modern Pythons.
    return re.sub(r'\s{2,}|\n', ' ', text.strip())

In [155]:
@attr.s
class Sentence:
    """One sentence of text, processed on demand with the module-level `nlp`."""

    # The sentence's text, passed to `nlp` as-is.
    text = attr.ib()

    @cached_property
    def doc(self):
        """Return `nlp` applied to `self.text` with parser, tagger and NER disabled."""
        skipped = ['parser', 'tagger', 'ner']
        return nlp(self.text, disable=skipped)

    def tokens(self):
        """Return a list holding the `.text` of every token in `self.doc`."""
        words = []
        for token in self.doc:
            words.append(token.text)
        return words

    def vector_dan(self):
        """Return the mean over axis 0 of the `vectors` entries for each token.

        Every token from `tokens()` is looked up in the module-level `vectors`
        table, converted to a plain list, and the rows are averaged with
        `np.mean`.
        """
        rows = []
        for word in self.tokens():
            rows.append(vectors[word].tolist())
        return np.mean(rows, axis=0)

In [156]:
@attr.s
class Segment:
    """A text file, identified by `path`, that can be split into sentences."""

    # Filesystem location of the file to read.
    path = attr.ib()

    def text(self):
        """Read the file at `self.path` and return its contents run through `clean_text`."""
        with open(self.path) as handle:
            raw = handle.read()
        return clean_text(raw)

    @cached_property
    def doc(self):
        """Return `nlp` applied to `self.text()` with the listed names disabled."""
        # NOTE(review): 'tokenizer' is kept exactly as written; confirm whether
        # the pipeline actually has a component by that name.
        skipped = ['tokenizer', 'parser', 'tagger', 'ner']
        return nlp(self.text(), disable=skipped)

    def sentences(self):
        """Yield a `Sentence` built from the text of each span in `self.doc.sents`."""
        yield from (Sentence(span.text) for span in self.doc.sents)

In [157]:
@attr.s
class NewspaperCorpus:
    """A directory tree of `.txt` files exposed as segments and sentences."""

    # Directory under which `.txt` files are searched for.
    root = attr.ib()

    def paths(self):
        """Return the sorted list of paths matching `**/*.txt` under `self.root`.

        The pattern is expanded with `recursive=True`. Results are sorted
        because `glob` makes no ordering guarantee; without sorting, "the
        first segment" or "the n-th path" could differ between machines or
        runs.
        """
        pattern = os.path.join(self.root, '**/*.txt')
        return sorted(glob(pattern, recursive=True))

    def segments(self):
        """Yield one `Segment` per path, in the order returned by `paths()`."""
        for path in self.paths():
            yield Segment(path)

    def sentences(self):
        """Yield every sentence of every segment, segment by segment."""
        for segment in self.segments():
            yield from segment.sentences()

In [158]:
# Corpus whose `.txt` files are looked up under this root directory.
c = NewspaperCorpus('../data/kathy2012/newspapers2012/')

In [159]:
# Generator over every sentence in the corpus; no work happens until it is iterated.
sents = c.sentences()

In [160]:
# Take the first segment the corpus yields (order follows `paths()`).
seg = next(c.segments())

In [161]:
# Materialise the segment's sentences and display the averaged vector of the
# sentence at index 50.
list(seg.sentences())[50].vector_dan()

array([ 1.16079751e-01,  7.67797520e-02,  1.58425460e-02,  9.04020471e-03,
        1.88838151e-01,  1.61405149e-02, -6.11104304e-02,  2.02422897e-01,
        6.94794003e-02,  2.17762578e+00,  2.60233564e-02, -3.99143423e-02,
        9.87007338e-02, -1.24269025e-04, -7.27600504e-02, -1.31157401e-01,
       -1.05119752e-01,  8.20595861e-01, -6.12178512e-02, -1.10416892e-02,
       -3.85753071e-02,  7.89622011e-02, -4.14466657e-02,  5.20343993e-02,
        4.64679023e-02, -1.03831300e-01, -1.68361682e-01,  3.77670506e-02,
        1.74237892e-02, -1.21406998e-02,  1.46070765e-01,  3.04711652e-01,
       -4.56274953e-03,  6.00779916e-03,  1.23606851e-01, -1.08725413e-01,
        9.71560003e-03, -2.03691004e-02,  3.89082047e-02, -9.23848990e-02,
        1.05770700e-01, -9.55240485e-02, -1.40809872e-02, -2.01666050e-01,
        5.04097565e-02, -2.95507797e-02, -1.92805402e-01, -4.16759603e-02,
       -1.34415001e-01,  8.08078210e-02, -3.37444960e-02,  4.11042401e-02,
       -5.18174527e-02,  