In [199]:
import attr
import os
import spacy
import re

import numpy as np

from annoy import AnnoyIndex
from cached_property import cached_property
from glob import glob
from torchtext.vocab import Vectors
from tqdm import tqdm_notebook
from itertools import islice

In [79]:
# Shared spaCy pipeline used by Sentence.doc and Segment.doc below.
# A 'sentencizer' component is appended to the loaded 'en' model; Segment.doc
# disables the parser and then reads `doc.sents`, so sentence boundaries
# presumably come from this component — confirm against the spaCy version used.
nlp = spacy.load('en')
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [117]:
# Pretrained word vectors loaded through torchtext; Sentence.vector_dan looks
# tokens up in this object. The file name suggests 300-dimensional GloVe
# vectors (not verified here).
vectors = Vectors('glove.840B.300d.txt')

In [101]:
def clean_text(text):
    """Trim `text` and flatten its line breaks and whitespace runs.

    Leading and trailing whitespace is stripped, then every newline and every
    run of two or more whitespace characters is replaced by a single space.
    A lone non-newline whitespace character (e.g. one tab) is left as is.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    # Raw string literal: '\s' inside a plain string is an invalid escape
    # sequence, which newer Python versions warn about at compile time.
    return re.sub(r'\s{2,}|\n', ' ', text.strip())

In [224]:
@attr.s
class Sentence:
    """A single sentence of raw text with lazy tokenization and embedding."""

    # Raw sentence text.
    text = attr.ib()

    @cached_property
    def doc(self):
        """spaCy `Doc` for `text`, computed once per instance.

        The parser, tagger and NER components are skipped for this call.
        """
        return nlp(self.text, disable=['parser', 'tagger', 'ner'])

    def tokens(self):
        """Return the token strings of `doc` as a list."""
        return [t.text for t in self.doc]

    def vector_dan(self):
        """Average the word vectors of all tokens into one sentence vector.

        Each token is looked up in the module-level `vectors` object and the
        results are averaged element-wise.

        Returns:
            A numpy array with the mean token vector. If the sentence has no
            tokens, a zero vector of length `vectors.dim` is returned instead
            of the NaN scalar that `np.mean` produces for an empty list.
        """
        embeds = [
            vectors[t].squeeze().tolist()
            for t in self.tokens()
        ]

        # Nothing to average (e.g. empty text): fall back to a zero vector so
        # callers always receive a fixed-length array.
        if not embeds:
            return np.zeros(vectors.dim)

        return np.mean(embeds, 0)

In [225]:
@attr.s
class Segment:
    """One text file of the corpus, split lazily into sentences."""

    # Filesystem path of the text file.
    path = attr.ib()

    def text(self):
        """Read the file at `path` and return its contents via `clean_text`."""
        # Explicit encoding so decoding does not depend on the platform's
        # locale default. NOTE(review): assumes the corpus files are UTF-8.
        with open(self.path, encoding='utf-8') as fh:
            return clean_text(fh.read())

    @cached_property
    def doc(self):
        """spaCy `Doc` for the whole file, computed once per instance.

        The parser, tagger and NER components are skipped for this call.
        """
        # Only actual pipeline components are listed: tokenization always
        # runs before the pipeline and cannot be switched off through
        # `disable`.
        return nlp(self.text(), disable=['parser', 'tagger', 'ner'])

    def sentences(self):
        """Yield a `Sentence` for each sentence span in `doc.sents`."""
        for sent in self.doc.sents:
            yield Sentence(sent.text)

In [226]:
@attr.s
class NewspaperCorpus:
    """A directory tree of `.txt` files exposed as segments and sentences."""

    # Root directory that is searched for text files.
    root = attr.ib()

    def paths(self):
        """Return every `.txt` path under `root` (recursively), sorted.

        Sorting makes the iteration order, and therefore any prefix taken
        from `sentences()`, independent of the filesystem's listing order.
        """
        pattern = os.path.join(self.root, '**/*.txt')
        return sorted(glob(pattern, recursive=True))

    def segments(self):
        """Yield one `Segment` per path, wrapped in a notebook progress bar."""
        for path in tqdm_notebook(self.paths()):
            yield Segment(path)

    def sentences(self):
        """Yield every `Sentence` of every segment, in path order."""
        for segment in self.segments():
            yield from segment.sentences()

In [227]:
# Corpus over the newspapers directory; the path is relative, so it resolves
# against the notebook's working directory.
c = NewspaperCorpus('../data/kathy2012/newspapers2012/')

In [253]:
# Materialize the first 100000 sentences of the corpus stream, keyed by their
# position in that stream (0, 1, 2, ...).
sents = dict(enumerate(islice(c.sentences(), 100000)))

In [254]:
# Number of sentences actually collected (at most the islice limit).
len(sents)

100000

In [255]:
# Nearest-neighbour index over the averaged-word-vector sentence embeddings.
# The metric is passed explicitly ('angular' is what Annoy used when none was
# given) and the dimensionality comes from the loaded vectors instead of a
# hard-coded constant.
idx_dan = AnnoyIndex(vectors.dim, 'angular')

# Item ids are the keys of `sents`, so query results map straight back to it.
for i, sent in tqdm_notebook(sents.items()):
    idx_dan.add_item(i, sent.vector_dan())

# Build the index with 10 trees.
idx_dan.build(10)




True

In [262]:
# Wrap a free-text query in a Sentence so it is embedded by the same
# vector_dan() method used for the indexed sentences.
query = Sentence("I'm worried about the economy.")

# Look up the 10 nearest items for the query vector; the returned ids are the
# keys of `sents`, so each hit's text can be printed directly.
for ri in idx_dan.get_nns_by_vector(query.vector_dan(), 10):
    print(sents[ri].text, '\n')

I think there’s concern out there. 

I feel very good about where I’m at. 

I’m sure the neighbors up the road think I’m a lunatic. 

I just hate the idea of losing it.” 

I’m just not comfortable about something like this and saw no point to it. 

I count all the states I’m glad I’m not the governor of.” 

I’m so hurt, I guess I’m just look- ing for some input into this. 

And this election isn’t just about what we think about unions. 

I’m already on the bal- lot. 

I’m writing now because I have a problem. 

