In [70]:
import attr
import os
import spacy
import re
import torch

import numpy as np
import pandas as pd

from annoy import AnnoyIndex
from cached_property import cached_property
from glob import glob
from tqdm import tqdm_notebook
from itertools import islice
from boltons.iterutils import chunked_iter
from datetime import datetime as dt

from sent_order.models import kt_regression as kt_reg

In [None]:
# Load the pickled sentence encoder, remapping tensors saved on 'cuda:0' to
# the CPU so the notebook runs without a GPU.
# NOTE(review): torch.load unpickles arbitrary objects -- only point this at
# checkpoint files you trust.
sent_encoder = torch.load(
    '../../plot-ordering/data/models/new/kt-reg/sent_encoder.68.bin',
    map_location={'cuda:0': 'cpu'}
)

# The encoder is only used for inference in this notebook (embedding
# sentences), so switch it to evaluation mode: otherwise any dropout /
# batch-norm layers keep their training behaviour and embeddings may vary
# from call to call. Assumes the checkpoint is an nn.Module -- TODO confirm.
# (Trailing ';' suppresses the module repr as cell output.)
sent_encoder.eval();

In [101]:
# Shared spaCy pipeline for the whole notebook: load the 'en' model, then add
# a 'sentencizer' component to the pipeline. Segment.doc runs with the parser
# disabled and iterates over `doc.sents`.
nlp = spacy.load('en')
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [102]:
def clean_text(text):
    """Collapse whitespace runs and newlines in `text` into single spaces.

    Leading and trailing whitespace is stripped first. Then every run of two
    or more whitespace characters, and every remaining newline, is replaced
    by one space. A lone non-newline whitespace character (e.g. a single tab)
    is left untouched.

    Args:
        text (str): Raw text.

    Returns:
        str: The cleaned text.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # (a warning today, slated to become an error). `\s{2,}` is equivalent to
    # the original `[\s]{2,}`.
    return re.sub(r'\s{2,}|\n', ' ', text.strip())

In [103]:
@attr.s
class Sentence:
    """A single sentence of raw text, with helpers to tokenize and embed it.

    Relies on the module-level `nlp` pipeline and `sent_encoder` model.
    """

    # Raw sentence string.
    text = attr.ib()

    @cached_property
    def doc(self):
        """spaCy doc for `text`, built once and cached on the instance.

        The parser, tagger and NER components are disabled for this call.
        """
        skipped = ['parser', 'tagger', 'ner']
        return nlp(self.text, disable=skipped)

    def tokens(self):
        """Return the text of every token in `doc`, in order, as a list."""
        return list(token.text for token in self.doc)

    def sent_order_variable(self):
        """Wrap the tokens in a `kt_reg.Sentence` and return its `variable()`."""
        wrapped = kt_reg.Sentence(self.tokens())
        return wrapped.variable()

    def embedding(self):
        """Encode this sentence with `sent_encoder` and return it as a list.

        The encoder is called on a one-element batch; the first output's
        `.data` is converted with `tolist()`.
        """
        batch = [self.sent_order_variable()]
        encoded = sent_encoder(batch)
        return encoded[0].data.tolist()

In [104]:
@attr.s
class Segment:
    """One text file of the newspaper corpus.

    The path is expected to end in `<paper name>/<MM-DD-YYYY>/<file>`:
    `paper_name` and `date` are read from those two directory components.
    """

    # Filesystem path of the segment's text file.
    path = attr.ib()

    def text(self):
        """Read the file and return its contents passed through `clean_text`.

        The file is decoded explicitly as UTF-8 rather than with the
        locale-dependent default, because the corpus contains non-ASCII
        characters (curly quotes, dashes) that would otherwise decode
        differently -- or fail -- depending on the machine.
        NOTE(review): assumes the corpus files are UTF-8 -- confirm.
        """
        with open(self.path, encoding='utf-8') as fh:
            return clean_text(fh.read())

    @cached_property
    def doc(self):
        """spaCy doc for the segment text, built once and cached.

        The parser, tagger and NER are disabled for this call;
        `sentence_texts` reads sentence spans from the result.
        """
        return nlp(self.text(), disable=['tokenizer', 'parser', 'tagger', 'ner'])

    @cached_property
    def paper_name(self):
        """Third-from-last path component (the directory two levels up)."""
        return self.path.split(os.sep)[-3]

    @cached_property
    def date(self):
        """Parent directory name parsed as a `%m-%d-%Y` datetime.

        Raises:
            ValueError: If the directory name does not match that format.
        """
        return dt.strptime(self.path.split(os.sep)[-2], '%m-%d-%Y')

    def sentence_texts(self):
        """Yield the text of each sentence span in `doc`, in order."""
        for sent in self.doc.sents:
            yield sent.text

In [105]:
@attr.s
class NewspaperCorpus:
    """A directory tree of `.txt` newspaper segments."""

    # Root directory that is searched recursively for `.txt` files.
    root = attr.ib()

    def paths(self):
        """Return every `.txt` path under `root` (recursive), sorted.

        glob returns matches in arbitrary filesystem order; sorting makes the
        row order -- and therefore the ids stored in the sentence index --
        reproducible from run to run.
        """
        pattern = os.path.join(self.root, '**/*.txt')
        return sorted(glob(pattern, recursive=True))

    def segments(self):
        """Yield a `Segment` per path, showing a notebook progress bar."""
        for path in tqdm_notebook(self.paths()):
            yield Segment(path)

    def df_rows(self, skim=None):
        """Yield one dict per sentence, suitable for building a DataFrame.

        Args:
            skim (int | None): Maximum number of rows to yield, for quick
                partial runs. `None` (the default) yields every row.
                Previously this argument was accepted but ignored.

        Yields:
            dict: Keys `paper_name`, `date` and `text`.
        """
        rows = (
            dict(paper_name=segment.paper_name, date=segment.date, text=text)
            for segment in self.segments()
            for text in segment.sentence_texts()
        )
        # islice(rows, None) passes everything through unchanged.
        yield from islice(rows, skim)

In [106]:
# Corpus rooted at the directory below (relative to the notebook's working directory).
c = NewspaperCorpus('../data/kathy2012/newspapers2012/')

In [78]:
# Materialize every sentence row (paper_name, date, text) of the corpus into one DataFrame.
sent_db = pd.DataFrame(list(c.df_rows()))

In [80]:
# Preview only the first rows rather than rendering the whole frame.
sent_db.head(10)

Unnamed: 0,date,paper_name,text
0,2012-01-31,Ashland Daily Post,"NATION THE DAILY PRESS – Ashland, WI – Tuesday..."
1,2012-01-31,Ashland Daily Post,"It is the race you make it,’’ an upset-minded ..."
2,2012-01-31,Ashland Daily Post,"In the race’s final hours, former Massachusett..."
3,2012-01-31,Ashland Daily Post,From Sioux City in the western part of the sta...
4,2012-01-31,Ashland Daily Post,That and the $13 million or more already spent...
5,2012-01-31,Ashland Daily Post,Romney had one eye on his GOP rivals and anoth...
6,2012-01-31,Ashland Daily Post,"The president has been ‘‘a great divider, the ..."
7,2012-01-31,Ashland Daily Post,"Later, before a noisy crowd in Marion, he pre-..."
8,2012-01-31,Ashland Daily Post,We’re going to win this thing with all of our ...
9,2012-01-31,Ashland Daily Post,Texas Rep. Ron Paul flew into the state accom-...


In [107]:
# Length of the vectors stored in the index; must equal the length of the
# vectors produced by sent_encoder.
EMBED_DIM = 1000
# Number of sentences encoded per sent_encoder call.
BATCH_SIZE = 1000
# Number of trees Annoy builds.
N_TREES = 10

# 'angular' is Annoy's default metric; passing it explicitly keeps newer
# Annoy versions from warning about an unspecified metric.
sent_idx = AnnoyIndex(EMBED_DIM, metric='angular')

# Key every sentence by its row *position*, which is what query() passes to
# sent_db.iloc. This stays correct even if sent_db's index is not 0..n-1,
# and avoids Series.iteritems(), which was removed in pandas 2.0.
id_text = list(enumerate(sent_db['text']))

for chunk in chunked_iter(tqdm_notebook(id_text), BATCH_SIZE):

    ids, texts = zip(*chunk)

    # Encode the whole chunk in one call to the encoder.
    variables = [Sentence(t).sent_order_variable() for t in texts]
    embeddings = sent_encoder(variables)

    for i, emb in zip(ids, embeddings):
        sent_idx.add_item(i, emb.data.tolist())

sent_idx.build(N_TREES)




True

In [90]:
def query(text, n=10):
    """Print the `n` indexed sentences nearest to `text`.

    The query text is embedded via `Sentence.embedding`, looked up in
    `sent_idx`, and each returned id is used as a row position into
    `sent_db`. Each hit is printed followed by a blank line.

    Args:
        text (str): Query sentence.
        n (int): Number of neighbours to print.
    """
    vector = Sentence(text).embedding()
    neighbor_rows = sent_idx.get_nns_by_vector(vector, n)
    for row in neighbor_rows:
        print(sent_db.iloc[row].text, '\n')

In [92]:
# Try the index with a short example sentence.
query("This is wonderful.")

This is just so wonderful.” 

This is just so wonderful.” 

This is a great way to spend a cold winter evening. 

This is one of the characteristics that make our country great. 

This is a great opportunity for us today,’’ Walker said. ‘‘ 

This is great research-based information about how to be successful with whatev- er the topic is.” 

This is a wonder- ful opportunity. “ 

This is by far the most ambitious thing I’ve done in my career. 

This is a very good venue for local artists to show their work,” Lorber said. “ 

This is all because we’re in the process of negotiating the permanent rule,’’ she said. 



In [98]:
# Dump the sentence table as a JSON array of records, followed by a newline.
with open('../data/sent_db.json', 'w') as fh:
    fh.write(sent_db.to_json(orient='records'))
    fh.write('\n')

In [94]:
# Write the built Annoy index to disk.
sent_idx.save('../data/sent_idx.bin')

True