In [82]:
import attr
import os
import spacy
import re
import torch
import ujson

import dateutil.parser
import numpy as np
import pandas as pd

from annoy import AnnoyIndex
from cached_property import cached_property
from glob import glob
from tqdm import tqdm_notebook
from itertools import islice
from boltons.iterutils import chunked_iter
from datetime import datetime as dt
from collections import UserDict

from sent_order.models import kt_regression as kt_reg

In [26]:
sent_encoder = torch.load(
    '../../plot-ordering/data/models/new/kt-reg/sent_encoder.68.bin',
    map_location={'cuda:0': 'cpu'}
)



In [27]:
nlp = spacy.load('en')
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [28]:
def clean_text(text):
    return re.sub('[\s]{2,}|\n', ' ', text.strip())

In [29]:
@attr.s
class Sentence:
    
    text = attr.ib()
        
    @cached_property
    def doc(self):
        return nlp(self.text, disable=['parser', 'tagger', 'ner'])
    
    def tokens(self):
        return [t.text for t in self.doc]
    
    def sent_order_x(self):
        return kt_reg.Sentence(self.tokens()).variable()
    
    def embedding(self):
        x = self.sent_order_x()
        return sent_encoder([x])[0].data.tolist()

In [30]:
@attr.s
class NewspaperSegment:
    
    path = attr.ib()
    
    def text(self):
        with open(self.path) as fh:
            return clean_text(fh.read())
        
    @cached_property
    def doc(self):
        return nlp(self.text(), disable=['tokenizer', 'parser', 'tagger', 'ner'])
    
    @cached_property
    def paper_name(self):
        return self.path.split(os.sep)[-3]
    
    @cached_property
    def date(self):
        return dt.strptime(self.path.split(os.sep)[-2], '%m-%d-%Y')
    
    def sentence_texts(self):
        for sent in self.doc.sents:
            yield sent.text

In [56]:
@attr.s
class NewspaperCorpus:
    
    root = attr.ib()
    
    def paths(self):
        return glob(os.path.join(self.root, '**/*.txt'), recursive=True)
    
    def segments(self):
        for path in tqdm_notebook(self.paths()):
            yield NewspaperSegment(path)
            
    def df_rows(self):
        for segment in self.segments():
            for text in segment.sentence_texts():
                yield dict(paper_name=segment.paper_name, date=segment.date, text=text)
                
    def df(self, skim=None):
        return pd.DataFrame(list(islice(self.df_rows(), skim)))

In [94]:
class YoutubeTranscript(UserDict):
    
    @classmethod
    def from_path(cls, path):
        with open(path) as fh:
            return cls(ujson.load(fh))
        
    @cached_property
    def doc(self):
        return nlp(self['transcript'], disable=['parser', 'tagger', 'ner'])
    
    @cached_property
    def published_at(self):
        return dateutil.parser.parse(self['published_at'])
    
    def sentence_texts(self, size=20):
        for chunk in chunked_iter(self.doc, 20):
            yield ' '.join([t.text for t in chunk])

In [95]:
@attr.s
class YoutubeCorpus:
    
    root = attr.ib()
    
    def paths(self):
        return glob(os.path.join(self.root, '**/*.json'), recursive=True)
    
    def transcripts(self):
        for path in tqdm_notebook(self.paths()):
            yield YoutubeTranscript.from_path(path)
            
    def df_rows(self):
        for transcript in self.transcripts():
            for text in transcript.sentence_texts():
                yield dict(
                    channel_title=transcript['channel_title'], 
                    title=transcript['title'], 
                    published_at=transcript.published_at,
                    text=text,
                )
                
    def df(self):
        return pd.DataFrame(list(self.df_rows()))

In [96]:
def build_index(text_series):
    
    sent_idx = AnnoyIndex(1000)

    id_text = list(text_series.iteritems())

    for chunk in chunked_iter(tqdm_notebook(id_text), 100):

        ids, texts = zip(*chunk)

        x = [Sentence(t).sent_order_x() for t in texts]
        x = sent_encoder(x)

        for i, v in zip(ids, x):
            sent_idx.add_item(i, v.data.tolist())

    sent_idx.build(10)
    
    return sent_idx

In [70]:
news_corpus = NewspaperCorpus('../data/kathy2012/newspapers2012/')

In [71]:
news_df = news_corpus.df(10000)




In [73]:
news_idx = build_index(news_df['text'])




In [97]:
yt_corpus = YoutubeCorpus('../data/kathy2012/youtube2012/')

In [98]:
yt_df = yt_corpus.df()




In [99]:
yt_idx = build_index(yt_df['text'])




In [76]:
def query(df, idx, text, n=10):
    for ri in idx.get_nns_by_vector(Sentence(text).embedding(), n):
        print(df.iloc[ri].text, '\n')

In [77]:
query(news_df, news_idx, "We have to act now to stop climate change.")

We have to be thoughtful about it.’’ 

We think the savings are real.’’ 

We keep it low-key and I'm surrounded by the unconditional love I craved as a child. 

And we’ve got smart guys that they want it, they almost need it. 

We the people have lost control. 

We have so many lost souls in here. 

We need to make sure that changes to our mining laws continue that tradition. 

We need to make sure that changes to our mining laws continue that tradition. 

We need the people in the state to have jobs, to work,’’ he said. 

We need that money to come up here. 



In [102]:
query(yt_df, yt_idx, "This is terrible.")

it is the largest city the most important city in terms of elections and so it really is good to 

this are you waiting for friends you walk it in should we walk in yeah what grade you 're going 

always has been with the states should it be I mean you know I know it always has been absolutely 

this bill was cynically transformed from a memorial resolution to an endorsement of President Bush 's failed policies the Republicans 

to be open to they 're going to require college degrees to get those so we got to find a 

important this election is to real life to President Barack Obama and select me to the United States Senate because 

this little piece out there at all by the way we 're talking about this before in terms of some 

this is the building blocks begin here and not only to the staff but to all of us who are 

the people listening to this speech tonight may not have voted for me but I want you to know that 

back to the start of this incredible country today we remember their