In [130]:
import attr
import os
import spacy
import re
import torch
import ujson

import dateutil.parser
import numpy as np
import pandas as pd

from annoy import AnnoyIndex
from cached_property import cached_property
from glob import glob
from tqdm import tqdm_notebook
from itertools import islice
from boltons.iterutils import chunked_iter, windowed_iter
from datetime import datetime as dt
from collections import UserDict

from sent_order.models import kt_regression as kt_reg

In [131]:
sent_encoder = torch.load(
    '../../plot-ordering/data/models/new/kt-reg/sent_encoder.68.bin',
    map_location={'cuda:0': 'cpu'}
)



In [27]:
nlp = spacy.load('en')
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [28]:
def clean_text(text):
    return re.sub('[\s]{2,}|\n', ' ', text.strip())

In [29]:
@attr.s
class Sentence:
    
    text = attr.ib()
        
    @cached_property
    def doc(self):
        return nlp(self.text, disable=['parser', 'tagger', 'ner'])
    
    def tokens(self):
        return [t.text for t in self.doc]
    
    def sent_order_x(self):
        return kt_reg.Sentence(self.tokens()).variable()
    
    def embedding(self):
        x = self.sent_order_x()
        return sent_encoder([x])[0].data.tolist()

In [30]:
@attr.s
class NewspaperSegment:
    
    path = attr.ib()
    
    def text(self):
        with open(self.path) as fh:
            return clean_text(fh.read())
        
    @cached_property
    def doc(self):
        return nlp(self.text(), disable=['tokenizer', 'parser', 'tagger', 'ner'])
    
    @cached_property
    def paper_name(self):
        return self.path.split(os.sep)[-3]
    
    @cached_property
    def date(self):
        return dt.strptime(self.path.split(os.sep)[-2], '%m-%d-%Y')
    
    def sentence_texts(self):
        for sent in self.doc.sents:
            yield sent.text

In [56]:
@attr.s
class NewspaperCorpus:
    
    root = attr.ib()
    
    def paths(self):
        return glob(os.path.join(self.root, '**/*.txt'), recursive=True)
    
    def segments(self):
        for path in tqdm_notebook(self.paths()):
            yield NewspaperSegment(path)
            
    def df_rows(self):
        for segment in self.segments():
            for text in segment.sentence_texts():
                yield dict(paper_name=segment.paper_name, date=segment.date, text=text)
                
    def df(self, skim=None):
        return pd.DataFrame(list(islice(self.df_rows(), skim)))

In [160]:
class YoutubeTranscript(UserDict):
    
    @classmethod
    def from_path(cls, path):
        with open(path) as fh:
            return cls(ujson.load(fh))
        
    @cached_property
    def doc(self):
        return nlp(self['transcript'], disable=['parser', 'tagger', 'ner'])
    
    @cached_property
    def published_at(self):
        return dateutil.parser.parse(self['published_at'])
    
    def sentence_texts(self, size=10):
        for chunk in windowed_iter(self.doc, size):
            yield ' '.join([t.text for t in chunk])

In [137]:
@attr.s
class YoutubeCorpus:
    
    root = attr.ib()
    
    def paths(self):
        return glob(os.path.join(self.root, '**/*.json'), recursive=True)
    
    def transcripts(self):
        for path in self.paths():
            yield YoutubeTranscript.from_path(path)
            
    def df_rows(self):
        for transcript in self.transcripts():
            for text in transcript.sentence_texts():
                yield dict(
                    channel_title=transcript['channel_title'], 
                    title=transcript['title'], 
                    published_at=transcript.published_at,
                    text=text,
                )
                
    def df(self):
        return pd.DataFrame(list(self.df_rows()))

In [138]:
@attr.s
class KathyTranscript:
    
    path = attr.ib()
    
    def lines(self):
        with open(self.path) as fh:
            for line in fh.read().splitlines():
                yield line
                
    def text(self):
        return ' '.join(self.lines())
        
    @cached_property
    def doc(self):
        return nlp(self.text(), disable=['tokenizer', 'parser', 'tagger', 'ner'])
    
    @cached_property
    def basename(self):
        return os.path.basename(self.path)
    
    def sentence_texts(self):
        for sent in self.doc.sents:
            yield sent.text

In [139]:
@attr.s
class KathyCorpus:
    
    root = attr.ib()
    
    def paths(self):
        return glob(os.path.join(self.root, '**/*.txt'), recursive=True)
    
    def transcripts(self):
        for path in self.paths():
            yield KathyTranscript(path)
            
    def df_rows(self):
        for transcript in self.transcripts():
            for text in transcript.sentence_texts():
                yield dict(
                    basename=transcript.basename,
                    text=text,
                )
                
    def df(self):
        return pd.DataFrame(list(self.df_rows()))

In [140]:
def build_index(text_series):
    
    sent_idx = AnnoyIndex(1000)

    id_text = list(text_series.iteritems())

    for chunk in chunked_iter(tqdm_notebook(id_text), 100):

        ids, texts = zip(*chunk)

        x = [Sentence(t).sent_order_x() for t in texts]
        x = sent_encoder(x)

        for i, v in zip(ids, x):
            sent_idx.add_item(i, v.data.tolist())

    sent_idx.build(10)
    
    return sent_idx

In [121]:
news_corpus = NewspaperCorpus('../data/kathy2012/newspapers2012/')

In [122]:
news_df = news_corpus.df(10000)




In [123]:
news_idx = build_index(news_df['text'])




In [161]:
yt_corpus = YoutubeCorpus('../data/kathy2012/youtube2012/')

In [162]:
yt_df = yt_corpus.df()

In [163]:
yt_idx = build_index(yt_df['text'])




In [127]:
kathy_corpus = KathyCorpus('../data/kathy2012/transcripts2012/')

In [128]:
kathy_df = kathy_corpus.df()

In [129]:
kathy_idx = build_index(kathy_df['text'])




In [151]:
def query(df, idx, text, n=10):
    for ri in idx.get_nns_by_vector(Sentence(text).embedding(), n):
        print(df.iloc[ri].text, '\n')

In [152]:
query(news_df, news_idx, "This is wonderful.")

This is just so wonderful.” 

This is just so wonderful.” 

This is a great way to spend a cold winter evening. 

This is one of the characteristics that make our country great. 

This is a great opportunity for us today,’’ Walker said. ‘‘ 

This is great research-based information about how to be successful with whatev- er the topic is.” 

This is a wonder- ful opportunity. “ 

This is by far the most ambitious thing I’ve done in my career. 

This is a very good venue for local artists to show their work,” Lorber said. “ 

This is all because we’re in the process of negotiating the permanent rule,’’ she said. 



In [166]:
query(yt_df, yt_idx, "This is wonderful.")

this is a huge victory for Wisconsin 's middle class 

this has been an incredible journey one that has shown 

this is at a time when we 're supposed to 

this important piece of the challenge that lies ahead of 

this is an exciting day this is a day of 

this is one of our exciting days most exciting days 

this is one of our exciting days most exciting days 

this has been an incredible journey 14 months long this 

but this shows Kosovo as a whole a lot of 

this fundamental fairness issue we know from Tommy Thompson 's 



In [135]:
query(kathy_df, kathy_idx, "This is wonderful.")

It's wonderful. 

That's wonderful. 

It's really great. 

This is Wisconsin. 

So great. 

This is very helpful. 

It's phenomenal. 

It's a good one. 

This has been such a great conversation. 

This has been... this welfare stuff has been getting out of control for years. 

