In [1]:
import attr
import os
import spacy
import re
import torch
import ujson

import dateutil.parser
import numpy as np
import pandas as pd

from annoy import AnnoyIndex
from cached_property import cached_property
from glob import glob
from tqdm import tqdm_notebook
from itertools import islice
from boltons.iterutils import chunked_iter, windowed_iter
from datetime import datetime as dt
from collections import UserDict

from sent_order.models import kt_regression as kt_reg

In [2]:
# Load the serialized sentence encoder from disk. The map_location dict
# remaps storages that were saved on 'cuda:0' onto the CPU, so no GPU is
# needed to load it.
# NOTE(review): torch.load unpickles arbitrary objects -- only point this at
# a checkpoint you trust.
sent_encoder = torch.load(
    '../../plot-ordering/data/models/new/kt-reg/sent_encoder.68.bin',
    map_location={'cuda:0': 'cpu'}
)



In [3]:
# Load the spaCy English pipeline and append a 'sentencizer' pipe to it.
# NOTE(review): the 'en' shortcut and create_pipe() look like the spaCy 2.x
# API -- confirm the pinned spaCy version before re-running.
nlp = spacy.load('en')
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [4]:
def clean_text(text):
    """Flatten ``text`` onto a single line with collapsed whitespace.

    Leading and trailing whitespace is stripped first. Then every run of two
    or more whitespace characters, and every remaining single newline, is
    replaced by one space. A lone non-newline whitespace character (e.g. a
    single tab) is left untouched.

    Args:
        text (str): Raw text.

    Returns:
        str: The cleaned text.
    """
    # Raw string literal: '\s' inside a plain string is an invalid escape
    # sequence and triggers a DeprecationWarning/SyntaxWarning on modern
    # Python versions. The compiled pattern is unchanged.
    return re.sub(r'[\s]{2,}|\n', ' ', text.strip())

In [5]:
@attr.s
class Sentence:
    """A single piece of text that can be tokenized and embedded."""

    text = attr.ib()

    @cached_property
    def doc(self):
        """spaCy document for ``text``, with parser, tagger and NER disabled."""
        skipped_pipes = ['parser', 'tagger', 'ner']
        return nlp(self.text, disable=skipped_pipes)

    def tokens(self):
        """Return the text of every token in ``doc`` as a list of strings."""
        words = []
        for token in self.doc:
            words.append(token.text)
        return words

    def sent_order_x(self):
        """Wrap the tokens in ``kt_reg.Sentence`` and return its ``variable()``."""
        wrapped = kt_reg.Sentence(self.tokens())
        return wrapped.variable()

    def embedding(self):
        """Encode this sentence with ``sent_encoder`` and return a plain list.

        The encoder is called on a one-element batch; the first result's
        ``.data`` is converted with ``tolist()``.
        """
        batch = [self.sent_order_x()]
        encoded = sent_encoder(batch)
        return encoded[0].data.tolist()

In [6]:
@attr.s
class NewspaperSegment:
    """One newspaper text file located at ``path``.

    Metadata is read from the path itself: the grandparent directory name is
    used as the paper name and the parent directory name is parsed as a
    ``%m-%d-%Y`` date.
    """

    path = attr.ib()

    def text(self):
        """Read the file and return its contents passed through ``clean_text``."""
        # Pin the encoding instead of inheriting the platform/locale default:
        # the saved query output contains non-ASCII characters (curly quotes,
        # the 'fi' ligature), so decoding must not depend on the machine.
        # Assumes the corpus files are UTF-8 -- TODO confirm.
        with open(self.path, encoding='utf-8') as fh:
            return clean_text(fh.read())

    @cached_property
    def doc(self):
        """spaCy document for the cleaned file text (computed once per instance).

        NOTE(review): 'tokenizer' is listed in ``disable`` alongside the real
        pipe names -- confirm spaCy treats it as intended. Sentence boundaries
        presumably come from the 'sentencizer' pipe added to ``nlp``.
        """
        return nlp(self.text(), disable=['tokenizer', 'parser', 'tagger', 'ner'])

    @cached_property
    def paper_name(self):
        """Third-from-last component of ``path`` when split on ``os.sep``."""
        return self.path.split(os.sep)[-3]

    @cached_property
    def date(self):
        """Second-from-last path component parsed as a ``%m-%d-%Y`` datetime.

        Raises:
            ValueError: If the directory name does not match the format.
        """
        return dt.strptime(self.path.split(os.sep)[-2], '%m-%d-%Y')

    def sentence_texts(self):
        """Yield the text of each sentence span in ``doc``."""
        for sent in self.doc.sents:
            yield sent.text

In [7]:
@attr.s
class NewspaperCorpus:
    """All ``.txt`` newspaper segments found anywhere below ``root``."""

    root = attr.ib()

    def paths(self):
        """Return every ``.txt`` path under ``root`` (recursive glob)."""
        pattern = os.path.join(self.root, '**/*.txt')
        return glob(pattern, recursive=True)

    def segments(self):
        """Yield one ``NewspaperSegment`` per path, behind a notebook progress bar."""
        yield from map(NewspaperSegment, tqdm_notebook(self.paths()))

    def df_rows(self):
        """Yield one row dict (paper_name, date, text) per sentence of every segment."""
        for segment in self.segments():
            for sentence in segment.sentence_texts():
                row = {
                    'paper_name': segment.paper_name,
                    'date': segment.date,
                    'text': sentence,
                }
                yield row

    def df(self, skim=None):
        """Collect the rows into a DataFrame, keeping at most ``skim`` rows.

        ``skim=None`` keeps every row.
        """
        rows = islice(self.df_rows(), skim)
        return pd.DataFrame(list(rows))

In [8]:
class YoutubeTranscript(UserDict):
    """Dict-like wrapper around one YouTube transcript JSON record.

    The methods here read the 'transcript' and 'published_at' keys.
    """

    @classmethod
    def from_path(cls, path):
        """Build a transcript from the JSON file at ``path``."""
        # JSON interchange text is UTF-8; decode it explicitly rather than
        # with whatever the platform/locale default encoding happens to be.
        with open(path, encoding='utf-8') as fh:
            return cls(ujson.load(fh))

    @cached_property
    def doc(self):
        """spaCy document for the 'transcript' text, with parser, tagger and NER disabled."""
        return nlp(self['transcript'], disable=['parser', 'tagger', 'ner'])

    @cached_property
    def published_at(self):
        """The 'published_at' value parsed by ``dateutil.parser.parse``."""
        return dateutil.parser.parse(self['published_at'])

    def sentence_texts(self, size=10):
        """Yield space-joined groups of ``size`` tokens from ``doc``.

        NOTE(review): ``windowed_iter`` is documented by boltons as a sliding
        (overlapping) window that yields nothing when the input is shorter
        than ``size`` -- confirm that both the overlap and the dropping of
        very short transcripts are intended.
        """
        for chunk in windowed_iter(self.doc, size):
            yield ' '.join([t.text for t in chunk])

In [9]:
@attr.s
class YoutubeCorpus:
    """All YouTube transcript ``.json`` files found anywhere below ``root``."""

    root = attr.ib()

    def paths(self):
        """Return every ``.json`` path under ``root`` (recursive glob)."""
        return glob(os.path.join(self.root, '**/*.json'), recursive=True)

    def transcripts(self):
        """Yield one ``YoutubeTranscript`` per path."""
        for path in self.paths():
            yield YoutubeTranscript.from_path(path)

    def df_rows(self):
        """Yield one row dict per text chunk of every transcript."""
        for transcript in self.transcripts():
            for text in transcript.sentence_texts():
                yield dict(
                    channel_title=transcript['channel_title'],
                    title=transcript['title'],
                    published_at=transcript.published_at,
                    text=text,
                )

    def df(self, skim=None):
        """Collect the rows into a DataFrame.

        Args:
            skim (int, optional): Keep at most this many rows. ``None`` (the
                default) keeps every row. Mirrors ``NewspaperCorpus.df`` so a
                small sample can be built without processing every file.

        Returns:
            pd.DataFrame: One row per text chunk.
        """
        return pd.DataFrame(list(islice(self.df_rows(), skim)))

In [10]:
@attr.s
class KathyTranscript:
    """One plain-text transcript file located at ``path``."""

    path = attr.ib()

    def lines(self):
        """Yield the lines of the file, without line terminators."""
        # Pin the encoding instead of inheriting the platform/locale default
        # (the saved output shows curly apostrophes in these transcripts).
        # Assumes the files are UTF-8 -- TODO confirm.
        with open(self.path, encoding='utf-8') as fh:
            content = fh.read()
        # Yield only after the file is closed, so a partially consumed
        # generator does not keep the handle open.
        yield from content.splitlines()

    def text(self):
        """Return all lines joined with single spaces."""
        return ' '.join(self.lines())

    @cached_property
    def doc(self):
        """spaCy document for ``text()`` (computed once per instance).

        NOTE(review): 'tokenizer' is listed in ``disable`` alongside the real
        pipe names -- confirm spaCy treats it as intended.
        """
        return nlp(self.text(), disable=['tokenizer', 'parser', 'tagger', 'ner'])

    @cached_property
    def basename(self):
        """File name portion of ``path``."""
        return os.path.basename(self.path)

    def sentence_texts(self):
        """Yield the text of each sentence span in ``doc``."""
        for sent in self.doc.sents:
            yield sent.text

In [11]:
@attr.s
class KathyCorpus:
    """All transcript ``.txt`` files found anywhere below ``root``."""

    root = attr.ib()

    def paths(self):
        """Return every ``.txt`` path under ``root`` (recursive glob)."""
        return glob(os.path.join(self.root, '**/*.txt'), recursive=True)

    def transcripts(self):
        """Yield one ``KathyTranscript`` per path."""
        for path in self.paths():
            yield KathyTranscript(path)

    def df_rows(self):
        """Yield one row dict (basename, text) per sentence of every transcript."""
        for transcript in self.transcripts():
            for text in transcript.sentence_texts():
                yield dict(
                    basename=transcript.basename,
                    text=text,
                )

    def df(self, skim=None):
        """Collect the rows into a DataFrame.

        Args:
            skim (int, optional): Keep at most this many rows. ``None`` (the
                default) keeps every row. Mirrors ``NewspaperCorpus.df`` so a
                small sample can be built without processing every file.

        Returns:
            pd.DataFrame: One row per sentence.
        """
        return pd.DataFrame(list(islice(self.df_rows(), skim)))

In [12]:
def build_index(text_series, dim=1000, batch_size=1000, n_trees=10):
    """Embed every text in ``text_series`` and build an Annoy index over them.

    Args:
        text_series (pd.Series): Texts to index. The Series index labels are
            used as the Annoy item ids, so they must be non-negative
            integers (the default RangeIndex of a freshly built DataFrame
            qualifies).
        dim (int): Vector length given to ``AnnoyIndex``. Presumably this
            must equal the output size of ``sent_encoder`` -- verify when
            swapping encoders.
        batch_size (int): Number of texts encoded per ``sent_encoder`` call.
        n_trees (int): Number of trees passed to ``AnnoyIndex.build``.

    Returns:
        AnnoyIndex: The built index.
    """
    # Pass the metric explicitly: relying on Annoy's implicit default is
    # deprecated. 'angular' is that default, so results are unchanged.
    sent_idx = AnnoyIndex(dim, metric='angular')

    # Series.iteritems() was removed in pandas 2.0; items() is the
    # equivalent that exists in both old and new versions. Materialised as
    # a list so the progress bar knows the total.
    id_text = list(text_series.items())

    for chunk in chunked_iter(tqdm_notebook(id_text), batch_size):

        ids, texts = zip(*chunk)

        # Encode the whole chunk in one call.
        x = [Sentence(t).sent_order_x() for t in texts]
        x = sent_encoder(x)

        for i, v in zip(ids, x):
            sent_idx.add_item(i, v.data.tolist())

    sent_idx.build(n_trees)

    return sent_idx

In [15]:
# Newspaper corpus rooted at the 2012 newspapers directory (relative path).
news_corpus = NewspaperCorpus('../data/kathy2012/newspapers2012/')

In [16]:
# One row per newspaper sentence: paper_name, date, text.
news_df = news_corpus.df()




In [17]:
# Annoy index over the newspaper sentence embeddings; item ids are the
# news_df index labels.
news_idx = build_index(news_df['text'])




In [18]:
# YouTube transcript corpus rooted at the 2012 youtube directory (relative path).
yt_corpus = YoutubeCorpus('../data/kathy2012/youtube2012/')

In [19]:
# One row per token window: channel_title, title, published_at, text.
yt_df = yt_corpus.df()

In [20]:
# Annoy index over the YouTube window embeddings; item ids are the yt_df
# index labels.
yt_idx = build_index(yt_df['text'])




In [13]:
# Transcript corpus rooted at the 2012 transcripts directory (relative path).
# NOTE(review): the execution counter drops back to In [13] from this cell
# on, so the notebook was run out of order -- verify with Restart & Run All.
kathy_corpus = KathyCorpus('../data/kathy2012/transcripts2012/')

In [14]:
# One row per transcript sentence: basename, text.
kathy_df = kathy_corpus.df()

In [15]:
# Annoy index over the transcript sentence embeddings; item ids are the
# kathy_df index labels.
kathy_idx = build_index(kathy_df['text'])




In [16]:
def query(df, idx, text, n=10):
    """Print the ``n`` indexed texts nearest to ``text``.

    Args:
        df (pd.DataFrame): Frame whose 'text' column was given to
            ``build_index`` to create ``idx``.
        idx (AnnoyIndex): Index to search.
        text (str): Query text; it is embedded via ``Sentence(text).embedding()``.
        n (int): Number of neighbours to print.

    Returns:
        None. Each hit is printed, followed by a blank line.
    """
    neighbor_ids = idx.get_nns_by_vector(Sentence(text).embedding(), n)
    for item_id in neighbor_ids:
        # build_index stores each vector under the Series index *label*, so
        # rows are looked up by label (.loc) rather than by position
        # (.iloc). The two coincide for a default RangeIndex, but only .loc
        # stays correct if the frame was filtered or re-indexed.
        print(df.loc[item_id, 'text'], '\n')

In [25]:
# Probe the newspaper index with a short positive statement.
query(news_df, news_idx, "This is wonderful.")

This is excellent. 

This makes us feel wonderful. 

This is good, good education policy reform.” 

This is great for children, grandchildren, best friends and families. 

This is a great day for the citizens of Wiscon- sin,’’ she said. ‘‘ 

This is amazing,” he said. “ 

This is a great day for the citi- zens of Wisconsin,” she said. “ 

This is a great day for the citizens of Wiscon- sin,” she said. “ 

This is great for board members, who come from a wide variety of disciplines, such as business, educa- tion, technology, and gov- ernment. “ 

This is great news for the state,” said Walker campaign spokeswoman Ciara Matthews. “( 



In [26]:
# Same probe against the YouTube index (rows are token windows, not sentences).
query(yt_df, yt_idx, "This is wonderful.")

this is a huge victory for Wisconsin 's middle class 

this has been an incredible journey one that has shown 

this is at a time when we 're supposed to 

this important piece of the challenge that lies ahead of 

this is an exciting day this is a day of 

this is one of our exciting days most exciting days 

this is one of our exciting days most exciting days 

this has been an incredible journey 14 months long this 

but this shows Kosovo as a whole a lot of 

this fundamental fairness issue we know from Tommy Thompson 's 



In [27]:
# Same probe against the transcript index.
query(kathy_df, kathy_idx, "This is wonderful.")

It's wonderful. 

That's wonderful. 

It's really great. 

This is Wisconsin. 

So great. 

This is very helpful. 

It's phenomenal. 

It's a good one. 

This has been such a great conversation. 

This has been... this welfare stuff has been getting out of control for years. 



In [29]:
# Probe the transcripts with a longer statement; in the saved output the
# first hit is a transcript sentence containing this text.
query(kathy_df, kathy_idx, "our kids are really well educated, but we need to take politics out of the education system.")

I mean our kids are really well educated, but we need to take politics out of the education system. 

Our tax system is too complicated, that's a national problem, of course. 

Our kids. 

Our children! 

I don't think cutting the budget of education means we're cutting the quality of education. 

I know you guys must have talked about the gridlock in Congress. 

I just cannot stomach listening to teachers complain, because they have to pay 10% of their health insurance premium. 

I think the insurance companies have way too much power. 

I think were really, were in a climate of like my side and your side and were throwing balls at each other instead of like trying to make a plan. 

I can see a situation with the school board; I think they might be going too far the other way. 



In [31]:
# Same statement (capitalised) against the newspaper index, for comparison.
query(news_df, news_idx, "Our kids are really well educated, but we need to take politics out of the education system.")

Our schools are all of the age where they need to have major maintenance, enough of patching the patches. 

Our kids here are reeling right now,” he said. “ 

Our shelter system turned away more than 2,000 people last year due to lack of space. 

Our leaders need to bring our nation back to what it once was, and do so with liberty and justice for all. 

Our city needs to be a place where big projects can succeed, despite elaborate bu- reaucracy. 

Our citizens live in a very nice village and they need to pay for that village.” 

Our intellectual depth increasingly resembles floor wax; shiny on top, but lacking depth. 

Our hope is to have as high a number of kids as possible take advan- tage of this really, really excellent program,” said Gilbertson. “ 

Our city politicians should be jumping at the chance and opportuni- ty to save millions of dollars that we tax- payers would not have to pay. 

Our entire community is affected by the failure to do so. 



In [32]:
# Probe the newspaper index with a statement about the two sides cooperating.
query(news_df, news_idx, "I wish both sides would work together and accomplish something instead of being on the campaign trail all the time.")

I hope that one of them is willing to do it for the sake of our state. 

I think that with the bud- get restraints and cutting taxes, people think we are leaving the children with a better financial future,” he said. “ 

I think they’ve to some degree employed a “gin up the base” strategy, which might ultimately keep Walker in the governor’s mansion but isn’t what a lot of voters signed up for. 

I wish I could do that in my business but I can’t because I don’t have the govern- ment giving me free money! 

I think in the case of everyone but the gover- nor, the leaks need to just stop. 

I think the environmental concerns are important, but the jobs are important, too,” he said. “ 

I think candidates should be able to campaign on their values instead of how much money they raise,” Warren said. 

I suspect that both major parties can see the stupidity and risk of the Electoral College, but both gamble they will game the sys- tem. 

I guess those folks need to be reminded that the loss 

In [33]:
# Probe the newspaper index with a statement about government-run health insurance.
query(news_df, news_idx, "The government was not meant to run health insurance.")

The bottom line is that a lot more people will be able to acquire and afford health insurance. 

The bottom line is that a lot more people will be able to acquire and afford health insurance. 

The government did not build universities or houses. 

The referendum was not whether or not to buy the (school). 

The presidency was a position he did not seek or desire. 

The schools were cre- ated as a way to further education because children were only required to be in school through the age of 14, Cameron said. 

The banks have insurance for rob- beries so they are not the victims. 

The water was too high for him to walk out. 

The meeting was good for the community,” Andrews said. “ 

The state was armed with witness accounts that the moon rocks sur- vived the fire and were not lumped with debris. “ 



In [40]:
# Probe the newspaper index, asking for 20 neighbours instead of the default 10.
query(news_df, news_idx, "my major concern is the fact that we're just going to pass this debt on to the kids, and I think that's embarrassing.", 20)

My concern is the severity of the citation. 

I want to assure you that this was not simply a campaign gim- mick, it is the honest truth and it is very much who I am. 

I think there is a legitimate concern or fear out there on the part of people that by giving these waivers, states might be ‘let off the hook’ in terms of ac- countability, and I think what you’ll find is just the opposite,” Wilhoit said. 

I do want the project to happen but can’t get to the TIF yes given my district.” 

My goal is to explain the forces that shaped him is to explain the forces that shaped him and why he thinks and acts as he does. 

My concern has been borne out,” Baldwin said, charging the deals have “led to a very signiﬁ - cant decrease in (Wisconsin’s) manufacturing sector, and if you Please see BALDWIN, Page A11 STEVE APPS — State Journal U.S. Rep. Tammy Baldwin, D- Madison, defends her congressional voting record Thursday during a meeting with the State Journal editorial board. 

My goal is to cut

In [42]:
# Probe the newspaper index with an exclamation about length of time.
query(news_df, news_idx, "My God, I've been here almost forty years.")

We’ve loved you since you created the modern era of politics-as-blood-sport in the House two decades ago. 

So I say ﬁ ve years ago. 

The lot used to house Burger King, which closed several years ago. 

My longest stint on radio and TV was with Mark Belling. 

It started as a whim a quarter century ago. 

My current favorite is “The Boy Who Came Back from Heaven.” 

The state Department of Justice sued Michael Todd Messmer of Oak Creek and Marilyn L. Broerman of Charlotte, N.C., two years ago. 

The museum opened 18 months ago. 

began in earnest decades ago. 

I wrote about it a few weeks ago. 



In [44]:
# Probe the transcript index with a statement about campaign money.
query(kathy_df, kathy_idx, "The money going into this campaign is so frustrating to see.")

And the media is such a, the way they do these ads and they pull this little something that this guy said and paste it in here. 

The political system that we have right now is this or that. 

That's why it's such a bitter battle, and there's so much money involved because you know how much money this loss of, you know, collective bargaining is for the unions, you know. 

A lot of them this weekend on the road. 

So small places are ignored and farmers in particular, do you feel? 

This has been... this welfare stuff has been getting out of control for years. 

There is no way to do it. 

And part of the problem is that so many people dont recognize that this is such a complex situation. 

Now with recalls is this the way the whole world is going to be now. 

So Ive just taken so much of this personally. 



In [46]:
# Probe the newspaper index with a short imperative statement.
query(news_df, news_idx, "They should mind their own business.")

They mind their own business. 

They want “their” country back. 

They’re going to make up their own numbers.” 

They’re going to make up their own numbers.” 

They’re going to make up their own numbers.’’ 

They’re going to make up their own numbers.” 

They say they want flexibility from the federal government on mandated programs to allow them to do more with less. 

They are not intended to be objective in their assessments, but rather they want you to think they are, and that’s the big lie. 

They must evaluate, their students and themselves. 

They make their own videos. 



In [53]:
# Shortened, capitalised variant of the earlier debt query, default 10 neighbours.
query(news_df, news_idx, "My major concern is the fact that we're just going to pass this debt on to the kids.")

My concern is, if in fact this person had a permit to carry this weapon under the new concealed-carry law, are there no rules or established protocol addressing the man- ner in which a concealed weapon is carried? 

My main objective right now is to show that we can provide as good a location that is more con- venient, more central to their 470 employees,” Soglin said Monday. 

My main objective right now is to show that we can provide as good a location that is more con- venient, more central to their 470 employees,” Soglin said Monday. 

My concern is why, if the principal knew this in advance, why didn’t he inform us?” 

Our main concern is for the well-being of all drivers, students and passengers involved in the accident that occurred in Waukesha this morning,” Viviani said in the email. “ 

We don’t know where (the culprits) are getting this information.” 

Our thoughts are with the families of those who were lost, and we wish the wounded a speedy recov- ery,” it said. “ 

Our th

In [40]:
# Probe the transcripts with a sentence that also appears verbatim as the
# first hit in the saved output.
query(kathy_df, kathy_idx, "Every morning, noon, night, is some negativity about a person on TV.")

Every morning, noon, night, is some negativity about a person on TV. 

Do you feel like if you contacted city council, like your alderperson, that person would be responsive and listen to what you had to say? 

Your private life should be private, you know, even if you're in public office. 

So you can buy a house, buy a car, have kids, try and raise a family and start with $40,000 in the hole. 

Unless you're a multimillionaire, you cannot afford to run a campaign. 

You cannot be constantly bickering and fighting and getting nothing done. 

Do you know what a ticket to Seattle is right now? 

Do people know that every time we have an election we spend X amount of taxpayer dollars? 

And you know whats really cool in terms of politics is how many people know Citizens United? 

Do you think they'll have a good season next year? 



In [43]:
# Probe the transcripts with another sentence that appears verbatim as the
# first hit in the saved output.
query(kathy_df, kathy_idx, "You cannot be constantly bickering and fighting and getting nothing done.")

You cannot be constantly bickering and fighting and getting nothing done. 

Or does it not necessarily matter  so much? 

You don’t necessarily have to know who that alderperson is. [ 

You know they're not, in… they really have to start looking at that as young people because the days of the retirement plans and stuff they kind of got them, they're just kind of weeding those out. 

You've got a pile of money here, you've got a pile of bills here you've got to pay. 

You are not getting a ride there. 

Why the hell is it someone else's fault because someone didn't have kids until late in life? 

You know what all that is about. 

You'll never get in. 

You know, how can you say this is this fault or that fault? 

