In [149]:
import attr
import os
import spacy
import re
import torch
import sent2vec
import ujson

import dateutil.parser
import numpy as np
import pandas as pd

from annoy import AnnoyIndex
from cached_property import cached_property
from glob import glob
from tqdm import tqdm_notebook
from itertools import islice
from boltons.iterutils import chunked_iter, windowed_iter
from datetime import datetime as dt
from collections import UserDict
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from torchtext.vocab import Vectors

from sent_order.models import kt_regression as kt_reg

In [140]:
# Load pretrained word vectors from the GloVe text file via torchtext.
# `vectors[token]` is looked up per token in Sentence.cbow_vector.
# NOTE(review): presumably 300-dimensional (file name, AnnoyIndex(300) in the CBOW section) -- confirm.
vectors = Vectors('glove.840B.300d.txt')

In [None]:
# Load the serialized sentence encoder used by the "Sentence ordering" section.
# map_location remaps storages saved on 'cuda:0' onto the CPU.
# NOTE(review): torch.load unpickles arbitrary objects -- only load checkpoints from a trusted source.
sent_encoder = torch.load(
    '../../plot-ordering/data/models/new/kt-reg/sent_encoder.68.bin',
    map_location={'cuda:0': 'cpu'}
)

In [150]:
# Instantiate sent2vec and load the model file; used by Sentence.sent2vec_vector.
# NOTE(review): presumably produces 600-dimensional vectors, since the sent2vec index is AnnoyIndex(600) -- confirm.
sent2vec_model = sent2vec.Sent2vecModel()
sent2vec_model.load_model('../data/wiki_unigrams.bin')

In [3]:
# Load the spaCy 'en' pipeline and append a 'sentencizer' component.
# NOTE(review): presumably so that `doc.sents` is still available when the parser
# is disabled, as NewspaperSegment.doc does -- confirm.
nlp = spacy.load('en')
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [4]:
def clean_text(text):
    """Normalize whitespace in `text`.

    Leading/trailing whitespace is stripped, then every newline and every run
    of two or more whitespace characters is replaced by a single space.
    A lone tab or other single non-newline whitespace character is left as is.
    """
    # Raw string: '\s' inside a plain string literal is an invalid escape
    # sequence and triggers a warning on modern Python.
    return re.sub(r'[\s]{2,}|\n', ' ', text.strip())

In [151]:
@attr.s
class Sentence:
    """One sentence of raw text plus the encodings used by the indexes in this notebook."""

    text = attr.ib()

    @cached_property
    def doc(self):
        """spaCy doc for `text`, built with parser, tagger and ner disabled (cached per instance)."""
        skipped_pipes = ['parser', 'tagger', 'ner']
        return nlp(self.text, disable=skipped_pipes)

    def tokens(self):
        """Return the text of every token in `doc` as a list of strings."""
        token_texts = []
        for tok in self.doc:
            token_texts.append(tok.text)
        return token_texts

    def sent_order_x(self):
        """Wrap the tokens in a kt_reg.Sentence and return its `.variable()` encoder input."""
        wrapped = kt_reg.Sentence(self.tokens())
        return wrapped.variable()

    def sent_order_vector(self):
        """Encode this sentence with `sent_encoder` (as a batch of one) and return a plain list."""
        batch = [self.sent_order_x()]
        encoded = sent_encoder(batch)
        return encoded[0].data.tolist()

    def cbow_vector(self):
        """Return the element-wise mean of the word vectors of all tokens."""
        token_vecs = []
        for tok in self.tokens():
            token_vecs.append(vectors[tok].squeeze().tolist())
        return np.mean(token_vecs, axis=0)

    def sent2vec_vector(self):
        """Return the sent2vec embedding of the raw (untokenized) text."""
        raw = self.text
        return sent2vec_model.embed_sentence(raw)

In [6]:
@attr.s
class NewspaperSegment:
    """A single newspaper text file on disk.

    `paper_name` and `date` are derived from the path, so the layout is
    presumably `<...>/<paper_name>/<MM-DD-YYYY>/<file>.txt` -- verify against
    the data directory.
    """

    path = attr.ib()

    def text(self):
        """Read the file and return its whitespace-normalized contents."""
        # Explicit encoding: the corpus contains non-ASCII characters (curly
        # quotes, ligatures), so relying on the platform default codec makes
        # the result machine-dependent.
        with open(self.path, encoding='utf-8') as fh:
            return clean_text(fh.read())

    @cached_property
    def doc(self):
        """spaCy doc of the file text (cached per instance).

        The listed components are disabled for this call; sentence boundaries
        are then read from `doc.sents` in `sentence_texts`.
        """
        return nlp(self.text(), disable=['tokenizer', 'parser', 'tagger', 'ner'])

    @cached_property
    def paper_name(self):
        """Third-from-last component of `path` (the grandparent directory name)."""
        return self.path.split(os.sep)[-3]

    @cached_property
    def date(self):
        """Parent directory name parsed as a `%m-%d-%Y` datetime.

        Raises ValueError if the directory name does not match that format.
        """
        return dt.strptime(self.path.split(os.sep)[-2], '%m-%d-%Y')

    def sentence_texts(self):
        """Yield the text of each sentence in `doc`."""
        for sent in self.doc.sents:
            yield sent.text

In [7]:
@attr.s
class NewspaperCorpus:
    """A directory tree of newspaper `.txt` files rooted at `root`."""

    root = attr.ib()

    def paths(self):
        """Return every `.txt` path under `root`, searched recursively."""
        pattern = os.path.join(self.root, '**/*.txt')
        return glob(pattern, recursive=True)

    def segments(self):
        """Lazily yield one NewspaperSegment per path, with a notebook progress bar."""
        yield from map(NewspaperSegment, tqdm_notebook(self.paths()))

    def df_rows(self):
        """Yield one dict (paper_name, date, text) per sentence of every segment."""
        for seg in self.segments():
            for sentence_text in seg.sentence_texts():
                yield {
                    'paper_name': seg.paper_name,
                    'date': seg.date,
                    'text': sentence_text,
                }

    def df(self, skim=None):
        """Build a DataFrame from the first `skim` sentence rows (all rows when `skim` is None)."""
        limited_rows = islice(self.df_rows(), skim)
        return pd.DataFrame(list(limited_rows))

In [9]:
# Corpus rooted at the 2012 newspaper directory; the path is relative to the
# notebook's working directory.
news_corpus = NewspaperCorpus('../data/kathy2012/newspapers2012/')

In [10]:
# One row per sentence (columns: paper_name, date, text). With the default
# skim=None nothing is truncated, so this reads and processes every file.
news_df = news_corpus.df()




In [131]:
# Work on the first 100,000 sentence rows only; every index below is built
# from (and every query looks results up in) this subset.
skim_df = news_df.head(100000)

# doc2vec

In [164]:
# Tokenize every sentence and tag it with its DataFrame index label; the tag
# is the document id that doc2vec associates with the sentence.
doc2vec_docs = []
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
# `row_id` avoids shadowing the builtin `id`; `total` lets tqdm show progress
# for what is otherwise an unsized iterator.
for row_id, text in tqdm_notebook(skim_df['text'].items(), total=len(skim_df)):
    tokens = Sentence(text).tokens()
    doc2vec_docs.append(TaggedDocument(tokens, [row_id]))




In [None]:
# Train doc2vec on the tagged sentences.
# dm=0 selects the distributed-bag-of-words variant and dbow_words=1 also
# trains word vectors alongside the document vectors (per gensim's Doc2Vec docs).
doc2vec_model = Doc2Vec(dm=0, dbow_words=1, vector_size=300, window=8, min_count=5, epochs=10, workers=8)
# Fixed: this cell previously referenced `docs` and `model`, neither of which
# is defined in the notebook, so it failed on a fresh kernel.
doc2vec_model.build_vocab(doc2vec_docs)
doc2vec_model.train(
    doc2vec_docs,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)

In [119]:
# Approximate nearest-neighbour index over the trained document vectors.
# The dimension is taken from the model so the two cannot drift apart, and the
# metric is spelled out ('angular' is what Annoy uses when none is given).
doc2vec_idx = AnnoyIndex(doc2vec_model.vector_size, metric='angular')

# Fixed: previously iterated over `model.docvecs...`, but `model` is never
# defined in the notebook; the trained model is `doc2vec_model`.
for i, v in tqdm_notebook(enumerate(doc2vec_model.docvecs.vectors_docs)):
    doc2vec_idx.add_item(i, v)

# Build the index with 10 trees.
doc2vec_idx.build(10)




True

In [161]:
def query_doc2vec(text, n=10):
    """Print the `n` indexed sentences nearest to `text` in doc2vec space.

    The query is tokenized with `Sentence`, its vector is inferred by
    `doc2vec_model`, and each neighbour id returned by `doc2vec_idx` is looked
    up positionally in `skim_df`.
    """
    query_tokens = Sentence(text).tokens()
    query_vec = doc2vec_model.infer_vector(query_tokens)
    neighbor_ids = doc2vec_idx.get_nns_by_vector(query_vec, n)
    for neighbor in neighbor_ids:
        match = skim_df.iloc[neighbor]
        print(match['text'], '\n')

In [162]:
# Probe the doc2vec index: prints the default 10 nearest sentences to this query.
query_doc2vec('This is fantastic.')

No more “hum- bug.” 

This is all a complete lie. 

It’s nationwide.” 

Every delegate matters this year.” 

This community is non- smoking. 

Get out.” 

day of Christmas shopping. 

This community is non- smoking. 

This is unbelievable. 

This is no mainstream media conspiracy. 



In [163]:
# Second doc2vec probe, a first-person sentence; prints the default 10 neighbours.
query_doc2vec("I'm worried about the future.")

And we’re not likely to in the future.” 

It provides an example of what you can do,” he said. “ 

They do about 600 shows a year. 

I was disappointed that they’re not holding it here. 

We are not going to get bogged down with a lot of rules,” he said. 

We are not here to tell you what to think,” he said, intro- ducing the discussion. “ 

And they did. 

It sounds like they don’t want to be there,’’ Cullen said. ‘‘ 

I’m hoping there will be a quick solution,” Gregoire said. “ 

Somebody’s going to pay for this and it’s going to be taxpayers.” 



# Sentence ordering

In [124]:
# Nearest-neighbour index over sentence-encoder outputs.
# The first argument is the vector length Annoy expects; it must match the
# size of the vectors `sent_encoder` emits. The metric is spelled out
# ('angular' is what Annoy uses when none is given).
sent_order_idx = AnnoyIndex(1000, metric='angular')

# Series.items() replaces iteritems(), which was removed in pandas 2.0.
# Materialized as a list so tqdm knows the total.
id_text = list(skim_df['text'].items())

# Encode in batches of 1000 sentences (unrelated to the 1000 above).
for chunk in chunked_iter(tqdm_notebook(id_text), 1000):

    ids, texts = zip(*chunk)

    # Separate names for the encoder inputs and outputs (previously both `x`).
    batch_inputs = [Sentence(t).sent_order_x() for t in texts]
    batch_encoded = sent_encoder(batch_inputs)

    for i, v in zip(ids, batch_encoded):
        sent_order_idx.add_item(i, v.data.tolist())

# Build the index with 10 trees.
sent_order_idx.build(10)




True

In [136]:
def query_sent_order(text, n=10):
    """Print the `n` indexed sentences nearest to `text` in sentence-encoder space.

    The query vector comes from `Sentence.sent_order_vector`; each neighbour id
    returned by `sent_order_idx` is looked up positionally in `skim_df`.
    """
    query_vec = Sentence(text).sent_order_vector()
    neighbor_ids = sent_order_idx.get_nns_by_vector(query_vec, n)
    for neighbor in neighbor_ids:
        match = skim_df.iloc[neighbor]
        print(match['text'], '\n')

In [166]:
# Probe the sentence-encoder index: prints the default 10 nearest sentences.
query_sent_order('This is fantastic.')

This is unbelievable. 

This is great for children, grandchildren, best friends and families. 

This is very exciting,” Miller said. “ 

This is one impressive wordsmith. 

This is Wisconsin,” a confident Ng said. “ 

This is a way of encouraging use — it’s just a rather crude way.” 

This isn’t Nevada. 

This is a whole new level of politics,” said Duffy. “ 

This is a ﬁ ne example of the Elks “Caring and Sharing.” 

This hearing aid is amazing! 



In [160]:
# Second sentence-encoder probe, a first-person sentence; prints the default 10 neighbours.
query_sent_order("I'm worried about the future.")

I love him very much and see myself with him in the future. 

I'm worried that my distance is affecting her health. 

I'm afraid she won't make it that long. 

I think we all hope it won’t mean the end of Kodak because it still has a lot to offer.” 

I'm afraid for her safety. 

I hope we meet again some day. 

I wish she would have discussed her plan with me in advance. 

I believe any time you do something good and excit- ing it will attract people to your location. 

I have serious concerns. “ 

I think anybody that can beat Obama is great,” he said. “ 



# CBOW

In [144]:
# Nearest-neighbour index over averaged word vectors (Sentence.cbow_vector).
# 300 must match the word-vector length; the metric is spelled out
# ('angular' is what Annoy uses when none is given).
cbow_idx = AnnoyIndex(300, metric='angular')

# Series.items() replaces iteritems(), which was removed in pandas 2.0.
# `row_id` avoids shadowing the builtin `id`; `total` gives tqdm a length.
for row_id, text in tqdm_notebook(skim_df['text'].items(), total=len(skim_df)):
    vector = Sentence(text).cbow_vector()
    cbow_idx.add_item(row_id, vector)

# Build the index with 10 trees.
cbow_idx.build(10)




True

In [147]:
def query_cbow(text, n=10):
    """Print the `n` indexed sentences nearest to `text` by averaged word vector.

    The query vector comes from `Sentence.cbow_vector`; each neighbour id
    returned by `cbow_idx` is looked up positionally in `skim_df`.
    """
    query_vec = Sentence(text).cbow_vector()
    neighbor_ids = cbow_idx.get_nns_by_vector(query_vec, n)
    for neighbor in neighbor_ids:
        match = skim_df.iloc[neighbor]
        print(match['text'], '\n')

In [148]:
# Probe the CBOW index: prints the default 10 nearest sentences.
query_cbow('This is fantastic.')

This is unbelievable. 

This is just so wonderful.” 

This is just so wonderful.” 

This is ridiculous. 

This is horrible. 

It’s so wonderful. 

It’s just amazing. 

This is our liveli- hood. 

It’s not pretty. 

This is payback. 



In [159]:
# Second CBOW probe, a first-person sentence; prints the default 10 neighbours.
query_cbow("I'm worried about the future.")

I have considered seeing a therapist, but I don’t know how I can do that with- out giving my parents an explanation about why I’m going. 

I feel very good about where I’m at. 

I’m very proud of what they did. 

I'm open to just about anything." 

I know I might be mis- taking love for infatua- tion — I’ve heard all this before. 

I’m so hurt, I guess I’m just look- ing for some input into this. 

I didn’t really have time to think about it. 

I count all the states I’m glad I’m not the governor of.” 

So I called dad, be- cause I wanted to know all the details.” 

Until read- ing this book I knew little about this man, and now I want to know more. 



# sent2vec

In [152]:
# Nearest-neighbour index over sent2vec embeddings (Sentence.sent2vec_vector).
# 600 must match the embedding length of the loaded sent2vec model; the metric
# is spelled out ('angular' is what Annoy uses when none is given).
sent2vec_idx = AnnoyIndex(600, metric='angular')

# Series.items() replaces iteritems(), which was removed in pandas 2.0.
# `row_id` avoids shadowing the builtin `id`; `total` gives tqdm a length.
for row_id, text in tqdm_notebook(skim_df['text'].items(), total=len(skim_df)):
    vector = Sentence(text).sent2vec_vector()
    sent2vec_idx.add_item(row_id, vector)

# Build the index with 10 trees.
sent2vec_idx.build(10)




True

In [153]:
def query_sent2vec(text, n=10):
    """Print the `n` indexed sentences nearest to `text` in sent2vec space.

    The query vector comes from `Sentence.sent2vec_vector`; each neighbour id
    returned by `sent2vec_idx` is looked up positionally in `skim_df`.
    """
    query_vec = Sentence(text).sent2vec_vector()
    neighbor_ids = sent2vec_idx.get_nns_by_vector(query_vec, n)
    for neighbor in neighbor_ids:
        match = skim_df.iloc[neighbor]
        print(match['text'], '\n')

In [157]:
# Probe the sent2vec index: prints the default 10 nearest sentences.
query_sent2vec("This is fantastic.")

DARRELL PENDERGRASS But now, Shep is old. 

This is horrible. 

Christ is Risen. 

Spring Has Sprung ◆ First Communion Mother’s Day is Coming! 

Spring Has Sprung ◆ First Communion Mother’s Day is Coming! 

Ashland Spring Has Sprung ◆ First Communion Mother’s Day is Coming! 

Everyone is welcome. 

Everyone is welcome. 

What is … An editorial? 

Registration is required. 



In [158]:
# Second sent2vec probe, a first-person sentence; prints the default 10 neighbours.
query_sent2vec("I'm worried about the future.")

She's becoming increasingly con- trolling and worried about my soul. 

We do not know how to address this with government policies, even though the nation has worried about it for almost 50 years. 

Leo doesn’t know much about the Healing Circle Run/Walk, but he knows it’s important to his grandfather and tribe. 

Congress is asking the administration for docu- ments about the attack, in hopes of building a timeline of what the gov- ernment knew and when. “ 

And this election isn’t just about what we think about unions. 

OK, we'll return to it again and think about it." 

Until read- ing this book I knew little about this man, and now I want to know more. 

I didn’t know about these, did you?” 

Now they’re talking about doing it at future slumber parties, so I’m not sure how to handle it. 

Wu declined to comment on what Apple might know about scalpers buying iPhones for resale. 

