In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'bmh' style sheet to every plot made in this notebook.
mpl.style.use('bmh')

In [110]:
import numpy as np
import ujson

from glob import glob
from collections import Counter, defaultdict
from itertools import islice
from boltons.iterutils import windowed
from tqdm import tqdm_notebook
from gensim.models import KeyedVectors

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [111]:
# Load pretrained word vectors from a gzipped binary word2vec file.
# `vectors` is read as a module-level lookup by Abstract.xy() below, so this
# cell must run before any (x, y) pairs are generated.
vectors = KeyedVectors.load_word2vec_format(
    '../data/vectors/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [113]:
class Corpus:
    """Stream abstracts from a set of JSON-lines files.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the input files. Each non-blank line of each
        matched file is parsed as one JSON document.
    skim : int, optional
        If truthy, only the first `skim` lines (counted across all files)
        are read; otherwise every line is read.
    """

    def __init__(self, pattern, skim=None):
        self.pattern = pattern
        self.skim = skim

    def lines(self):
        """Yield every non-blank line, stripped, from the matched files.

        Files are visited in sorted path order. `glob` returns paths in
        arbitrary order, which would make the `skim` subset differ between
        runs and machines.
        """
        for path in sorted(glob(self.pattern)):
            # Decode explicitly as UTF-8 (the standard JSON encoding) rather
            # than relying on the platform's default text encoding.
            with open(path, encoding='utf-8') as fh:
                for line in fh:
                    line = line.strip()
                    # A blank line would make the JSON parser in
                    # abstracts() raise, so drop it here.
                    if line:
                        yield line

    def abstracts(self):
        """Yield one `Abstract` per line, behind a notebook progress bar."""
        lines = self.lines()
        if self.skim:
            lines = islice(lines, self.skim)
        # total=None (no skim) gives a progress bar without a known length.
        for line in tqdm_notebook(lines, total=self.skim):
            raw = ujson.loads(line)
            yield Abstract(raw)

    def xy(self):
        """Yield the (x, y) pairs of every abstract as one flat stream."""
        for abstract in self.abstracts():
            yield from abstract.xy()

In [114]:
class Abstract(dict):
    """A parsed abstract: a dict holding a 'sentences' list of sentence parses.

    Each sentence is expected to carry parallel 'dep' and 'token' lists.
    """

    def xy(self):
        """Yield one (x, y) pair per sentence whose root token has a vector.

        x is the `vectors` entry for the token sitting at the position of
        'ROOT' in the sentence's 'dep' list. y is the sentence's relative
        position in the abstract, from 0.0 (first) to 1.0 (last). Sentences
        whose root token is not in the module-level `vectors` are skipped.

        Raises
        ------
        ValueError
            If a sentence's 'dep' list contains no 'ROOT' entry.
        """
        sentences = self['sentences']

        # A one-sentence abstract has no span to normalise over; clamp the
        # denominator so it gets position 0.0 instead of dividing by zero.
        span = max(len(sentences) - 1, 1)

        for i, sent in enumerate(sentences):
            root = sent['token'][sent['dep'].index('ROOT')]

            if root in vectors:
                yield vectors[root], i / span

In [115]:
# Training corpus: all JSON files matching the pattern, capped at the first
# 700,000 lines.
# NOTE(review): absolute, user-specific path — this cell will not run on
# another machine without editing it.
train = Corpus('/Users/dclure/Projects/sent-order/data/train.json/*.json', 700000)

In [116]:
# Materialise every (x, y) pair from the training corpus and split them into
# parallel tuples of inputs and targets.
train_x, train_y = zip(*train.xy())




In [117]:
# Linear regression with scikit-learn's default settings.
model = LinearRegression()

In [118]:
# Fit on the training pairs. scikit-learn's fit() returns the estimator
# itself, so `fit` and `model` refer to the same fitted object.
fit = model.fit(train_x, train_y)

In [119]:
# Dev corpus: all JSON files matching the pattern, capped at the first
# 100,000 lines.
# NOTE(review): absolute, user-specific path — this cell will not run on
# another machine without editing it.
dev = Corpus('/Users/dclure/Projects/sent-order/data/dev.json/*.json', 100000)

In [120]:
# Materialise every (x, y) pair from the dev corpus and split them into
# parallel tuples of inputs and targets.
dev_x, dev_y = zip(*dev.xy())




In [121]:
# R^2 of the fitted model's predictions against the true dev targets.
r2_score(dev_y, fit.predict(dev_x))

0.10346986407524694

In [123]:
# Per-abstract ordering accuracy on the dev set, bucketed by the number of
# sentences that could be scored (those whose root token has a vector).
#   correct[n] -- abstracts with n scored sentences ranked in the right order
#   total[n]   -- all abstracts with n scored sentences
#   skipped    -- abstracts that could not be evaluated at all
correct = Counter()
total = Counter()
skipped = 0

for ab in dev.abstracts():

    # Only the errors that malformed input can cause inside xy() are
    # tolerated (e.g. a sentence without a 'ROOT' label). Anything else --
    # including KeyboardInterrupt and genuine bugs in predict() -- now
    # propagates instead of being silently swallowed.
    try:
        pairs = list(ab.xy())
    except (ValueError, ZeroDivisionError):
        skipped += 1
        continue

    # No sentence had an embeddable root: nothing to rank.
    if not pairs:
        skipped += 1
        continue

    x = [vec for vec, _ in pairs]

    # argsort().argsort() converts predicted positions into ranks; the
    # abstract is ordered correctly iff those ranks are already ascending.
    order = list(fit.predict(x).argsort().argsort())

    if sorted(order) == order:
        correct[len(order)] += 1

    total[len(order)] += 1




Exception in thread Thread-40:
Traceback (most recent call last):
  File "/usr/local/Cellar/python3/3.6.2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/dclure/Projects/plot-ordering/env/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/local/bin/../Cellar/python3/3.6.2/bin/../Frameworks/Python.framework/Versions/3.6/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






In [124]:
# Report, for each sentence count with at least one correctly ordered
# abstract, the fraction of abstracts of that length that were ordered
# correctly.
for slen, n_correct in sorted(correct.items()):
    accuracy = n_correct / total[slen]
    print(slen, accuracy)

1 1.0
2 0.676801683324566
3 0.3378504382420588
4 0.13614972091496114
5 0.040683229813664595
6 0.01137371944825361
7 0.0018542757417102968
8 0.0006060606060606061
10 0.0003243593902043464


In [125]:
# Overall fraction of evaluated dev abstracts whose scored sentences were all
# ranked in the right order (across every length bucket).
sum(correct.values()) / sum(total.values())

0.15354307086141722