In [43]:
import numpy as np
import ujson

from glob import glob
from collections import Counter, defaultdict
from itertools import islice
from boltons.iterutils import windowed
from tqdm import tqdm_notebook
from gensim.models import KeyedVectors

from sklearn.metrics import r2_score

from keras.models import Sequential
from keras.layers import Dense, LSTM, Convolution1D
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasRegressor

In [2]:
# Load the pretrained word vectors (binary word2vec format) used as features.
# The file name suggests the 300-d Google News model -- TODO confirm.
w2v = KeyedVectors.load_word2vec_format(
    fname='../data/vectors/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [3]:
class Corpus:
    """Lazily stream abstracts from JSON-lines files selected by a glob.

    Every non-blank line of every matched file is parsed as one JSON
    document and wrapped in an ``Abstract``.
    """

    def __init__(self, pattern, skim=None):
        """Remember where to read from and how much to read.

        Args:
            pattern: glob pattern selecting the input files.
            skim: maximum number of lines to parse; any falsy value
                (``None`` or ``0``) means "read everything".
        """
        self.pattern = pattern
        self.skim = skim

    def lines(self):
        """Yield each line of each matched file, stripped of whitespace."""
        # glob() returns paths in arbitrary, filesystem-dependent order.
        # Sorting makes the stream -- and therefore the subset selected by
        # `skim` -- identical from run to run and machine to machine.
        for path in sorted(glob(self.pattern)):
            # The files hold JSON, so decode as UTF-8 instead of relying on
            # the platform's locale encoding.
            with open(path, encoding='utf-8') as fh:
                for line in fh:
                    yield line.strip()

    def abstracts(self):
        """Yield one ``Abstract`` per line, honouring the ``skim`` limit."""
        # A blank line is not valid JSON and would make ujson.loads raise;
        # drop them before counting towards `skim`.
        lines = (line for line in self.lines() if line)
        if self.skim:
            lines = islice(lines, self.skim)
        # total=self.skim sizes the progress bar (None when not skimming).
        for line in tqdm_notebook(lines, total=self.skim):
            yield Abstract(ujson.loads(line))

    def xy(self):
        """Yield the (x, y) pairs of every abstract, flattened into one stream."""
        for abstract in self.abstracts():
            yield from abstract.xy()

In [4]:
class Abstract(dict):
    """One parsed abstract: a dict holding a 'sentences' list.

    Each sentence is read as a mapping with parallel 'token' and 'dep'
    sequences (the token text and its dependency label).
    """

    def xy(self, vectors=None):
        """Yield (x, y) pairs, one per sentence whose root token has a vector.

        x is the embedding of the token labelled 'ROOT'; y is the sentence's
        relative position in the abstract, ``i / (n - 1)``, running from 0.0
        for the first sentence to 1.0 for the last.

        Args:
            vectors: token -> vector lookup supporting ``in`` and ``[]``.
                Defaults to the notebook-level ``w2v`` model.

        Abstracts with fewer than two sentences yield nothing: their relative
        position is undefined, and the original expression divided by zero.
        A sentence with no 'ROOT' label still raises from ``.index``
        (ValueError if 'dep' is a list -- as it presumably is, coming from JSON).
        """
        if vectors is None:
            vectors = w2v

        sentences = self['sentences']
        last = len(sentences) - 1
        if last < 1:
            # 0 or 1 sentences: i / (n - 1) has no meaningful value.
            return

        for i, sent in enumerate(sentences):
            root = sent['token'][sent['dep'].index('ROOT')]
            # Out-of-vocabulary roots are skipped rather than imputed.
            if root in vectors:
                yield vectors[root], i / last

In [28]:
# Training corpus, capped at the first 100000 lines.
train = Corpus(pattern='../data/train.json/*.json', skim=100000)

In [29]:
# Drain the lazy (x, y) stream and unzip it into two parallel tuples:
# train_x holds the feature vectors, train_y the matching targets.
train_x, train_y = zip(*train.xy())




In [47]:
# Feed-forward regressor: three ReLU hidden layers (256, 128, 128 units)
# followed by a single linear output unit. The input shape is taken from the
# first training vector. Trained with Adam on mean squared error.
model = Sequential([
    Dense(256, input_shape=train_x[0].shape, kernel_initializer='normal', activation='relu'),
    Dense(128, kernel_initializer='normal', activation='relu'),
    Dense(128, kernel_initializer='normal', activation='relu'),
    Dense(1, kernel_initializer='normal'),
])
model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Train on the full training set. The epoch count is spelled out because the
# recorded run below shows 10 epochs, while the default of `fit` depends on the
# installed Keras version -- leaving it implicit changes how long this trains.
model.fit(np.array(train_x), np.array(train_y), batch_size=100, epochs=10)

Epoch 1/10
Epoch 2/10

In [39]:
# Held-out dev corpus, capped at the first 10000 lines.
dev = Corpus(pattern='../data/dev.json/*.json', skim=10000)

In [40]:
# Drain the lazy (x, y) stream and unzip it into two parallel tuples:
# dev_x holds the feature vectors, dev_y the matching targets.
dev_x, dev_y = zip(*dev.xy())




In [41]:
# Coefficient of determination of the model's predictions on the dev set.
r2_score(
    y_true=np.array(dev_y),
    y_pred=model.predict(np.array(dev_x)),
)

0.12623397276663517