In [3]:
from collections import Counter, defaultdict
from glob import glob
from itertools import islice

import numpy as np
import ujson
from boltons.iterutils import windowed
from gensim.models import KeyedVectors
from keras.layers import Dense
from keras.models import Sequential
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from tqdm import tqdm_notebook

In [2]:
# Load the pretrained GoogleNews word2vec vectors from the gzipped file in
# word2vec binary format. This reads the whole file and is slow.
w2v = KeyedVectors.load_word2vec_format(fname='../data/vectors/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [22]:
class Corpus:
    """Stream abstracts from line-oriented JSON files matched by a glob pattern.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the input files.
    skim : int, optional
        When truthy, only the first ``skim`` lines (counted across all
        matched files, in sorted path order) are parsed.
    """

    def __init__(self, pattern, skim=None):
        self.pattern = pattern
        self.skim = skim

    def lines(self):
        """Yield each non-blank line of every matched file, stripped."""
        # glob() returns paths in arbitrary, filesystem-dependent order.
        # Sorting makes the stream -- and therefore the subset selected by
        # `skim` -- identical from run to run.
        for path in sorted(glob(self.pattern)):
            # JSON text is UTF-8; do not depend on the locale's default codec.
            with open(path, encoding='utf-8') as fh:
                for line in fh:
                    line = line.strip()
                    # A blank line is not a JSON document; passing it on
                    # would make the parser in abstracts() raise.
                    if line:
                        yield line

    def abstracts(self):
        """Yield one ``Abstract`` per line, parsed as JSON, with a progress bar."""
        lines = self.lines()
        if self.skim:
            lines = islice(lines, self.skim)
        # `total` lets the progress bar show a fraction when skimming.
        for line in tqdm_notebook(lines, total=self.skim):
            raw = ujson.loads(line)
            yield Abstract(raw)

    def xy(self):
        """Yield the (x, y) pairs of every abstract as one flat stream."""
        for abstract in self.abstracts():
            yield from abstract.xy()

In [23]:
class Abstract(dict):
    """A parsed abstract: a dict that is read through its 'sentences' key.

    Each sentence is accessed as a mapping with parallel 'token' and 'dep'
    sequences, where 'dep' holds the dependency label of each token.
    """

    def xy(self, vectors=None):
        """Yield ``(x, y)`` training pairs, one per usable sentence.

        ``x`` is the embedding of the sentence's ROOT token and ``y`` is the
        sentence's relative position in the abstract, from 0.0 (first) to
        1.0 (last).

        Parameters
        ----------
        vectors : mapping, optional
            Token -> embedding lookup supporting ``in`` and ``[]``.
            Defaults to the notebook-level ``w2v`` model.

        Sentences are skipped when they have no 'ROOT' label or when the
        root token is missing from ``vectors``. Abstracts with fewer than
        two sentences yield nothing: the relative position is undefined
        there, and the original expression divided by zero.
        """
        if vectors is None:
            vectors = w2v

        sentences = self['sentences']
        last = len(sentences) - 1
        if last < 1:
            return

        for i, sent in enumerate(sentences):
            try:
                root_idx = sent['dep'].index('ROOT')
            except ValueError:
                # No ROOT in this parse: drop the sentence rather than
                # abort the whole stream.
                continue

            root = sent['token'][root_idx]
            if root in vectors:
                yield vectors[root], i / last

In [26]:
# Training corpus: the *.json files under ../data/train.json/, limited to the
# first 10000 lines so the experiment stays quick.
train = Corpus(pattern='../data/train.json/*.json', skim=10000)

In [27]:
# Drain the (x, y) stream and transpose it into two parallel columns.
# Each column is converted to a NumPy array because Keras' fit() takes array
# inputs, not tuples. Equal-shaped x vectors stack into one 2-D array, and
# train_x[0].shape still gives the per-sample shape.
train_x, train_y = (np.asarray(column) for column in zip(*train.xy()))

In [32]:
# Small feed-forward regressor: one 128-unit ReLU hidden layer over the root
# token's embedding, then a single linear output unit for relative position.
# `Sequential` and `Dense` come from Keras and must be imported in the
# notebook's import cell; they are not defined anywhere else in this notebook.
model = Sequential([
    Dense(128, input_shape=train_x[0].shape, kernel_initializer='normal', activation='relu'),
    Dense(1, kernel_initializer='normal'),
])
model.compile(loss='mean_squared_error', optimizer='adam')

In [33]:
# Keras takes array inputs, so convert at the call site; np.asarray is a
# no-op when the data is already an ndarray. Trailing `;` hides the History repr.
model.fit(np.asarray(train_x), np.asarray(train_y), batch_size=100);

KeyboardInterrupt: 