In [3]:
import ujson

from itertools import islice
from tqdm import tqdm_notebook
from glob import glob
from collections import Counter

from gensim.models import KeyedVectors

from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasRegressor

from sklearn.metrics import r2_score

In [2]:
w2v = KeyedVectors.load_word2vec_format(
    '../data/vectors/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [4]:
class Corpus:
    
    def __init__(self, pattern, skim=None):
        self.pattern = pattern
        self.skim = skim
        
    def lines(self):
        for path in glob(self.pattern):
            with open(path) as fh:
                for line in fh:
                    yield line.strip()

    def abstracts(self):
        lines = self.lines()
        if self.skim:
            lines = islice(lines, self.skim)
        for line in tqdm_notebook(lines, total=self.skim):
            raw = ujson.loads(line)
            yield Abstract.from_raw(raw)
            
    def xy(self):
        for abstract in self.abstracts():
            yield from abstract.xy()

In [5]:
class Abstract:
    
    @classmethod
    def from_raw(cls, raw):
        return cls([Sentence(s['token']) for s in raw['sentences']])
    
    def __init__(self, sentences):
        self.sentences = sentences
            
    def xy(self):
        for i, sent in enumerate(self.sentences):
            x = sent.token_vectors()
            y = i / (len(self.sentences)-1)
            yield x, y

In [6]:
class Sentence:
    
    def __init__(self, tokens):
        self.tokens = tokens
        
    def token_vectors(self):
        return [w2v[t] for t in self.tokens if t in w2v]

In [9]:
train = Corpus('../data/train.json/*.json', 100000)

In [10]:
train_x, train_y = zip(*train.xy())




In [11]:
train_x = pad_sequences(train_x, 20, padding='post', dtype=float)

In [12]:
train_y = list(train_y)

In [13]:
model = Sequential()
model.add(LSTM(256, input_shape=train_x[0].shape, activation='relu'))
model.add(Dense(128, kernel_initializer='normal', activation='relu'))
model.add(Dense(128, kernel_initializer='normal', activation='relu'))
model.add(Dense(128, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mean_squared_error', optimizer='adam')

In [14]:
model.fit(train_x, train_y, batch_size=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x10ecdeac8>

In [15]:
dev = Corpus('../data/dev.json/*.json', 10000)

In [16]:
dev_x, dev_y = zip(*dev.xy())




In [17]:
dev_x = pad_sequences(dev_x, 20, padding='post', dtype=float)

In [18]:
dev_y = list(dev_y)

In [19]:
r2_score(dev_y, model.predict(dev_x))

-9.6276080733925795e-05

In [62]:
correct = Counter()
total = Counter()

for ab in dev.abstracts():
    
    x, _ = zip(*ab.xy())
    x = pad_sequences(x, 20, padding='post', dtype=float)
    
    preds = model.predict(x)
    order = list(preds[:,0].argsort().argsort())

    if sorted(order) == order:
        correct[len(order)] += 1

    total[len(order)] += 1




In [63]:
for slen in sorted(correct.keys()):
    print(slen, correct[slen] / total[slen])

2 0.8291571753986332
3 0.5174563591022444
4 0.2303894679100384
5 0.09532483302975106
6 0.030730897009966777
7 0.006329113924050633
8 0.0029895366218236174


In [64]:
sum(correct.values()) / sum(total.values())

0.218