In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline
mpl.style.use('bmh')

In [2]:
import numpy as np

import ujson
import attr
import random
import torch

from glob import glob
from tqdm import tqdm_notebook
from itertools import islice
from boltons.iterutils import pairwise, chunked_iter
from collections import Counter

from gensim.models import KeyedVectors

from torch import nn
from torch.utils.data import DataLoader
from torch.autograd import Variable
from torch.nn import functional as F

In [3]:
vectors = KeyedVectors.load_word2vec_format(
    '../data/vectors/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [88]:
class Corpus:
    
    def __init__(self, pattern, skim=None):
        self.pattern = pattern
        self.skim = skim
        
    def lines(self):
        for path in glob(self.pattern):
            with open(path) as fh:
                for line in fh:
                    yield line.strip()

    def abstracts(self):
        lines = self.lines()
        if self.skim:
            lines = islice(lines, self.skim)
        for line in tqdm_notebook(lines, total=self.skim):
            raw = ujson.loads(line)
            yield Abstract.from_raw(raw)
            
    def xy(self):
        for abstract in self.abstracts():
            yield from abstract.xy()

In [89]:
@attr.s
class Abstract:
    
    sentences = attr.ib()
    
    @classmethod
    def from_raw(cls, raw):
        return cls([Sentence(s['token']) for s in raw['sentences']])
            
    def xy(self):
        for i, sent in enumerate(self.sentences):
            x = sent.tensor()
            y = i / (len(self.sentences)-1)
            y = torch.FloatTensor([y])
            yield x, y

In [90]:
@attr.s
class Sentence:
    
    tokens = attr.ib()
    
    def tensor(self, dim=300, pad=50):
        x = [vectors[t] for t in self.tokens if t in vectors]
        x += [np.zeros(dim)] * pad
        x = x[:pad]
        x = list(reversed(x))
        x = np.array(x)
        x = torch.from_numpy(x)
        x = x.float()
        return x

In [95]:
class Model(nn.Module):
    
    def __init__(self, embed_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.rnn = nn.RNN(embed_dim, hidden_dim, nonlinearity='relu', batch_first=True)
        self.hidden2y = nn.Linear(hidden_dim, 1)
        
    def init_hidden(self):
        return Variable(torch.zeros(1, 1, self.hidden_dim))
        
    def forward(self, x):
        hidden = self.init_hidden()
        rnn_out, hidden = self.rnn(x, hidden)
        y = self.hidden2y(hidden)
        return y

In [164]:
train = Corpus('../data/train.json/*.json', 10000)

In [165]:
train_xy = list(train.xy())




In [166]:
loader = DataLoader(train_xy, 100)

In [167]:
torch.manual_seed(1)

<torch._C.Generator at 0x10cb6eca8>

In [168]:
model = Model(300, 150)

In [169]:
criterion = nn.MSELoss()

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
train_loss = []
for epoch in range(30):
    
    print(f'Epoch {epoch}')
    
    epoch_loss = 0
    for x, y in tqdm_notebook(loader):

        x = Variable(x)
        y = Variable(y)

        model.zero_grad()
        model.init_hidden()

        y_pred = model(x)

        loss = criterion(y_pred, y)
        loss.backward()

        optimizer.step()
        
        epoch_loss += loss.data[0]
        
    epoch_loss /= len(loader)
    train_loss.append(epoch_loss)
    print(epoch_loss)

Epoch 0



0.10099632808156173
Epoch 1



0.08590844847594274
Epoch 2



0.07984011503261926
Epoch 3



0.07596221305152963
Epoch 4



0.07247867372205488
Epoch 5



0.06872502510820179
Epoch 6



0.06578824269242557
Epoch 7



0.0630592135643261
Epoch 8



0.05981664300366516
Epoch 9



0.05803895000797665
Epoch 10



0.05466449746453518
Epoch 11



0.052941272769220254
Epoch 12



0.051493097480981985
Epoch 13



0.05074800804714399
Epoch 14



0.04923782921413507
Epoch 15



0.047822701460837655
Epoch 16



0.04676928788901317
Epoch 17


In [None]:
plt.plot(train_loss)

In [None]:
dev = Corpus('../data/dev.json/*.json', 1000)

In [None]:
correct = Counter()
total = Counter()

for ab in dev.abstracts():
    
    x, _ = zip(*ab.xy())

    x = Variable(torch.stack(x))
    y = model(x)
    
    y = np.array(y.data[0].tolist())[:,0]
    
    order = list(y.argsort().argsort())
    
    if sorted(order) == order:
        correct[len(order)] += 1
        
    total[len(order)] += 1

In [None]:
for slen in sorted(correct.keys()):
    print(slen, correct[slen] / total[slen])

In [None]:
sum(correct.values()) / sum(total.values())