In [8]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output area.
%matplotlib inline
# Apply matplotlib's 'bmh' style sheet to every plot drawn after this point.
mpl.style.use('bmh')

In [18]:
import numpy as np

import attr
import os
import click
import torch
import ujson

from gensim.models import KeyedVectors
from boltons.iterutils import pairwise, chunked_iter
from tqdm import tqdm_notebook
from glob import glob
from itertools import islice
from scipy import stats

from sklearn.metrics import classification_report, accuracy_score

from torch import nn
from torch.nn import functional as F
from torch.autograd import Variable

In [19]:
# Load the saved gensim KeyedVectors. This is a module-level global:
# `Sentence.tensor` looks tokens up in `vectors` directly.
vectors = KeyedVectors.load('../data/vectors/vectors.bin')

In [20]:
def read_abstracts(path):
    """Yield an `Abstract` for every JSON line stored under a directory.

    Args:
        path: Directory containing ``*.json`` files, each holding one JSON
            document per line.

    Yields:
        The result of `Abstract.from_raw` for each decoded, non-blank line.
        Files are visited in sorted filename order.
    """
    # glob returns matches in arbitrary, filesystem-dependent order. Sorting
    # keeps the stream reproducible, which matters to callers that only take
    # the first N items. A distinct loop name also avoids shadowing `path`.
    for json_path in sorted(glob(os.path.join(path, '*.json'))):
        with open(json_path) as fh:
            for line in fh:
                line = line.strip()
                # Skip blank lines (e.g. a trailing newline at end of file)
                # instead of letting the JSON decoder fail on an empty string.
                if not line:
                    continue
                yield Abstract.from_raw(ujson.loads(line))

In [21]:
class Corpus:
    """An in-memory list of abstracts read from a directory."""

    def __init__(self, path, skim):
        """Read up to `skim` abstracts from `path`, showing a progress bar.

        Args:
            path: Directory handed to `read_abstracts`.
            skim: Upper bound on the number of abstracts to keep; also used
                as the progress bar's total.
        """
        limited = islice(read_abstracts(path), skim)
        progress = tqdm_notebook(limited, total=skim)
        self.abstracts = list(progress)

    def batches(self, size):
        """Yield an `AbstractBatch` for each run of up to `size` abstracts.

        Args:
            size: Chunk size passed to `chunked_iter`.
        """
        # NOTE(review): `AbstractBatch` is not defined in the cells visible
        # here - confirm it exists before this method is called.
        tracked = tqdm_notebook(self.abstracts)
        for group in chunked_iter(tracked, size):
            yield AbstractBatch(group)

In [22]:
@attr.s
class Abstract:
    """A document held as a list of `Sentence` objects."""

    # List of Sentence instances making up the abstract.
    sentences = attr.ib()

    @classmethod
    def from_raw(cls, raw):
        """Build an Abstract from a decoded JSON record.

        Args:
            raw: Mapping with a 'sentences' list; each entry's 'token' value
                is passed to `Sentence`.
        """
        parsed = []
        for entry in raw['sentences']:
            parsed.append(Sentence(entry['token']))
        return cls(parsed)

    def tensor(self):
        """Stack every sentence's tensor along a new leading dimension."""
        return torch.stack([sent.tensor() for sent in self.sentences])

In [23]:
@attr.s
class Sentence:
    """A sentence held as a sequence of tokens."""

    # Tokens of the sentence; each is looked up in the global `vectors`.
    tokens = attr.ib()

    def tensor(self, dim=300, pad=50):
        """Return the sentence as a fixed-length float tensor.

        Tokens missing from the global `vectors` are dropped. The remaining
        vectors are zero-padded or truncated to `pad` rows, then the row
        order is reversed, so any padding ends up at the front.

        Args:
            dim: Length of each zero padding vector.
            pad: Number of rows in the result.
        """
        known = [vectors[tok] for tok in self.tokens if tok in vectors]
        # Append `pad` zero rows unconditionally, then cut back to `pad`.
        rows = (known + [np.zeros(dim)] * pad)[:pad]
        rows.reverse()
        return torch.from_numpy(np.array(rows)).float()

In [24]:
class SentenceEncoder(nn.Module):
    """Encode batches of vector sequences with a single-layer LSTM.

    Args:
        lstm_dim: Size of the LSTM hidden state.
        input_dim: Size of each input vector. Defaults to 300, the value
            that was previously hard-coded.
    """

    def __init__(self, lstm_dim=128, input_dim=300):
        super().__init__()
        self.lstm_dim = lstm_dim
        self.lstm = nn.LSTM(input_dim, lstm_dim, batch_first=True)

    def forward(self, x):
        """Return the LSTM's final hidden state for `x`.

        Args:
            x: Batch-first tensor of shape (batch, seq, input_dim).

        Returns:
            Tensor of shape (1, batch, lstm_dim).
        """
        # nn.LSTM starts from an all-zero (h0, c0) when no state is passed.
        # Relying on that default removes the dependency on the global
        # `ftype`, which no visible cell defines, and on the deprecated
        # `Variable` wrapper.
        _, (hn, _) = self.lstm(x)
        return hn

In [25]:
class Model(nn.Module):
    """Score each input sequence with an LSTM followed by a linear layer.

    Args:
        input_dim: Size of each input vector.
        lstm_dim: Size of the LSTM hidden state.
    """

    def __init__(self, input_dim=128, lstm_dim=128):
        super().__init__()
        self.lstm_dim = lstm_dim
        self.lstm = nn.LSTM(input_dim, lstm_dim, batch_first=True)
        self.out = nn.Linear(lstm_dim, 1)

    def forward(self, x):
        """Return one scalar per sequence in the batch.

        Args:
            x: Batch-first tensor of shape (batch, seq, input_dim).

        Returns:
            Tensor of shape (batch,).
        """
        # nn.LSTM starts from an all-zero (h0, c0) when no state is passed.
        # Relying on that default removes the dependency on the global
        # `ftype`, which no visible cell defines, and on the deprecated
        # `Variable` wrapper.
        _, (hn, _) = self.lstm(x)
        # hn is (1, batch, lstm_dim); the linear layer maps it to
        # (1, batch, 1), which is flattened to (batch,).
        y = self.out(hn)
        return y.view(len(x))

In [26]:
# Read at most the first 1000 abstracts from the training directory into memory.
train = Corpus('../data/train.json/', 1000)




In [27]:
# Seed torch's RNG before the networks are constructed below.
torch.manual_seed(1)

# Instantiate both networks with their default dimensions.
sent_encoder = SentenceEncoder()
model = Model()

In [28]:
# NOTE(review): this rebinds `model`, discarding the Model() instance built in
# the previous cell.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code - only load checkpoints from a trusted source.
# NOTE(review): the recorded run of this cell raised a TypeError. Presumably
# the checkpoint holds a whole pickled module whose class lives in __main__;
# consider saving and loading a state_dict instead (confirm how model.pt was
# written).
model = torch.load('../data/model.pt')

TypeError: <module '__main__'> is a built-in class