In [58]:
import numpy as np
import attr
import ujson
import torch

from itertools import islice
from tqdm import tqdm_notebook
from glob import glob

from gensim.models import KeyedVectors

In [2]:
# Load pretrained word2vec embeddings (binary format) once into the
# module-level `vectors`, which Sentence.tensor() reads as a global.
# The file name suggests the 300-dimensional Google News model -- confirm
# against the data directory if the dimensionality matters downstream.
vectors = KeyedVectors.load_word2vec_format(
    '../data/vectors/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [107]:
class Corpus:
    """Stream abstracts from line-delimited JSON files matched by a glob.

    Everything is lazy: files are read line by line and parsed on demand,
    so the corpus never has to fit in memory at once.
    """

    def __init__(self, pattern, skim=None):
        """Remember where the corpus lives and how much of it to read.

        Args:
            pattern: glob pattern selecting the corpus files.
            skim: optional cap on the number of records parsed by
                `abstracts()`. Any falsy value (None or 0) means "no cap".
        """
        self.pattern = pattern
        self.skim = skim

    def lines(self):
        """Yield every line of every matching file, whitespace-stripped.

        Files are visited in sorted path order. Blank lines are yielded as
        empty strings; filtering them is left to the consumer.
        """
        # glob() returns paths in arbitrary, filesystem-dependent order.
        # Sorting makes the subset selected by `skim` the same on every run.
        for path in sorted(glob(self.pattern)):
            # Assumes the corpus files are UTF-8 (the standard JSON
            # encoding) rather than whatever the platform locale defaults to.
            with open(path, encoding='utf-8') as fh:
                for line in fh:
                    yield line.strip()

    def abstracts(self):
        """Parse each non-blank line as JSON and yield it as an `Abstract`.

        At most `skim` records are parsed when `skim` is truthy. Progress is
        reported through a notebook progress bar whose total is `skim`.
        """
        # An empty string is not valid JSON; drop blank lines before both the
        # skim cap and the parser so they neither count nor crash the run.
        lines = (line for line in self.lines() if line)
        if self.skim:
            lines = islice(lines, self.skim)
        for line in tqdm_notebook(lines, total=self.skim):
            raw = ujson.loads(line)
            yield Abstract.from_raw(raw)

    def xy(self):
        """Yield the (x, y) training pairs of every abstract, flattened."""
        for abstract in self.abstracts():
            yield from abstract.xy()

In [114]:
@attr.s
class Abstract:
    """One abstract, held as an ordered list of sentences."""

    # Sentence objects, in the order they appear in the abstract.
    sentences = attr.ib()

    @classmethod
    def from_raw(cls, raw):
        """Build an Abstract from one parsed JSON record.

        Args:
            raw: mapping with a 'sentences' entry; each element's 'token'
                value is handed to `Sentence`.
        """
        return cls([Sentence(s['token']) for s in raw['sentences']])

    def xy(self):
        """Yield an (x, y) pair for each usable sentence.

        x is the sentence's tensor and y is its relative position in the
        abstract: 0.0 for the first sentence, 1.0 for the last. An abstract
        with a single sentence yields y = 0.0. Sentences whose tensor cannot
        be built, or is empty, are skipped.
        """
        # Index of the final sentence. It is 0 for a one-sentence abstract,
        # where dividing by it would raise ZeroDivisionError.
        last = len(self.sentences) - 1
        for i, sent in enumerate(self.sentences):
            try:
                x = sent.tensor()
            except RuntimeError:
                # Deliberate best-effort skip, kept from the original code.
                # NOTE(review): presumably this guards torch.from_numpy
                # rejecting zero-sized arrays on older PyTorch -- confirm.
                continue
            # Newer PyTorch returns an empty tensor instead of raising; skip
            # it explicitly so the result is the same on either version.
            if len(x) == 0:
                continue
            y = i / last if last else 0.0
            yield x, y

In [115]:
@attr.s
class Sentence:
    """A tokenised sentence."""

    # Tokens of the sentence; each is looked up as a key in `vectors`.
    tokens = attr.ib()

    def tensor(self):
        """Return the sentence as a tensor of word vectors.

        Tokens missing from the module-level `vectors` are dropped. The
        result has one row per remaining token. When no token is in the
        vocabulary the result has shape (0, vectors.vector_size), so callers
        always receive a 2-D tensor.
        """
        rows = [vectors[t] for t in self.tokens if t in vectors]
        if rows:
            x = np.array(rows)
        else:
            # np.array([]) would be 1-D float64, unlike every other sentence.
            # Build an explicit zero-row matrix of the embedding width.
            # float32 is assumed to match the loaded vectors' dtype (gensim's
            # default) -- TODO confirm if a custom datatype was used.
            x = np.empty((0, vectors.vector_size), dtype=np.float32)
        return torch.from_numpy(x)

In [116]:
# Training corpus: every JSON file under ../data/train.json, limited to the
# first 1000 records via `skim`.
train = Corpus(pattern='../data/train.json/*.json', skim=1000)

In [117]:
# Drain the lazy (x, y) stream and transpose it into two parallel tuples:
# train_x holds the sentence tensors, train_y the matching positions.
# If xy() yields nothing (e.g. the glob matched no files), the unpacking
# raises ValueError because zip() then produces zero items.
train_x, train_y = zip(*train.xy())


