In [80]:
import gensim
import re
import numpy as np

from collections import Counter

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [46]:
# Load pretrained word2vec embeddings (binary word2vec format) through gensim's
# KeyedVectors loader. `vectors` is read as a global by Abstract.sentence_tensors.
# NOTE(review): none of the visible cells calls sentence_tensors -- confirm this
# load is still needed before paying for it on every run.
vectors = gensim.models.KeyedVectors.load_word2vec_format('../data/vectors/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [47]:
def iter_corpus(path):
    """Yield one ``Abstract`` per blank-line-separated record of a corpus file.

    The file at ``path`` is read in full and split into lines. Consecutive
    non-empty lines form one record; an empty line terminates the record.
    The lines of each record are handed to ``Abstract.from_lines``.

    Empty lines that do not follow any record line (leading or repeated
    blank lines) are skipped instead of producing an empty record, and a
    final record that is not followed by an empty line is still yielded.

    Args:
        path: Path of the text file to read.

    Yields:
        Whatever ``Abstract.from_lines`` returns for each record, in file order.
    """
    # Read everything up front so the file handle is closed before the first
    # yield rather than staying open for as long as the generator is suspended.
    with open(path) as fh:
        lines = fh.read().splitlines()

    abs_lines = []
    for line in lines:
        if line:
            abs_lines.append(line)
        elif abs_lines:
            # A blank line closes the current record; blank lines with no
            # pending record are ignored.
            yield Abstract.from_lines(abs_lines)
            abs_lines = []

    # The last record has no terminating blank line when the file does not
    # end with one -- emit it instead of dropping it.
    if abs_lines:
        yield Abstract.from_lines(abs_lines)

In [64]:
@attr.s
class Abstract:
    """One corpus record: an identifier, a list of tags and its sentences.

    NOTE(review): this class needs the ``attr`` package, but the visible
    import cell has no ``import attr`` -- confirm it is imported before this
    cell runs on a fresh kernel.
    """

    identifier = attr.ib()
    tags = attr.ib()
    sentences = attr.ib()

    @classmethod
    def from_lines(cls, lines):
        """Build an instance from the lines of a single record.

        ``lines[0]`` becomes the identifier, ``lines[1]`` is split on
        whitespace into the tags, and everything from ``lines[2]`` onwards
        is kept as the sentences.
        """
        return cls(
            identifier=lines[0],
            tags=lines[1].split(),
            sentences=lines[2:],
        )

    def tokenized_sentences(self):
        """Yield, per sentence, all matches of the case-insensitive pattern ``[a-z]+``."""
        find_tokens = re.compile('[a-z]+', re.I).findall
        yield from map(find_tokens, self.sentences)

    def sentence_feature_dicts(self):
        """Yield, per sentence, a ``Counter`` mapping each token to its count."""
        yield from map(Counter, self.tokenized_sentences())

    def sentence_tensors(self):
        """Yield, per sentence, the list of ``vectors[token]`` for known tokens.

        ``vectors`` is the module-level embedding object; tokens for which
        ``token in vectors`` is false are skipped.
        """
        for tokens in self.tokenized_sentences():
            embedded = []
            for token in tokens:
                if token in vectors:
                    embedded.append(vectors[token])
            yield embedded
In [97]:
# Materialise the abstracts: iter_corpus returns a one-shot generator, so
# keeping the bare generator would leave nothing to iterate when a cell that
# loops over `test` is run a second time.
# NOTE(review): the cells below build `train_xy` from this split, i.e. the
# model is fitted on test.txt -- confirm that this is intended.
test = list(iter_corpus('../data/abstracts/test.txt'))

In [98]:
# Vectoriser for the per-sentence token-count dicts; it is fitted on the
# training dicts and then reused to encode the validation dicts.
dv = DictVectorizer()

In [99]:
# Pair every sentence's token counts with its 0-based position inside its
# abstract; the position is used as the regression target.
train_xy = [
    (counts, position)
    for abstract in test
    for position, counts in enumerate(abstract.sentence_feature_dicts())
]

In [100]:
# Unzip the (token counts, position) pairs into two parallel tuples.
train_x, train_y = zip(*train_xy)

In [101]:
# Number of training sentences.
len(train_x)

594130

In [102]:
# Fit the vectoriser on the training dicts and encode them as a feature matrix.
# NOTE(review): `train_x` is rebound from the tuple of dicts to the matrix, so
# re-running this cell alone would feed the matrix back into fit_transform;
# a separate name for the matrix would make the cell idempotent.
train_x = dv.fit_transform(train_x)

In [103]:
# Display the matrix summary (shape, dtype, number of stored elements).
train_x

<594130x114060 sparse matrix of type '<class 'numpy.float64'>'
	with 12748411 stored elements in Compressed Sparse Row format>

In [79]:
# Fit a linear regression that predicts a sentence's position from its token counts.
# NOTE(review): this cell's execution count (79) is lower than those of the
# cells that build train_x / train_y (100-102), so the model in memory may come
# from an earlier run -- restart and run all cells in order to confirm the result.
model = LinearRegression().fit(train_x, train_y)

In [104]:
# Materialise the abstracts: iter_corpus returns a one-shot generator, so
# keeping the bare generator would leave nothing to iterate when a cell that
# loops over `valid` is run a second time.
valid = list(iter_corpus('../data/abstracts/valid.txt'))

In [105]:
# Pair every validation sentence's token counts with its 0-based position
# inside its abstract (the value the model is asked to predict).
valid_xy = [
    (counts, position)
    for abstract in valid
    for position, counts in enumerate(abstract.sentence_feature_dicts())
]

In [106]:
# Unzip the (token counts, position) pairs into two parallel tuples.
valid_x, valid_y = zip(*valid_xy)

In [107]:
# Encode the validation dicts with the vectoriser already fitted on the training dicts.
# NOTE(review): `valid_x` is rebound from the tuple of dicts to the matrix, so
# re-running this cell alone would pass the matrix back into transform.
valid_x = dv.transform(valid_x)

In [108]:
# Predicted sentence positions for the validation sentences.
y_pred = model.predict(valid_x)

In [109]:
# Coefficient of determination (R^2) of the predicted positions against the
# true positions on the validation sentences.
metrics.r2_score(valid_y, y_pred)

0.086363796775093649