In [8]:
import re
import numpy as np
import attr

from collections import Counter
from wordfreq import top_n_list

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [9]:
# Vocabulary used as features: the 100 entries wordfreq returns for English,
# stored as a set for fast membership tests during feature extraction.
topn = set(top_n_list('en', 100))

In [10]:
def iter_corpus(path):
    """Yield one ``Abstract`` per blank-line-separated record in a corpus file.

    Each record is a run of consecutive non-empty lines; the run is handed to
    ``Abstract.from_lines``. Records are separated by one or more empty lines.

    Args:
        path: Path of the text file to read.

    Yields:
        The object returned by ``Abstract.from_lines`` for each record, in
        file order.
    """
    # Read everything up front so the file handle is closed before the first
    # yield instead of staying open for as long as the generator is suspended.
    with open(path) as fh:
        lines = fh.read().splitlines()

    abs_lines = []
    for line in lines:
        if line:
            abs_lines.append(line)
        elif abs_lines:
            # Only emit when something was collected: leading or repeated
            # blank lines must not produce an empty record.
            yield Abstract.from_lines(abs_lines)
            abs_lines = []

    # The final record has no trailing separator when the file does not end
    # with a blank line; emit it rather than silently dropping it.
    if abs_lines:
        yield Abstract.from_lines(abs_lines)

In [11]:
@attr.s
class Abstract:
    """One corpus record: an identifier, its tags and its sentences."""

    identifier = attr.ib()
    tags = attr.ib()
    sentences = attr.ib()

    @classmethod
    def from_lines(cls, lines):
        """Build an instance from the lines of a single record.

        The first line becomes the identifier, the second line is split on
        whitespace into the tags, and all remaining lines are kept as the
        sentences.
        """
        header, tag_line = lines[0], lines[1]
        return cls(identifier=header, tags=tag_line.split(), sentences=lines[2:])

    def tokenized_sentences(self):
        """Yield, for each sentence, the list of ``[a-z]+`` runs in its lowercased text."""
        find_words = re.compile('[a-z]+').findall
        for sentence in self.sentences:
            yield find_words(sentence.lower())

    def sentence_feature_dicts(self):
        """Yield one ``Counter`` per sentence, counting only tokens present in ``topn``."""
        for words in self.tokenized_sentences():
            yield Counter(word for word in words if word in topn)

In [12]:
# Materialize the corpus into a list: iter_corpus is a generator, so binding
# it directly would leave `test` exhausted after one pass and any cell that
# iterates it would silently see no data when re-run.
# NOTE(review): this reads test.txt, yet the abstracts are used below to build
# the training pairs (train_xy) -- confirm this is the intended file.
test = list(iter_corpus('../data/abstracts/test.txt'))

In [13]:
# Vectorizer that is later fitted on the per-sentence token-count dicts to
# turn them into a feature matrix.
dv = DictVectorizer()

In [14]:
# Collect (feature dict, sentence position) pairs: the regression target is
# the zero-based index of each sentence within its abstract.
train_xy = []
for abstract in test:
    train_xy.extend(
        (features, position)
        for position, features in enumerate(abstract.sentence_feature_dicts())
    )

In [15]:
# Split the (features, position) pairs into two parallel tuples.
# Note: unpacking raises ValueError if train_xy is empty.
train_x, train_y = zip(*train_xy)

In [16]:
# Number of collected sentences (one feature dict per sentence).
len(train_x)

594130

In [17]:
# Fit the vectorizer on the feature dicts and convert them to a matrix.
# NOTE(review): this rebinds train_x from the tuple of dicts to the
# transformed matrix, so re-running this cell without first re-running the
# zip cell would pass the matrix back into fit_transform.
train_x = dv.fit_transform(train_x)

In [18]:
# Show the shape and storage format of the vectorized features.
train_x

<594130x94 sparse matrix of type '<class 'numpy.float64'>'
	with 4236972 stored elements in Compressed Sparse Row format>

In [19]:
# Fit a linear regression that predicts a sentence's position within its
# abstract (train_y) from its vectorized top-word counts (train_x).
model = LinearRegression().fit(train_x, train_y)