In [57]:
import attr
import re
import numpy as np

from collections import Counter, defaultdict
from itertools import islice

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [58]:
class Corpus:
    """Reader for a text file of abstracts separated by blank lines.

    An abstract is a run of consecutive non-empty lines; how those lines are
    interpreted is delegated to ``Abstract.from_lines``.

    Parameters
    ----------
    path : str
        Location of the corpus file. It is opened in text mode with the
        platform's default encoding.
    skim : int or None
        When truthy, only the first ``skim`` abstracts are read; otherwise
        the whole file is used.
    """

    def __init__(self, path, skim=None):
        self.path = path
        self.skim = skim

    def lines(self):
        """Yield each line of the file with surrounding whitespace stripped."""
        with open(self.path) as fh:
            for line in fh:
                yield line.strip()

    def abstract_lines(self):
        """Yield one list of non-empty lines per abstract.

        Blank lines act as separators. Runs of several blank lines do not
        produce empty groups, and a final abstract that is not followed by a
        blank line is still yielded.
        """
        lines = []
        for line in self.lines():
            if line:
                lines.append(line)
            elif lines:
                # Only emit when something was collected, so repeated blank
                # lines never hand an empty list to Abstract.from_lines.
                yield lines
                lines = []
        if lines:
            # The file ended without a trailing blank line: flush the last
            # abstract instead of silently dropping it.
            yield lines

    def abstracts(self):
        """Yield an ``Abstract`` per group of lines, honouring ``skim``."""
        ab_lines = self.abstract_lines()
        if self.skim:
            ab_lines = islice(ab_lines, self.skim)
        for lines in ab_lines:
            yield Abstract.from_lines(lines)

    def xy(self, vocab):
        """Yield every ``(x, y)`` pair produced by ``Abstract.xy`` for each abstract."""
        for abstract in self.abstracts():
            yield from abstract.xy(vocab)

    def token_counts(self):
        """Return a ``Counter`` of token occurrences over all (skimmed) abstracts."""
        counts = Counter()
        for ab in self.abstracts():
            for tokens in ab.sentence_tokens():
                # Counter.update adds one per occurrence in the iterable.
                counts.update(tokens)
        return counts

In [59]:
@attr.s
class Abstract:
    """A single abstract: an identifier, its tags, and its sentence lines."""

    identifier = attr.ib()
    tags = attr.ib()
    sentences = attr.ib()

    @classmethod
    def from_lines(cls, lines):
        """Build an abstract from its raw lines.

        The first line becomes the identifier, the second is split on
        whitespace into the tags, and everything after that is kept as the
        sentences.
        """
        header, tag_line = lines[0], lines[1]
        return cls(
            identifier=header,
            tags=tag_line.split(),
            sentences=lines[2:],
        )

    def sentence_tokens(self):
        """Yield, for each sentence, the list of lowercase ``[a-z]+`` runs in it."""
        yield from (
            re.findall('[a-z]+', text.lower())
            for text in self.sentences
        )

    def xy(self, vocab):
        """Yield one ``(x, y)`` pair per sentence.

        ``x`` is a ``Counter`` of the sentence's tokens that are in ``vocab``;
        ``y`` is the sentence's zero-based position within the abstract.
        """
        for position, words in enumerate(self.sentence_tokens()):
            in_vocab = Counter(word for word in words if word in vocab)
            yield in_vocab, position

In [78]:
# Training corpus, limited to the first 1,000,000 abstracts of the file.
train = Corpus(path='../data/abstracts/train.txt', skim=1000000)

In [79]:
# Token frequencies over the (skimmed) training abstracts; this reads the
# training file once from start to the skim limit.
counts = train.token_counts()

In [80]:
# Vocabulary: the 2000 most frequent training tokens, stored as a set so the
# per-token membership tests in Abstract.xy are cheap.
vocab = {token for token, _ in counts.most_common(2000)}

In [81]:
# Vectorizer used to turn the per-sentence token-count mappings into a matrix;
# it is fitted on the training data and reused unchanged for the test data.
dv = DictVectorizer()

In [82]:
# Materialise every (token Counter, sentence index) pair from the training
# corpus and split them into two parallel tuples: features and targets.
# NOTE(review): if the corpus yields no sentences, zip(*...) is empty and this
# unpacking raises ValueError.
train_x, train_y = zip(*train.xy(vocab))

In [83]:
# Fit the vectorizer on the training Counters and replace them with the
# resulting matrix. NOTE: this rebinds `train_x`, so the cell is not
# idempotent: re-running it without re-running the previous cell would feed
# the matrix, not the Counters, back into `dv.fit_transform`.
train_x = dv.fit_transform(train_x)

In [84]:
# Display the matrix summary (shape, dtype, number of stored elements).
train_x

<4758878x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 83076373 stored elements in Compressed Sparse Row format>

In [85]:
# Linear regression from token counts to a sentence's zero-based position
# within its abstract (the `y` produced by Abstract.xy).
model = LinearRegression()

In [86]:
# Fit on the training matrix and targets. The value returned by `model.fit`
# is kept as `fit` and is what the later cells use for prediction and for
# inspecting coefficients.
fit = model.fit(train_x, train_y)

In [87]:
# Held-out corpus, limited to the first 50,000 abstracts of the test file.
test = Corpus(path='../data/abstracts/test.txt', skim=50000)

In [88]:
# Build test features and targets with the vocabulary chosen from the
# training data, split into two parallel tuples as for the training set.
test_x, test_y = zip(*test.xy(vocab))

In [89]:
# Transform only (no refit), so the test columns line up with the training
# columns. NOTE: this rebinds `test_x`, so re-running this cell without
# re-running the previous one would pass the matrix back into `dv.transform`.
test_x = dv.transform(test_x)

In [90]:
# R^2 of the predicted sentence positions against the true positions on the
# held-out test sentences.
r2_score(test_y, fit.predict(test_x))

0.21033759596707113

In [95]:
# Feature (token) name for each matrix column, so names[i] labels fit.coef_[i].
# `DictVectorizer.get_feature_names` was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of `get_feature_names_out`; prefer the new method
# and fall back to the old one on installations that predate it.
if hasattr(dv, 'get_feature_names_out'):
    names = dv.get_feature_names_out()
else:
    names = dv.get_feature_names()

In [96]:
# Column indices ordered by ascending coefficient: the first entries are the
# tokens that pull the predicted sentence position towards the start.
bidx = fit.coef_.argsort()
# The same ordering reversed (largest coefficients first). This reuses `bidx`
# rather than argsorting the coefficient vector a second time.
eidx = np.flip(bidx, 0)

In [97]:
# The 50 most negative coefficients with their tokens, i.e. the words most
# associated with sentences near the start of an abstract.
for idx in bidx[:50]:
    weight, word = fit.coef_[idx], names[idx]
    print(weight, word)

-1.04391251609 let
-0.839570262001 report
-0.821624699506 often
-0.744173888617 investigate
-0.736926258363 aim
-0.72042082595 article
-0.710025531718 widely
-0.702355226738 recently
-0.700656541372 investigated
-0.696710484944 paper
-0.688601878405 usually
-0.649508898501 consider
-0.603694138812 telescope
-0.591801665283 studied
-0.585823048807 recent
-0.573573314178 study
-0.548325052036 goal
-0.547469191432 qcd
-0.540713387226 theoretically
-0.5247422734 attention
-0.523751509071 collider
-0.523110668401 review
-0.518296486362 superconductor
-0.515212021721 electroweak
-0.509930582242 phys
-0.508077087246 superconductors
-0.499669826799 note
-0.495807320365 supersymmetric
-0.474015518876 presents
-0.472864090316 globular
-0.459139291805 crystals
-0.454792581225 known
-0.453939958781 purpose
-0.452874395955 survey
-0.446413121966 examine
-0.438447377713 universe
-0.433179944735 parton
-0.422473808713 relativity
-0.41966849567 supersymmetry
-0.416090744843 branching
-0.415194041589 p

In [98]:
# The 50 most positive coefficients with their tokens, i.e. the words most
# associated with sentences late in an abstract.
for idx in eidx[:50]:
    weight, word = fit.coef_[idx], names[idx]
    print(weight, word)

2.4959325565 finally
1.88096397355 conclude
1.24508366065 furthermore
1.09707015432 suggests
1.02540010775 also
1.00784665863 moreover
0.965459172661 further
0.905914423004 suggest
0.889823427602 findings
0.883119498333 addition
0.868217642158 implications
0.851651339355 therefore
0.843390390501 illustrate
0.839950329613 indicates
0.800677596931 our
0.766996185709 thus
0.765746433514 likely
0.68659055861 suggesting
0.652716057733 could
0.647259823769 confirm
0.638348489099 future
0.59035618229 would
0.580813368898 overall
0.575410019273 might
0.556805393588 consistent
0.545282955054 art
0.543424534093 tested
0.541094731761 appears
0.538489506626 indeed
0.537078707391 companion
0.534550125838 will
0.533981919464 better
0.524746294705 hence
0.524567758745 confirmed
0.522371775071 then
0.522296846586 another
0.520586677892 instead
0.516657969146 estimated
0.511777121631 agreement
0.503175337279 msun
0.496264220058 example
0.490508057525 similar
0.486380125119 contrast
0.482750040874 find
