In [4]:
import attr
import re
import numpy as np

from collections import Counter
from itertools import islice

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [68]:
class Corpus:
    """Streaming reader for a plain-text corpus of abstracts.

    The file at ``path`` is treated as blocks of non-blank lines separated by
    blank lines; each block is turned into an ``Abstract`` via
    ``Abstract.from_lines``.

    Args:
        path: Location of the corpus text file.
        skim: Optional cap on the number of abstracts to read. A falsy value
            (``None`` or ``0``) means no cap.
    """

    def __init__(self, path, skim=None):
        self.path = path
        self.skim = skim

    def lines(self):
        """Yield every line of the file with surrounding whitespace stripped."""
        with open(self.path) as fh:
            for line in fh:
                yield line.strip()

    def abstract_lines(self):
        """Yield one list of non-blank lines per blank-line-delimited block.

        Runs of consecutive blank lines produce no empty blocks, and a final
        block that is not followed by a blank line is still yielded.
        """
        block = []
        for line in self.lines():
            if line:
                block.append(line)
            elif block:
                # A blank line closes the current block; skipping when the
                # block is empty keeps repeated blank lines from producing
                # empty lists that Abstract.from_lines cannot index.
                yield block
                block = []
        if block:
            # The file ended without a trailing blank line: flush the last block.
            yield block

    def abstracts(self):
        """Yield an ``Abstract`` per block, stopping after ``skim`` blocks if set."""
        ab_lines = self.abstract_lines()
        if self.skim:
            ab_lines = islice(ab_lines, self.skim)
        for lines in ab_lines:
            yield Abstract.from_lines(lines)

    def xy(self, vocab):
        """Yield the ``(x, y)`` pairs of every abstract, as produced by ``Abstract.xy``."""
        for abstract in self.abstracts():
            yield from abstract.xy(vocab)

    def token_counts(self):
        """Return a ``Counter`` of token frequencies over all sentences read."""
        counts = Counter()
        for ab in self.abstracts():
            for tokens in ab.sentence_tokens():
                # update() adds the tokens in place; `counts += Counter(tokens)`
                # would build a temporary Counter and rescan the whole
                # accumulated Counter for non-positive entries each time.
                counts.update(tokens)
        return counts

In [69]:
@attr.s
class Abstract:
    """One abstract record: an identifier, a list of tags, and its sentences."""

    identifier = attr.ib()
    tags = attr.ib()
    sentences = attr.ib()

    @classmethod
    def from_lines(cls, lines):
        """Build an instance from a block of lines.

        The first line is the identifier, the second is split on whitespace
        into tags, and every remaining line is kept as a sentence.
        """
        identifier = lines[0]
        tag_line = lines[1]
        return cls(identifier, tag_line.split(), lines[2:])

    def sentence_tokens(self):
        """Yield, per sentence, the list of lowercase a-z runs it contains."""
        for sentence in self.sentences:
            lowered = sentence.lower()
            yield re.findall('[a-z]+', lowered)

    def xy(self, vocab):
        """Yield ``(x, y)`` per sentence.

        ``x`` is a Counter of the sentence's tokens that are in ``vocab``;
        ``y`` is the sentence's zero-based position within this abstract.
        """
        position = 0
        for tokens in self.sentence_tokens():
            features = Counter(tok for tok in tokens if tok in vocab)
            yield features, position
            position += 1

In [70]:
# Training corpus, capped at the first 10000 abstracts through `skim`.
train = Corpus(path='../data/abstracts/train.txt', skim=10000)

In [71]:
# Token frequencies over every sentence of the (skimmed) training corpus.
counts = train.token_counts()

In [72]:
# Vocabulary: the 1000 most frequent training tokens, kept as a set for
# fast membership checks when building features.
vocab = {token for token, _freq in counts.most_common(1000)}

In [73]:
# Vectorizer used below to turn the per-sentence token-count mappings into a
# feature matrix; it is fitted on the training data only.
dv = DictVectorizer()

In [74]:
# Corpus.xy yields (token Counter restricted to vocab, sentence index within
# its abstract) pairs; zip(*...) transposes them into two parallel tuples.
train_x, train_y = zip(*train.xy(vocab))

In [80]:
# Fit the vectorizer on the training Counters and encode them in one step.
# NOTE: this rebinds train_x from the tuple of Counters to the encoded matrix,
# so re-running this cell alone would feed the matrix back into fit_transform;
# re-run the cell that builds train_x first.
train_x = dv.fit_transform(train_x)

In [81]:
# Display the encoded training matrix (its repr shows shape and sparsity).
train_x

<53739x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 819171 stored elements in Compressed Sparse Row format>

In [82]:
# Linear regression model; the target fitted below is each sentence's
# zero-based position within its abstract.
model = LinearRegression()

In [83]:
# Train on the encoded training sentences. `fit` is whatever model.fit returns;
# scikit-learn estimators return themselves from fit, so this is presumably the
# same object as `model` -- TODO confirm if both names are relied upon.
fit = model.fit(train_x, train_y)

In [84]:
# Held-out corpus, capped at the first 10000 abstracts through `skim`.
test = Corpus(path='../data/abstracts/test.txt', skim=10000)

In [85]:
# Build test features with the vocabulary chosen from the training corpus, then
# transpose the (Counter, sentence index) pairs into two parallel tuples.
test_x, test_y = zip(*test.xy(vocab))

In [86]:
# Encode with the vectorizer already fitted on the training data (transform,
# not fit_transform) so the columns match the ones the model was trained on.
# NOTE: rebinds test_x from the tuple of Counters to the encoded matrix.
test_x = dv.transform(test_x)

In [87]:
# R^2 of the predicted sentence positions against the true ones on the test set.
r2_score(y_true=test_y, y_pred=fit.predict(test_x))

0.16580631441074212

In [91]:
# Feature (token) names, indexed the same way as the columns of the encoded
# matrices and therefore as the model coefficients.
# Read the fitted `feature_names_` attribute instead of calling
# `get_feature_names()`, which scikit-learn deprecated in 1.0 and removed in
# 1.2; the attribute is available on fitted DictVectorizers in old and new
# releases alike. list() keeps `names` a plain list, independent of the
# vectorizer's own state.
names = list(dv.feature_names_)

In [92]:
# Coefficient indices ordered from the most negative weight upwards ...
bidx = np.argsort(fit.coef_)
# ... and the same ordering reversed, so the most positive weight comes first.
eidx = np.argsort(fit.coef_)[::-1]

In [93]:
# Print the 50 most negative coefficients next to their tokens.
for i in bidx[:50]:
    coef, word = fit.coef_[i], names[i]
    print(coef, word)

-1.03581946915 report
-1.02835405239 let
-0.914497597299 investigate
-0.846844203299 al
-0.830184587498 article
-0.78163341155 recent
-0.757251426661 examine
-0.741650049954 investigated
-0.74088390302 carlo
-0.723502363229 consider
-0.720023497564 study
-0.688595456941 survey
-0.68076483802 recently
-0.64960052304 telescope
-0.647065037145 manifolds
-0.644762820453 paper
-0.619716594307 studied
-0.604969614124 laser
-0.58657661926 collisions
-0.576231418038 supersymmetric
-0.572544657412 superconducting
-0.571828226862 qcd
-0.558770243668 deep
-0.556927687653 neutrino
-0.548461252921 present
-0.539333841508 atomic
-0.536703146039 heat
-0.515898928237 introduce
-0.507475079512 introduced
-0.50589234049 quark
-0.498517965028 detector
-0.490626183995 canonical
-0.468951680846 basic
-0.466597914503 review
-0.463411717608 sequence
-0.458595662807 taking
-0.449025346291 active
-0.440712797268 infrared
-0.435981039214 known
-0.430630619422 decays
-0.42982338149 lhc
-0.425424811899 dispersion

In [94]:
# Print the 50 most positive coefficients next to their tokens.
for i in eidx[:50]:
    coef, word = fit.coef_[i], names[i]
    print(coef, word)

2.65772562297 finally
1.36091165965 furthermore
1.10032236784 suggests
1.05207885558 likely
1.03983679017 also
1.01671976192 could
0.971557985147 moreover
0.927273223944 suggest
0.914853856362 further
0.888437581213 therefore
0.862116291344 would
0.83410057832 our
0.805941867338 thus
0.763715659562 addition
0.760194778601 et
0.745486412053 future
0.742147928401 agreement
0.702527743588 example
0.683383316532 better
0.660295184184 consistent
0.625101118592 examples
0.614213598269 smaller
0.581573888967 explain
0.569837331063 less
0.540012897876 similar
0.530395535949 dust
0.522486797921 scenario
0.51678362572 fit
0.516417457861 monte
0.513047643445 uv
0.50172526165 required
0.498455515946 latter
0.491942143101 discussed
0.477243388645 application
0.470308389909 abundance
0.457692322662 shock
0.452193220649 additional
0.444230576243 should
0.43806749615 disk
0.437307457201 significant
0.431270219385 then
0.429391853738 section
0.427675138854 larger
0.425860355035 cases
0.425196246248 ear