In [4]:
import attr
import re
import numpy as np

from collections import Counter, defaultdict
from itertools import islice

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [5]:
class Corpus:
    """Streams abstracts out of a plain-text corpus file.

    The file is read as groups of non-empty lines separated by blank lines;
    each group is handed to ``Abstract.from_lines`` to build one abstract.

    Args:
        path: Location of the corpus file.
        skim: Optional cap on how many abstracts are read. Any falsy value
            (``None`` or ``0``) means "no cap".
    """

    def __init__(self, path, skim=None):
        self.path = path
        self.skim = skim

    def lines(self):
        """Yield every line of the file with surrounding whitespace stripped."""
        with open(self.path) as fh:
            for line in fh:
                yield line.strip()

    def abstract_lines(self):
        """Yield one list of non-empty lines per blank-line-separated group.

        Runs of consecutive blank lines never produce an empty group, and the
        final group is emitted even when the file has no trailing blank line.
        """
        group = []
        for line in self.lines():
            if line:
                group.append(line)
            elif group:
                # A blank line closes the current group; skip it when nothing
                # has been collected (leading or repeated blank lines).
                yield group
                group = []
        # Flush the last group: the file may end without a blank separator.
        if group:
            yield group

    def abstracts(self):
        """Yield ``Abstract`` objects, at most ``skim`` of them when it is set."""
        ab_lines = self.abstract_lines()
        if self.skim:
            ab_lines = islice(ab_lines, self.skim)
        for lines in ab_lines:
            yield Abstract.from_lines(lines)

    def xy(self, vocab):
        """Yield the ``(x, y)`` pairs of every abstract, one after another.

        Args:
            vocab: Container of tokens, forwarded to ``Abstract.xy``.
        """
        for abstract in self.abstracts():
            yield from abstract.xy(vocab)

    def token_counts(self):
        """Return a ``Counter`` of token frequencies over all abstracts."""
        counts = Counter()
        for ab in self.abstracts():
            for tokens in ab.sentence_tokens():
                counts.update(tokens)
        return counts

In [6]:
@attr.s
class Abstract:
    """One abstract: an identifier, its tags, and its sentences in order."""

    # First line of the abstract's line group.
    identifier = attr.ib()
    # Whitespace-separated tokens of the second line.
    tags = attr.ib()
    # Remaining lines, one sentence per line.
    sentences = attr.ib()

    @classmethod
    def from_lines(cls, lines):
        """Build an abstract from a group of lines.

        Line 0 becomes the identifier, line 1 is split on whitespace into
        tags, and every remaining line is kept as a sentence.

        Raises:
            IndexError: If ``lines`` has fewer than two entries.
        """
        return cls(lines[0], lines[1].split(), lines[2:])

    def sentence_tokens(self):
        """Yield, per sentence, the list of lowercase ``[a-z]+`` runs in it."""
        for sent in self.sentences:
            yield re.findall('[a-z]+', sent.lower())

    def xy(self, vocab):
        """Yield one ``(x, y)`` training pair per sentence.

        ``x`` is a ``Counter`` of the sentence's tokens that are in ``vocab``.
        ``y`` is the sentence's relative position: 0.0 for the first sentence
        up to 1.0 for the last. A single-sentence abstract has no span to
        normalise over, so its only sentence gets ``y = 0.0`` instead of
        raising ``ZeroDivisionError``.

        Args:
            vocab: Container supporting ``in`` with the tokens to keep.
        """
        sent_tokens = list(self.sentence_tokens())
        last = len(sent_tokens) - 1
        for i, tokens in enumerate(sent_tokens):
            x = Counter(t for t in tokens if t in vocab)
            # Guard the one-sentence case, where last == 0.
            y = i / last if last else 0.0
            yield x, y

In [7]:
# Training corpus; skim caps reading at the first 1000000 abstracts.
train = Corpus(path='../data/abstracts/train.txt', skim=1000000)

In [8]:
# Token frequencies over the training corpus; this is a full pass over the file.
counts = train.token_counts()

In [9]:
# Vocabulary: the 2000 most frequent training tokens, as a set for fast lookup.
vocab = {token for token, _ in counts.most_common(2000)}

In [10]:
# Vectorizer used below to turn the per-sentence token Counters into a matrix.
dv = DictVectorizer()

In [11]:
# Split the (x, y) pairs into two parallel tuples. zip(*...) consumes the whole
# generator, so every pair is held in memory at once.
train_x, train_y = zip(*train.xy(vocab))

In [13]:
# Fit the vectorizer on the training Counters and replace them with the matrix.
# NOTE(review): this rebinds train_x, so re-running this cell on its own would
# feed the matrix back into fit_transform instead of the Counters.
train_x = dv.fit_transform(train_x)

In [14]:
# Display the matrix summary (shape, dtype, number of stored elements).
train_x

<4758878x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 83076373 stored elements in Compressed Sparse Row format>

In [15]:
# Linear regression from token counts to a sentence's relative position.
model = LinearRegression()

In [16]:
# Train on the full training matrix; the value returned by fit() is what the
# later cells use for predict() and coef_.
fit = model.fit(train_x, train_y)

In [17]:
# Held-out corpus; skim caps reading at the first 50000 abstracts.
test = Corpus(path='../data/abstracts/test.txt', skim=50000)

In [18]:
# Same (x, y) extraction as for training, using the vocabulary built from the
# training corpus.
test_x, test_y = zip(*test.xy(vocab))

In [19]:
# transform (not fit_transform): reuse the vectorizer already fitted on the
# training data. Like the training cell, this rebinds test_x.
test_x = dv.transform(test_x)

In [20]:
# R^2 of the predicted sentence positions against the true ones on the test set.
r2_score(test_y, fit.predict(test_x))

0.27807718715938701

In [21]:
# Feature (token) names in the vectorizer's column order. The feature_names_
# attribute is the same list get_feature_names() used to return, and it also
# exists on scikit-learn releases where that method has been removed.
names = dv.feature_names_

In [22]:
# Coefficient indices in ascending order: most negative weights first.
bidx = fit.coef_.argsort()
# The same ordering reversed (largest weights first); reuse bidx instead of
# sorting the coefficients a second time.
eidx = np.flip(bidx, 0)

In [23]:
# The 50 tokens with the most negative coefficients, i.e. those that pull the
# predicted position toward the start of the abstract.
for i in bidx[:50]:
    print(fit.coef_[i], names[i])

-0.2196469005 let
-0.163229038677 often
-0.147029501012 report
-0.141095645196 usually
-0.138353329268 investigate
-0.131485351099 aim
-0.131182990641 paper
-0.128837459361 consider
-0.128657935972 widely
-0.128047283648 investigated
-0.12582735115 article
-0.125357963763 phys
-0.119351134853 recently
-0.110511990421 study
-0.110369806736 studied
-0.107565703088 theoretically
-0.104208645257 goal
-0.0977081210248 telescope
-0.0949798582031 presents
-0.0856493461534 superconductor
-0.0855591158579 qcd
-0.0846083395821 called
-0.0830839664757 photometry
-0.0827291581215 electroweak
-0.0813302230393 superconductors
-0.0801618758471 examine
-0.0798485175169 purpose
-0.0788851028823 relativity
-0.078548025611 supersymmetric
-0.07834622373 known
-0.0776480427997 review
-0.0751008840151 recent
-0.0734146320682 difficult
-0.0716923000833 typically
-0.0713598054977 survey
-0.0712463391858 crystals
-0.0695670070584 attention
-0.0679890418682 past
-0.0656842861938 task
-0.0656602326798 human
-0.0

In [24]:
# The 50 tokens with the largest coefficients, i.e. those that pull the
# predicted position toward the end of the abstract.
for i in eidx[:50]:
    print(fit.coef_[i], names[i])

0.367760739846 finally
0.297268892857 conclude
0.21895105414 furthermore
0.204388266138 moreover
0.201514367237 also
0.187678693509 illustrate
0.166043314562 implications
0.15565249018 examples
0.153093124303 suggests
0.15308407343 further
0.15060439333 discussed
0.150043503949 addition
0.149034062511 findings
0.142817336891 application
0.135847771445 briefly
0.134114866671 suggest
0.124569584459 consequence
0.12454008341 indicates
0.119905669278 example
0.118330985559 our
0.117457865096 future
0.112244733099 suggesting
0.111784482636 agreement
0.109328628418 thus
0.107292076219 discussion
0.105541117623 particular
0.104465315487 therefore
0.100865534773 demonstrate
0.100354013203 argue
0.0988056814968 proof
0.0972607217136 then
0.0937141990988 demonstrated
0.0923514784713 results
0.0902866124149 find
0.0898367338515 result
0.0897251566289 could
0.0895961759623 explained
0.0891477674541 tested
0.0875348506369 indicating
0.0874712811641 these
0.0854088390328 this
0.0839150482928 confirm