- bigrams / trigrams
- part-of-speech (POS) tags
- word count
- average word length

In [1]:
import attr
import re
import numpy as np

from collections import Counter, defaultdict
from itertools import islice
from boltons.iterutils import windowed

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [46]:
class Corpus:
    """Streaming reader for a plain-text corpus of abstracts.

    The file holds one abstract per run of consecutive non-blank lines, with
    blank lines separating the runs. Each run is handed to
    ``Abstract.from_lines``.

    Args:
        path: Location of the corpus file.
        skim: Optional cap on the number of abstracts to read. Any falsy
            value (``None`` or ``0``) means "read everything".
    """

    def __init__(self, path, skim=None):
        self.path = path
        self.skim = skim

    def lines(self):
        """Yield every line of the file with surrounding whitespace stripped."""
        with open(self.path) as fh:
            for line in fh:
                yield line.strip()

    def abstract_lines(self):
        """Yield one list of non-blank lines per abstract.

        Blank lines only act as separators: repeated or leading blank lines
        never produce an empty group, and the final group is emitted even
        when the file does not end with a blank line.
        """
        group = []
        for line in self.lines():
            if line:
                group.append(line)
            elif group:
                yield group
                group = []
        # Flush the last abstract if the file has no trailing blank line.
        if group:
            yield group

    def abstracts(self):
        """Yield ``Abstract`` objects, at most ``skim`` of them if it is set."""
        ab_lines = self.abstract_lines()
        if self.skim:
            ab_lines = islice(ab_lines, self.skim)
        for lines in ab_lines:
            yield Abstract.from_lines(lines)

    def xy(self, vocab):
        """Yield the ``(x, y)`` pairs produced by each abstract's ``xy(vocab)``."""
        for abstract in self.abstracts():
            yield from abstract.xy(vocab)

    def ngram_counts(self, n):
        """Return a ``Counter`` of all order-``n`` n-grams across the corpus."""
        counts = Counter()
        for ab in self.abstracts():
            for sent in ab.sentences:
                counts.update(sent.ngrams(n))
        return counts

    def most_common_ngrams(self, n, depth):
        """Return the set of the ``depth`` most frequent order-``n`` n-grams."""
        counts = self.ngram_counts(n)
        return {ngram for ngram, _ in counts.most_common(depth)}

In [47]:
@attr.s
class Abstract:
    """One abstract: an identifier, its tags and its ordered sentences."""

    identifier = attr.ib()
    tags = attr.ib()
    sentences = attr.ib()

    @classmethod
    def from_lines(cls, lines):
        """Build an abstract from its raw lines.

        Line 0 is taken as the identifier, line 1 as whitespace-separated
        tags, and every remaining line is wrapped in ``Sentence``.

        Raises:
            IndexError: If fewer than two lines are supplied.
        """
        sentences = list(map(Sentence, lines[2:]))
        return cls(lines[0], lines[1].split(), sentences)

    def sentence_tokens(self):
        """Yield, per sentence, the list of lower-cased ``[a-z]+`` tokens."""
        for sent in self.sentences:
            yield re.findall('[a-z]+', sent.lower())

    def xy(self, vocab):
        """Yield ``(features, position)`` for every sentence.

        ``position`` is the sentence index scaled to ``[0, 1]`` (0 for the
        first sentence, 1 for the last). An abstract with a single sentence
        has no meaningful scale, so its sentence is assigned ``0.0`` rather
        than dividing by zero.
        """
        last = len(self.sentences) - 1
        for i, sent in enumerate(self.sentences):
            x = sent.features(vocab)
            y = i / last if last else 0.0
            yield x, y

In [48]:
class Sentence(str):
    """A sentence string that can describe itself as n-gram count features."""

    def ngrams(self, n=1):
        """Yield the word n-grams of order ``n``, joined with underscores.

        Words are the runs of ``a``-``z`` found in the lower-cased text.
        """
        words = re.findall('[a-z]+', self.lower())
        for window in windowed(words, n):
            yield '_'.join(window)

    def ngram_counts(self, vocab, maxn=3):
        """Yield ``(ngram, count)`` pairs for orders 1 through ``maxn``.

        Only n-grams that are members of ``vocab`` are emitted.
        """
        for order in range(1, maxn + 1):
            tally = Counter(self.ngrams(order))
            yield from (
                (gram, freq) for gram, freq in tally.items() if gram in vocab
            )

    def _features(self, vocab):
        """Yield every ``(feature_name, value)`` pair for this sentence."""
        yield from self.ngram_counts(vocab)

    def features(self, vocab):
        """Return the sentence's features as a ``{name: value}`` dict."""
        return {name: value for name, value in self._features(vocab)}

In [49]:
# Training corpus; the second argument (`skim`) caps reading at the first
# 100,000 abstracts.
train = Corpus('../data/abstracts/train.txt', 100000)

In [50]:
# Feature vocabulary: the 2,000 most frequent unigrams, bigrams and trigrams
# of the training corpus, merged into a single set.
vocab = set().union(
    *(train.most_common_ngrams(order, 2000) for order in (1, 2, 3))
)

In [51]:
# Vectorizer that turns the per-sentence feature dicts into a matrix.
dv = DictVectorizer()

In [52]:
# Materialize every (features, position) pair of the training corpus and
# split the pairs into two parallel tuples.
train_x, train_y = zip(*train.xy(vocab))

In [54]:
# Learn the feature-name -> column mapping from the training dicts and encode
# them as a matrix in the same step.
train_x = dv.fit_transform(train_x)

In [55]:
# Display the encoded training matrix (notebook cell output).
train_x

<537608x6000 sparse matrix of type '<class 'numpy.float64'>'
	with 13879702 stored elements in Compressed Sparse Row format>

In [56]:
# Ordinary least-squares regressor used to predict sentence position.
model = LinearRegression()

In [57]:
# scikit-learn's `fit` returns the estimator itself, so `fit` and `model`
# refer to the same fitted object.
fit = model.fit(train_x, train_y)

In [58]:
# Held-out corpus; `skim` caps reading at the first 50,000 abstracts.
test = Corpus('../data/abstracts/test.txt', 50000)

In [59]:
# Extract test features with the vocabulary built from the training corpus.
test_x, test_y = zip(*test.xy(vocab))

In [60]:
# `transform`, not `fit_transform`: reuse the column mapping fitted on the
# training data so test columns line up with the model's coefficients.
test_x = dv.transform(test_x)

In [61]:
# Coefficient of determination (R^2) of the predictions on the test set.
r2_score(test_y, fit.predict(test_x))

0.31164644653867835

In [62]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`. Prefer the new method and fall back to
# the old one on older installations. Either result is indexable by column.
if hasattr(dv, 'get_feature_names_out'):
    names = dv.get_feature_names_out()
else:
    names = dv.get_feature_names()

In [63]:
# Column indices ordered by coefficient: `bidx` runs from most negative to
# most positive, `eidx` is the same ordering reversed.
bidx = np.argsort(fit.coef_)
eidx = bidx[::-1]

In [64]:
# The 50 lowest coefficients: n-grams that pull the predicted position toward
# the start of an abstract (position 0).
for i in bidx[:50]:
    print(fit.coef_[i], names[i])

-0.378732606239 particular_we
-0.318681743544 other_hand
-0.276923208589 of_the_art
-0.225336885815 this_note
-0.223746192725 the_context_of
-0.21295682045 in_addition_to
-0.208830371631 we_study
-0.206363332024 the_help_of
-0.197868956181 with_respect_to
-0.19515577133 this_paper
-0.1945329612 let
-0.183986751552 de_sitter
-0.178615046657 is_considered
-0.178231787319 functional_theory
-0.17324156592 an_application_we
-0.163054934971 the_importance_of
-0.160459851645 in_terms_of
-0.158772493683 this_article
-0.157906053727 is_presented
-0.156809169835 we_investigate
-0.156689843623 is_one_of
-0.155257792639 is_studied
-0.155004558615 we_report
-0.151518151056 often
-0.1478558958 usually
-0.142693107995 the_first_part
-0.141433957471 in_a_recent
-0.140921001432 next_to
-0.140265717868 phys
-0.137217232219 is_devoted_to
-0.135471331932 known_as
-0.133587063545 we_present_results
-0.133567407607 we_present
-0.133113996164 has_been_studied
-0.131176804579 and_therefore
-0.129824568371 com

In [65]:
# The 50 highest coefficients: n-grams that pull the predicted position toward
# the end of an abstract (position 1).
for i in eidx[:50]:
    print(fit.coef_[i], names[i])

0.460668856414 in_particular_we
0.448326464864 the_other_hand
0.375605860872 the_art
0.320759381303 finally
0.251145020152 application_we
0.249359130003 as_a_corollary
0.227539946345 here_we_report
0.207757513213 conclude
0.20053314286 in_this_paper
0.200286770583 in_addition
0.200069308624 with_respect
0.196665660085 comment_on_the
0.196442763941 the_importance
0.189607266924 furthermore
0.182297927864 with_the_help
0.180462888464 monte_carlo
0.180097999793 sitter
0.177293204526 of_this_paper
0.172533917045 an_application
0.170214127339 can_be_understood
0.167598592001 the_proof
0.167023906793 light_on_the
0.166510419429 in_terms
0.165677632105 findings
0.16458813856 implications
0.163934881651 moreover
0.16370468044 density_functional_theory
0.157082418976 able_to
0.151442958338 in_addition_we
0.149677075407 consequence
0.148872269619 digital_sky_survey
0.14715021519 examples
0.13925908221 illustrate
0.138743381508 the_proposed
0.138592746662 also
0.138365638846 are_shown
0.137464291