- bigrams / trigrams
- pos tags
- word count
- avg word length

In [64]:
import attr
import re
import numpy as np

from collections import Counter, defaultdict
from itertools import islice
from boltons.iterutils import windowed
from textblob import TextBlob
from cached_property import cached_property
from tqdm import tqdm_notebook

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [75]:
class Corpus:
    
    def __init__(self, path, skim=None):
        self.path = path
        self.skim = skim
        
    def lines(self):
        with open(self.path) as fh:
            for line in fh:
                yield line.strip()
    
    def abstract_lines(self):
        lines = []
        for line in self.lines():
            if line:
                lines.append(line)
            else:
                yield lines
                lines = []

    def abstracts(self):
        ab_lines = self.abstract_lines()
        if self.skim:
            ab_lines = islice(ab_lines, self.skim)
        for lines in tqdm_notebook(ab_lines, total=self.skim):
            yield Abstract.from_lines(lines)
            
    def xy(self, vocab):
        for abstract in self.abstracts():
            yield from abstract.xy(vocab)
            
    def ngram_counts(self, n):
        counts = defaultdict(lambda: 0)
        for ab in self.abstracts():
            for sent in ab.sentences:
                for ngram in sent.tokens:
                    counts[ngram] += 1
        return Counter(counts)
            
    def most_common_ngrams(self, n, depth):
        counts = self.ngram_counts(n)
        return set([k for k, _ in counts.most_common(depth)])

In [66]:
@attr.s
class Abstract:
    
    identifier = attr.ib()
    tags = attr.ib()
    sentences = attr.ib()
    
    @classmethod
    def from_lines(cls, lines):
        sentences = list(map(Sentence, lines[2:]))
        return cls(lines[0], lines[1].split(), sentences)
    
    def sentence_tokens(self):
        for sent in self.sentences:
            yield re.findall('[a-z]+', sent.lower())
    
    def xy(self, vocab):
        for i, sent in enumerate(self.sentences):
            x = sent.features(vocab)
            y = i / (len(self.sentences)-1)
            yield x, y

In [67]:
class Sentence:
    
    def __init__(self, text):
        self.blob = TextBlob(text)
        
    @cached_property
    def tokens(self):
        return list(self.blob.tokens.lower())
    
    @cached_property
    def tags(self):
        return [pos for _, pos in self.blob.tags]
    
    def token_ngrams(self, n=1):
        for ng in windowed(self.tokens, n):
            yield '_'.join(ng)
    
    def tag_ngrams(self, n=1):
        for ng in windowed(self.tags, n):
            yield '_'.join(ng)
            
    def token_ngram_counts(self, vocab=None, maxn=3):
        for n in range(1, maxn+1):
            counts = Counter(self.token_ngrams(n))
            for k, v in counts.items():
                if not vocab or k in vocab:
                    yield f'_{k}', v
            
    def tag_ngram_counts(self, maxn=3):
        for n in range(1, maxn+1):
            counts = Counter(self.tag_ngrams(n))
            for k, v in counts.items():
                yield f'_{k}', v
                    
    def word_count(self):
        return len(self.tokens)
                
    def _features(self, vocab=None):
        yield from self.token_ngram_counts(vocab)
        yield from self.tag_ngram_counts()
        yield 'word_count', self.word_count()
        
    def features(self, vocab=None):
        return dict(self._features(vocab))

In [68]:
s = Sentence('Does this work?')

In [69]:
s.features(vocab)

{'_?': 1,
 '_DT': 1,
 '_DT_NN': 1,
 '_NN': 1,
 '_VBZ': 1,
 '_VBZ_DT': 1,
 '_VBZ_DT_NN': 1,
 '_does': 1,
 '_this': 1,
 '_work': 1,
 'word_count': 4}

In [78]:
train = Corpus('../data/abstracts/train.txt', 100000)

In [73]:
vocab = (
    train.most_common_ngrams(1, 2000) |
    train.most_common_ngrams(2, 2000) |
    train.most_common_ngrams(3, 2000)
)










In [76]:
dv = DictVectorizer()

In [79]:
train_x, train_y = zip(*train.xy(vocab))




Exception in thread Thread-10:
Traceback (most recent call last):
  File "/usr/local/Cellar/python3/3.6.2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/dclure/Projects/plot-ordering/env/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/local/bin/../Cellar/python3/3.6.2/bin/../Frameworks/Python.framework/Versions/3.6/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






In [80]:
train_x = dv.fit_transform(train_x)

In [81]:
train_x

<537608x19410 sparse matrix of type '<class 'numpy.float64'>'
	with 36863190 stored elements in Compressed Sparse Row format>

In [82]:
model = LinearRegression()

In [83]:
fit = model.fit(train_x, train_y)

In [84]:
test = Corpus('../data/abstracts/test.txt', 50000)

In [85]:
test_x, test_y = zip(*test.xy(vocab))




In [86]:
test_x = dv.transform(test_x)

In [88]:
r2_score(test_y, fit.predict(test_x))

0.2874716160167281

In [89]:
names = dv.get_feature_names()

In [90]:
bidx = fit.coef_.argsort()
eidx = np.flip(fit.coef_.argsort(), 0)

In [91]:
for i in bidx[:50]:
    print(fit.coef_[i], names[i])

-1.97729958413 _VBP_POS_NNP
-1.7858672818 _VBP_POS_JJ
-1.75479987469 _WP_NNP_FW
-1.74683511691 _VBP_CD_PDT
-1.43630120798 _JJR_NNS_PDT
-1.37589765579 _WP_NNP_VBG
-1.31621541914 _DT_SYM_VB
-1.29938732449 _VBG_CD_NNPS
-1.29418290778 _VBP_EX_JJ
-1.25068506219 _NN_WP_CD
-1.2403710758 _CD_CD_NNPS
-1.20856110567 _CC_VBP_VBD
-1.16961240819 _RP_CD_PRP
-1.1695100011 _VB_NNS_PDT
-1.16900952554 _IN_POS_JJ
-1.16864035487 _PRP$_JJS_VBZ
-1.1617502757 _CC_WRB_JJR
-1.15643231369 _VBN_CD_FW
-1.12246027399 _RBR_NNS_JJR
-1.11746242379 _RBS_NNS_TO
-1.10953841158 _FW_CD_EX
-1.10057559815 _WRB_NNP_WRB
-1.08253594402 _EX_CD_JJR
-1.07127560348 _NNS_NNS_PDT
-1.06204471811 _NNP_DT_EX
-1.05188860811 _NN_JJS
-1.03819031383 _PRP$_NNS_JJR
-1.03378498517 _EX_VBD_NN
-1.02830669878 _RBR_NNS_VBZ
-1.02454904922 _RB_NNS_PDT
-1.02043412895 _WP_NNP_VBN
-1.01604754848 _VBG_PRP$_SYM
-1.01260840685 _WDT_NNS_PDT
-0.994820073008 _VBZ_NNS_JJR
-0.991999881253 _VBG_NNP_RBR
-0.989609659332 _POS_VBP_NNP
-0.985906096925 _FW_VBZ_NNP
-

In [92]:
for i in eidx[:50]:
    print(fit.coef_[i], names[i])

1.37065422552 _NNP_WRB_JJS
1.35836709825 _JJ_RP_VBN
1.35371162617 _DT_WRB_JJS
1.23341771585 _NN_WRB_JJS
1.19686541189 _RB_WRB_JJS
1.19278556375 _VBZ_FW_CC
1.15788562133 _NNS_WRB_JJS
1.13517450849 _WRB_VBD_IN
1.13273439557 _VBP_WRB_JJS
1.10697602148 _CC_NNPS_VBN
1.09815106827 _WP_JJ_SYM
1.08840136071 _EX_JJ_CD
1.06987544278 _WP_VBZ_VB
1.06941031868 _NNP_POS_POS
1.06612360962 _PRP_VBN_PRP
1.05868207224 _VB_WRB_JJS
1.04298104308 _VBN_WP_NNS
1.04244598952 _FW_TO_CD
1.01494958372 _WDT_NN_JJS
1.01412248917 _WDT_VBP_POS
1.00203451634 _CD_WRB_JJS
0.995243287313 _VBD_JJ_POS
0.985411157077 _SYM_NN_WDT
0.956414726854 _JJ_RP_CD
0.947220918372 _VBP_VBZ_VB
0.94613930224 _PRP_MD_VBZ
0.941623441394 _VBN_POS_VBP
0.915929314579 _WDT_NN_PDT
0.911308878872 _POS_CC_WDT
0.90999938567 _VBZ_VB_VBG
0.907092339621 _PRP_RBR_IN
0.902050632295 _VBZ_VB_RBR
0.89961313827 _IN_FW_VBN
0.896184110376 _EX_VBD_NNS
0.892431022165 _PRP_IN_VBP
0.890647691314 _NNPS_NN_CD
0.888307351543 _VBZ_FW_DT
0.88319763986 _PRP_NN_WP
0.88