1. Just bag-of-words (BoW)
1. POS (part-of-speech) features
1. Sentence length, word length, etc.

In [117]:
import attr
import re
import numpy as np

from collections import Counter
from itertools import islice
from tqdm import tqdm_notebook as bar

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [67]:
class Corpus:
    """Reader for a text corpus in which abstracts are separated by blank lines.

    Every run of consecutive non-empty lines is treated as one abstract and is
    handed to ``Abstract.from_lines`` for parsing.
    """

    def __init__(self, path):
        """Store the corpus location; the file is only opened when iterated.

        Args:
            path: Path of the corpus text file.
        """
        self.path = path

    def lines(self):
        """Yield each line of the file with surrounding whitespace stripped."""
        with open(self.path) as fh:
            for line in fh:
                yield line.strip()

    def abstract_lines(self):
        """Yield one list of non-empty lines per abstract.

        A blank line closes the current abstract. Runs of several blank lines
        do not produce empty groups, and a final abstract that is not followed
        by a blank line (file ends without a trailing separator) is still
        yielded.
        """
        lines = []
        for line in self.lines():
            if line:
                lines.append(line)
            elif lines:
                # Only emit on a separator when something was collected, so
                # repeated blank lines never yield an empty abstract.
                yield lines
                lines = []
        # Flush the last abstract when the file has no trailing blank line.
        if lines:
            yield lines

    def abstracts(self):
        """Yield an ``Abstract`` built from each group of lines."""
        for lines in self.abstract_lines():
            yield Abstract.from_lines(lines)

    def xy(self):
        """Yield every ``(x, y)`` pair produced by each abstract's ``xy()``."""
        for abstract in self.abstracts():
            yield from abstract.xy()

In [68]:
@attr.s
class Abstract:
    """One abstract: an identifier, a list of tags, and its sentences."""

    # Field order matters: attrs uses it for the positional constructor.
    identifier = attr.ib()
    tags = attr.ib()
    sentences = attr.ib()

    @classmethod
    def from_lines(cls, lines):
        """Build an abstract from its raw lines.

        The first line is the identifier, the second is split on whitespace
        into tags, and everything after that is kept as the sentences.
        """
        identifier = lines[0]
        tags = lines[1].split()
        sentences = lines[2:]
        return cls(identifier, tags, sentences)

    def xy(self):
        """Yield ``(x, y)`` for each sentence.

        ``x`` maps each lowercase alphabetic token to its count in the
        sentence; ``y`` is the sentence's zero-based index in the abstract.
        """
        token_pattern = re.compile('[a-z]+')
        for position, sentence in enumerate(self.sentences):
            counts = Counter(token_pattern.findall(sentence.lower()))
            yield dict(counts), position

In [73]:
# Training corpus; the file is not read until the corpus is iterated.
train = Corpus('../data/abstracts/train.txt')

In [103]:
# Vectorizer that turns the per-sentence token-count dicts into a feature matrix.
dv = DictVectorizer()

In [104]:
# Take the first 300000 (x, y) pairs from the training corpus and split them
# into a tuple of feature dicts and a tuple of targets.
train_pairs = islice(train.xy(), 300000)
train_x, train_y = zip(*train_pairs)

In [105]:
# Learn the vocabulary from the training dicts and vectorize them.
# NOTE: this rebinds train_x from the tuple of dicts to the vectorized matrix,
# so the cell is not idempotent -- re-run the cell that builds train_x first.
train_x = dv.fit_transform(train_x)

In [106]:
# Show the matrix summary (shape, dtype, number of stored elements).
train_x

<300000x68321 sparse matrix of type '<class 'numpy.float64'>'
	with 6363071 stored elements in Compressed Sparse Row format>

In [107]:
# Ordinary least-squares regressor with default settings.
model = LinearRegression()

In [108]:
# Fit on the vectorized training data; later cells use `fit` for prediction
# and for reading the coefficients.
# (scikit-learn's fit() returns the estimator, so `fit` should be the same
# object as `model` -- confirm if relying on that.)
fit = model.fit(train_x, train_y)

In [109]:
# Held-out corpus used for evaluation; read lazily like the training corpus.
test = Corpus('../data/abstracts/test.txt')

In [110]:
# Take the first 100000 (x, y) pairs from the test corpus and split them
# into a tuple of feature dicts and a tuple of targets.
test_pairs = islice(test.xy(), 100000)
test_x, test_y = zip(*test_pairs)

In [111]:
# Vectorize the test dicts with the already-fitted vectorizer (transform only,
# no refit).
# NOTE: this rebinds test_x from the tuple of dicts to the vectorized matrix,
# so re-run the cell that builds test_x before re-running this one.
test_x = dv.transform(test_x)

In [112]:
# R^2 of the model's predictions against the true test targets.
test_pred = fit.predict(test_x)
r2_score(test_y, test_pred)

0.019257919496291853

In [115]:
# Vocabulary terms from the vectorizer; later cells index this with the same
# positions used for fit.coef_.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older versions.
if hasattr(dv, 'get_feature_names_out'):
    names = dv.get_feature_names_out()
else:
    names = dv.get_feature_names()

In [135]:
# Feature indices ordered by coefficient value:
#   bidx -- ascending (most negative coefficient first)
#   eidx -- descending (most positive coefficient first)
bidx = fit.coef_.argsort()
# Reverse the ordering already computed instead of sorting a second time.
eidx = np.flip(bidx, 0)

In [136]:
# Print coefficient and term for the first 50 indices in bidx.
for feature_idx in bidx[:50]:
    weight = fit.coef_[feature_idx]
    term = names[feature_idx]
    print(weight, term)

-19.6203649077 osteoclast
-18.5355845999 trabecular
-17.9417328524 jswt
-17.8524440701 driemel
-15.870660574 madc
-15.7539119351 dsls
-14.7806331809 medal
-14.6025215955 beringei
-14.6025215955 graueri
-14.5399094982 dehydration
-13.6006905655 adt
-13.5092456511 noy
-13.3484313982 awdrat
-13.1112707521 lstar
-12.5992748528 visone
-12.5715964877 mad
-12.4504417208 fltz
-12.2332370437 arkhipov
-12.159701377 moghanjoughi
-12.0216646274 titius
-11.9909223093 perles
-11.9298114633 anowski
-11.8530596779 derepression
-11.5418878314 saraykoy
-11.4951830493 nonpositve
-11.4927466852 aec
-11.4475839104 lanzhou
-11.0821181309 scala
-10.9876494666 wvb
-10.8207547164 taskforce
-10.6124515354 psoriatic
-10.5283636017 winternitz
-10.3999152757 midday
-10.3112545438 sandhas
-10.2914132211 cavi
-9.9905480091 solstice
-9.98230403353 silicalite
-9.92906623756 schlag
-9.84131704077 dcls
-9.81328330168 lascu
-9.75710266655 scab
-9.75710266655 blotch
-9.70761722998 phonetically
-9.545980135 lox
-9.54495913

In [134]:
# Print coefficient and term for the first 50 indices in eidx.
for feature_idx in eidx[:50]:
    weight = fit.coef_[feature_idx]
    term = names[feature_idx]
    print(weight, term)

21.9188804699 superfilters
18.8427232628 chalonge
17.9248559228 dcg
16.6531617316 quarrks
15.5868113333 kno
15.5025898373 heterochromatin
15.2781215252 kulkarni
14.6977091709 ldust
14.4990358164 fukue
14.2208955092 neutrinno
14.1238048741 superpolynomially
13.7912389505 staining
13.7677104394 derma
13.6519161378 aodvers
13.6399354759 opalescent
13.519131663 resemblances
13.4906333744 reification
13.147405749 derenzo
12.8290086543 evora
12.4396013098 grasmannian
12.1291636125 seh
12.0091803957 zhai
12.0078366158 fvm
11.9696597097 nonreversibility
11.8748029108 artir
11.7604292988 mtminer
11.7343740297 interm
11.7078532589 methylstyrene
11.6913473731 bosonn
11.6913410287 albumin
11.6055853223 femur
11.5623085894 welland
11.5285165703 evening
11.5079287314 subsuming
11.3851701523 lepp
11.3718280425 microct
11.2504217227 maria
11.1626172626 warned
11.1360955612 streaked
11.1335989017 joyal
10.9682770438 infelicitous
10.9603912796 curbing
10.9386723446 graviational
10.8795327156 antidiamond