In [1]:
import ujson
import numpy as np

from glob import glob
from itertools import islice

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
def read_xy(pattern, drop_features=('avg_word_len',)):
    """Stream (features, target) pairs from newline-delimited JSON files.

    Every non-blank line of every file matching ``pattern`` is parsed as a
    JSON object that carries a feature dict under ``'x'`` and a target under
    ``'y'``.

    Args:
        pattern: Glob pattern selecting the input files.
        drop_features: Keys removed from each row's ``'x'`` dict before the
            row is yielded. Keys that a row does not contain are ignored.
            Defaults to dropping ``'avg_word_len'``.

    Yields:
        ``(x, y)`` tuples, where ``x`` is the (pruned) feature dict and ``y``
        is the value stored under ``'y'``.
    """
    # glob() returns paths in arbitrary, filesystem-dependent order. Sort them
    # so that callers taking a fixed-size prefix of this stream (e.g. with
    # islice) see the same rows on every run and on every machine.
    for path in sorted(glob(pattern)):
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(path, encoding='utf-8') as fh:
            for line in fh:
                line = line.strip()
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line:
                    continue
                row = ujson.loads(line)
                features = row['x']
                for key in drop_features:
                    # pop() with a default: a row lacking the key is fine.
                    features.pop(key, None)
                yield features, row['y']

In [3]:
# Lazy generator over the dev split; nothing is read until it is consumed.
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# this directory exists.
iter_dev = read_xy(
    pattern='/Users/dclure/Projects/sent-order/data/xy-dev.json/*.json',
)

In [4]:
# Materialise at most the first 200000 (features, target) pairs of the dev
# stream, then transpose them into a tuple of feature dicts and a tuple of
# targets.
x_dev, y_dev = zip(*list(islice(iter_dev, 200000)))

In [5]:
# Lazy generator over the test split; nothing is read until it is consumed.
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# this directory exists.
iter_test = read_xy(
    pattern='/Users/dclure/Projects/sent-order/data/xy-test.json/*.json',
)

In [6]:
# Materialise at most the first 50000 (features, target) pairs of the test
# stream, then transpose them into a tuple of feature dicts and a tuple of
# targets.
x_test, y_test = zip(*list(islice(iter_test, 50000)))

In [8]:
# Maps feature dicts to matrix columns. sparse=True is the library default,
# spelled out here because the vectorized output is a SciPy sparse matrix.
dv = DictVectorizer(sparse=True)

In [9]:
# Learn the feature -> column mapping from the dev rows and vectorize them.
# Note: this rebinds x_dev from the tuple of feature dicts to the resulting
# matrix, so re-running this cell on its own would pass that matrix back
# into the vectorizer.
x_dev = dv.fit_transform(X=x_dev)

In [10]:
# Display the vectorized dev matrix (its repr shows shape and sparsity).
x_dev

<200000x14364 sparse matrix of type '<class 'numpy.float64'>'
	with 45585470 stored elements in Compressed Sparse Row format>

In [11]:
# Ordinary least-squares regressor; fit_intercept=True is the library
# default, written out for clarity.
model = LinearRegression(fit_intercept=True)

In [12]:
# Fit the regressor on the dev split. scikit-learn estimators return
# themselves from fit(), so `fit` is the fitted `model`.
fit = model.fit(X=x_dev, y=y_dev)

In [13]:
# Vectorize the test rows with the mapping already learned from the dev
# rows (transform only -- no refitting). Note: this rebinds x_test from the
# tuple of feature dicts to the resulting matrix.
x_test = dv.transform(X=x_test)

In [14]:
# Predicted targets for the vectorized test rows.
y_test_pred = fit.predict(X=x_test)

In [15]:
# Coefficient of determination of the predictions against the true test
# targets; left as the cell's last expression so the notebook displays it.
r2_score(y_true=y_test, y_pred=y_test_pred)

0.30951990923549777

In [16]:
# Column index -> feature name lookup for the fitted vectorizer.
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() when available and fall
# back to the old method on older installs. Both results are indexable by
# column position.
names = (
    dv.get_feature_names_out()
    if hasattr(dv, 'get_feature_names_out')
    else dv.get_feature_names()
)

In [17]:
# Feature indices ordered by fitted coefficient.
# bidx: ascending (most negative coefficient first).
bidx = fit.coef_.argsort()
# eidx: descending (most positive first). Reverse the ordering already
# computed instead of sorting the coefficient vector a second time.
eidx = np.flip(bidx, 0)

In [18]:
# Print the 100 most negative coefficients next to their feature names.
for idx in bidx[:100]:
    weight = fit.coef_[idx]
    feature = names[idx]
    print(weight, feature)

-0.727891925867 _pos2_DET_NOUN
-0.507867739868 _tag2_,_WP$
-0.497477218914 _pos3_ADP_PROPN_NUM
-0.472919532695 _text1_whose
-0.458204547581 _text1_'s
-0.441488851993 _pos2_DET_X
-0.421528259557 _shape1_\xxxx$-xxxx
-0.418161270193 _tag3_IN_DT_NNS
-0.414845723092 _tag3_IN_DT_NN
-0.409494118103 _pos2_DET_PUNCT
-0.399446457502 _pos2_DET_PART
-0.383796688512 _text3_are_able_to
-0.379854243894 _pos2_DET_ADV
-0.359992641433 _shape1_^{dd}$Xx
-0.358321037223 _pos2_NUM_PART
-0.351983442893 _text2_by_means
-0.348087713562 _shape1_Xx(ddd
-0.342928555616 _text2_with_respect
-0.336671274742 _text3_so_-_called
-0.328417137891 _pos2_ADV_SPACE
-0.320956141101 _pos2_DET_SYM
-0.314189355137 _shape1_x^d$.
-0.305616497115 _text2_Monte_Carlo
-0.303817228887 _pos2_DET_ADJ
-0.29654298905 _pos2_DET_SPACE
-0.29065619199 _pos2_NOUN_PUNCT
-0.289024340478 _pos2_NOUN_ADV
-0.286907584714 _pos2_VERB_NUM
-0.284679217661 _pos2_DET_INTJ
-0.280051751153 _pos2_NOUN_NOUN
-0.279212593464 _tag2_WRB_TO
-0.276399771199 _pos3_P

In [19]:
# Print the 100 most positive coefficients next to their feature names.
for idx in eidx[:100]:
    weight = fit.coef_[idx]
    feature = names[idx]
    print(weight, feature)

0.472738195899 _text2_,_whose
0.472659814075 _tag1_WP$
0.46555235012 _tag3_IN_NNP_CD
0.43760165545 _shape1_'x
0.429910012944 _pos3_ADP_DET_NOUN
0.425799581286 _shape2_$_\xxxx$-xxxx
0.393924598614 _text3_with_respect_to
0.380937970751 _text3_by_means_of
0.379184507723 _shape2_$_^{dd}$Xx
0.345505765989 _text2_are_able
0.337536348885 _tag2_DT_NN
0.330444325937 _tag2_DT_NNS
0.329694222758 _text1_Carlo
0.303510297992 _tag3_VBD_IN_DT
0.292643094623 _shape2_Xx(ddd_)
0.289718271945 _tag3_VBP_IN_DT
0.287378786893 _tag3_VB_IN_DT
0.287355144759 _text1_Finally
0.284195501191 _tag3_VBZ_IN_DT
0.277887475701 _tag3_VBN_IN_DT
0.269807112501 _tag3_VBG_IN_DT
0.268568226274 _shape2_d,d_)
0.264752825224 _shape1_xxxx.(Xxxxx
0.251788480419 _text2_the_-
0.236873703524 _shape1_dd^{dd
0.230808918478 _tag2_CD_TO
0.224311952845 _tag2_NN_MD
0.222730509816 _text3_,_due_to
0.215294205506 _text1_conclude
0.212025284611 _tag2_CD_POS
0.20868416362 _tag2_UH_JJ
0.208089914901 _shape1_xx/dddd
0.206705805416 _text2_in_agre