In [67]:
import ujson
import numpy as np

from glob import glob
from itertools import islice

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [35]:
def read_xy(pattern):
    """Stream (features, target) pairs from JSON-lines files matching a glob.

    Each non-blank line is parsed as a JSON object with an ``'x'`` entry (the
    feature dict) and a ``'y'`` entry (the target). The ``'avg_word_len'``
    feature is removed from ``x`` before the pair is yielded.

    Args:
        pattern: Glob pattern selecting the files to read.

    Yields:
        Tuples ``(row['x'], row['y'])``, one per parsed line, files visited
        in sorted path order.
    """
    # glob() returns paths in arbitrary order; sort so that callers taking a
    # prefix of the stream (e.g. via islice) get the same rows on every run.
    for path in sorted(glob(pattern)):
        with open(path, encoding='utf-8') as fh:
            for line in fh:
                line = line.strip()
                # Tolerate blank lines (e.g. a stray newline at end of file).
                if not line:
                    continue
                row = ujson.loads(line)
                # pop() with a default: a row lacking the feature is not an error.
                row['x'].pop('avg_word_len', None)
                yield row['x'], row['y']

In [75]:
# Lazy generator over the dev-split rows; no file is read until it is consumed.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
iter_dev = read_xy('/Users/dclure/Projects/sent-order/data/xy-dev.json/*.json')

In [76]:
# Pull at most the first 200000 (x, y) pairs and split them into two parallel
# tuples. iter_dev is a one-shot generator: re-running this cell without
# recreating it takes the *next* rows (or fails to unpack once it is exhausted).
x_dev, y_dev = zip(*islice(iter_dev, 200000))

In [77]:
# Lazy generator over the test-split rows; no file is read until it is consumed.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
iter_test = read_xy('/Users/dclure/Projects/sent-order/data/xy-test.json/*.json')

In [78]:
# Pull at most the first 50000 (x, y) pairs and split them into two parallel
# tuples. iter_test is a one-shot generator: re-running this cell without
# recreating it takes the *next* rows (or fails to unpack once it is exhausted).
x_test, y_test = zip(*islice(iter_test, 50000))

In [79]:
# Vectorizer that maps the feature dicts to matrix columns; it is fit on the
# dev split and reused unchanged for the test split.
dv = DictVectorizer()

In [80]:
# Fit the vectorizer on the dev feature dicts and replace them with the
# resulting matrix. NOTE: x_dev is rebound here, so re-running this cell alone
# would feed the matrix (not the dicts) back into fit_transform.
x_dev = dv.fit_transform(x_dev)

In [81]:
# Show the matrix repr (shape, dtype, stored-element count) as a sanity check.
x_dev

<200000x4015 sparse matrix of type '<class 'numpy.float64'>'
	with 35934760 stored elements in Compressed Sparse Row format>

In [82]:
# Linear regression with default settings.
model = LinearRegression()

In [83]:
# Train on the vectorized dev split. `fit` is what later cells use as the
# fitted estimator (fit.predict, fit.coef_).
fit = model.fit(x_dev, y_dev)

In [86]:
# Encode the test feature dicts with the already-fitted vectorizer (transform
# only). NOTE: x_test is rebound here, so re-running this cell alone would
# feed the matrix (not the dicts) back into transform.
x_test = dv.transform(x_test)

In [87]:
# Predicted targets for the vectorized test split.
y_test_pred = fit.predict(x_test)

In [92]:
# R^2 of the test predictions against the true test targets.
r2_score(y_test, y_test_pred)

0.31009089071311269

In [93]:
# Feature names in vectorizer column order, so names[i] labels fit.coef_[i].
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and fall back on older
# releases. list() keeps `names` a plain list in both cases.
if hasattr(dv, 'get_feature_names_out'):
    names = list(dv.get_feature_names_out())
else:
    names = dv.get_feature_names()

In [94]:
# Rank feature indices by fitted coefficient: `bidx` is ascending (most
# negative first); `eidx` is that same ranking reversed (most positive first).
bidx = np.argsort(fit.coef_)
eidx = bidx[::-1]

In [95]:
# Print the 100 lowest-coefficient features, one "<coef> <name>" line each.
for i in islice(bidx, 100):
    weight, feature = fit.coef_[i], names[i]
    print(weight, feature)

-0.448116082149 _text3_so_-_called
-0.403654007143 _text2_with_respect
-0.370664589409 _tag3_IN_DT_NNS
-0.364305585213 _tag3_IN_DT_NN
-0.32944212287 _pos2_NOUN_CCONJ
-0.300620222034 _pos2_DET_NOUN
-0.271931607388 _text1_whose
-0.269959080234 _text1_Let
-0.269414145886 _text1_'s
-0.246465593199 _text3_the_framework_of
-0.246106200174 _text2_This_paper
-0.244868714664 _text2_by_means
-0.240463839836 _text3_First_,_we
-0.234039189102 _text2_We_study
-0.227879762816 _pos2_NUM_NOUN
-0.196899347145 _text2_We_report
-0.188669963755 _text2_We_present
-0.186171391632 _text3_is_one_of
-0.172689783179 _text3_In_addition_to
-0.172253865174 _text3_._Rev_.
-0.171848396247 _text3_The_aim_of
-0.167929630171 _text2_We_introduce
-0.154651853022 _text2_We_consider
-0.148213874286 _text3_XMM_-_Newton
-0.146939986971 _pos2_NOUN_ADP
-0.142004334248 _text2_allows_us
-0.140076878609 _text3_is_presented_.
-0.139732949954 _text3_is_given_.
-0.138453736905 _text2_gt_;
-0.138453736905 _text3_&_gt_;
-0.13554914179

In [96]:
# Print the 100 highest-coefficient features, one "<coef> <name>" line each.
for i in islice(eidx, 100):
    weight, feature = fit.coef_[i], names[i]
    print(weight, feature)

0.427495672322 _text3_with_respect_to
0.373971473458 _pos3_ADP_DET_NOUN
0.319716088044 _tag2_DT_NNS
0.317993172032 _tag2_NNS_CC
0.315765827044 _text1_Finally
0.307168213594 _tag2_NN_CC
0.304853105424 _text2_-_called
0.301754686602 _tag2_DT_NN
0.289396932015 _shape1_'x
0.273722039127 _text3_by_means_of
0.24579499559 _tag1_WP$
0.220448704317 _tag2_CD_NN
0.220171938381 _tag2_CD_NNS
0.207475817532 _text1_Furthermore
0.204333079454 _text1_Moreover
0.199819610605 _text2_In_addition
0.199227701329 _text2_so_-
0.188232535535 _text3_Phys_._Rev
0.176822551478 _text1_examples
0.17652684554 _text2_These_results
0.174565501767 _text2_As_an
0.169927039533 _text2_In_particular
0.160231392114 _text3_other_hand_,
0.157300520184 _text1_Carlo
0.154004051112 _text3_We_conclude_that
0.151341401735 _text2_the_framework
0.149966040159 _text3_,_we_can
0.148829872059 _text2_conclude_that
0.148589123677 _text2_We_also
0.146663766515 _text2_As_a
0.144865888286 _text1_suggests
0.142904057375 _text3_The_method_is
