In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import numpy as np
from scipy.stats import ttest_ind

import extra_scripts.create_vectors as cv
import extra_scripts.utils as utils
from word_task.non_neural.predict import getXy

In [2]:
# Location of the gzipped training triples, relative to this notebook.
TRAIN_PATH = "../data/cmv_triples_train_token.jsonlist.gz"

# Load the prediction items for the training split. The captured log output
# below shows this step taking a few minutes, so avoid re-running it needlessly.
train_items = cv.load_prediction_items(TRAIN_PATH)

2019-09-19 07:37:11,582 - Loading ../data/cmv_triples_train_token.jsonlist.gz.
100%|██████████| 26329/26329 [00:34<00:00, 757.48it/s] 
2019-09-19 07:37:50,361 - Loading ../data/cmv_triples_train_token.jsonlist.gz.npy.
2019-09-19 07:38:42,603 - Splitting vectors into instances.
100%|██████████| 26329/26329 [00:08<00:00, 2965.76it/s]
2019-09-19 07:38:52,339 - Extracting unique stems.
100%|██████████| 26329/26329 [01:50<00:00, 238.61it/s]


In [3]:
def split_by_stop(items, is_stop=None):
    """Partition items by whether their first element is a stopword.

    Args:
        items: Iterable of indexable items; ``item[0]`` is the value handed
            to the predicate (presumably the stem string -- TODO confirm
            against ``cv.load_prediction_items``).
        is_stop: Optional predicate called as ``is_stop(item[0])``. When
            omitted, ``utils.is_stop`` is used.

    Returns:
        A 3-tuple ``(items, items_stop, items_stopless)``: the input object
        itself (not a copy), a list of the items for which the predicate was
        truthy, and a list of the remaining items. Both lists keep the
        original relative order.
    """
    if is_stop is None:
        is_stop = utils.is_stop

    items_stop, items_stopless = [], []
    for item in items:
        # Evaluate the predicate exactly once per item and route the item to
        # the matching bucket, instead of scanning the input twice.
        bucket = items_stop if is_stop(item[0]) else items_stopless
        bucket.append(item)
    return items, items_stop, items_stopless

In [4]:
# Partition the training items into (all, stopword-only, non-stopword) subsets
# and report how many items landed in each.
train_splits = split_by_stop(train_items)
train_all, train_stop, train_stopless = train_splits
print(*map(len, train_splits))

# Build the (X, y) pair for every subset, printing both shapes right after each
# call as a quick sanity check.
X_train_all, y_train_all = getXy(train_all)
print(*(arr.shape for arr in (X_train_all, y_train_all)))

X_train_stop, y_train_stop = getXy(train_stop)
print(*(arr.shape for arr in (X_train_stop, y_train_stop)))

X_train_stopless, y_train_stopless = getXy(train_stopless)
print(*(arr.shape for arr in (X_train_stopless, y_train_stopless)))

5900801 1553843 4346958
(5900801, 66) (5900801,)
(1553843, 66) (1553843,)
(4346958, 66) (4346958,)


In [5]:
def ttest(X, y, feature_labels=None):
    """Print a per-feature two-sample t-test between the two label groups.

    For every feature column, the values from rows whose label is true are
    compared with the values from rows whose label is false using
    ``scipy.stats.ttest_ind`` with ``equal_var=False`` (Welch's t-test).
    One formatted line per feature is printed; nothing is returned.

    Args:
        X: 2-D array indexed as ``X[:, feat_idx]``; column ``i`` is reported
            under ``feature_labels[i]``.
        y: 1-D binary labels, one per row of ``X``. Any dtype is accepted;
            non-zero / truthy entries form the "true" group.
        feature_labels: Optional sequence of column names. Defaults to
            ``cv.FEATURE_LABELS[:-1]`` (the last label is skipped --
            presumably it names the target rather than a feature; TODO
            confirm against ``cv``).
    """
    if feature_labels is None:
        feature_labels = cv.FEATURE_LABELS[:-1]

    # Coerce the labels to a boolean mask once. Indexing with an integer 0/1
    # array would NOT mask: NumPy would treat the values as row positions and
    # silently pick rows 0 and 1 over and over.
    is_true = np.asarray(y, dtype=bool)
    is_false = np.logical_not(is_true)

    for feat_idx, feat in enumerate(feature_labels):
        feature_vals = X[:, feat_idx]

        features_when_true = feature_vals[is_true]
        features_when_false = feature_vals[is_false]

        tstat, pval = ttest_ind(features_when_true, features_when_false, equal_var=False)
        # The format spec already fixes the printed precision, so the values
        # are passed through unrounded to avoid rounding them twice.
        print("{:17} pval: {:.10f} tstat: {:3.3f}".format(feat, pval, tstat))

# Run the per-feature t-test report on each training subset, printing a
# heading before every report.
for heading, X_split, y_split in (
    ("ALL STEMS", X_train_all, y_train_all),
    ("\nJUST STOPWORDS", X_train_stop, y_train_stop),
    ("\nJUST NONSTOPWORDS", X_train_stopless, y_train_stopless),
):
    print(heading)
    ttest(X_split, y_split)

ALL STEMS
OP_PC_LEN_DIFF    pval: 0.0000000000 tstat: -79.745
AVG_TOK_LEN_DIFF  pval: 0.0000000000 tstat: -36.176
OP_LEN            pval: 0.0000000000 tstat: -40.744
PC_LEN            pval: 0.0000000000 tstat: 47.550
OP_PC_POS_DIFF    pval: 0.0000000000 tstat: -72.749
DEPTH             pval: 0.0000000000 tstat: 29.572
IDF               pval: 0.0000000000 tstat: -797.627
STEM_CHARS        pval: 0.0000000000 tstat: -526.671
WORDNET_DEPTH_MIN pval: 0.0000000000 tstat: -22.706
WORDNET_DEPTH_MAX pval: 0.0000000000 tstat: -27.576
TRANSFER_PROB     pval: 0.0000000000 tstat: 757.474
OP_ADP            pval: 0.0000000000 tstat: 202.793
OP_PRON           pval: 0.0000000000 tstat: 177.726
OP_X              pval: 0.0000000000 tstat: -111.535
OP_DET            pval: 0.0000000000 tstat: 205.502
OP_ADJ            pval: 0.0000000000 tstat: -240.681
OP_PROPN          pval: 0.0000000000 tstat: -125.960
OP_VERB           pval: 0.0000000000 tstat: -35.238
OP_PART           pval: 0.0000000000 tstat: 73.063


OP_AUX            pval: 0.0000000000 tstat: 45.750
PC_ADP            pval: 0.0000000000 tstat: -65.221
PC_PRON           pval: 0.0000000000 tstat: -215.293
PC_X              pval: 0.0000000000 tstat: -234.451
PC_DET            pval: 0.0000000000 tstat: -166.622
PC_ADJ            pval: 0.0000000000 tstat: 16.598
PC_PROPN          pval: 0.0000000000 tstat: -27.208
PC_VERB           pval: 0.0000000000 tstat: 82.535
PC_PART           pval: 0.0000000000 tstat: -286.413
PC_CCONJ          pval: 0.0000000000 tstat: -251.703
PC_INTJ           pval: 0.0000000000 tstat: -167.884
PC_NOUN           pval: 0.0000000000 tstat: 136.509
PC_NUM            pval: 0.0000000000 tstat: -87.216
PC_ADV            pval: 0.0000000000 tstat: 38.536
PC_PUNCT          pval: 0.0000000000 tstat: -197.720
PC_SYM            pval: 0.0000000000 tstat: -315.060
PC_AUX            pval: 0.0000000000 tstat: -138.901
OP_SUBJ           pval: 0.0000000000 tstat: 90.624
OP_OBJ            pval: 0.0000000000 tstat: 42.047
OP_OTHER 