In [1]:
from collections import Counter
from functools import partial
import gc
from multiprocessing import Pool

import numpy as np
import pandas as pd
from scipy.stats import f_oneway
from scipy.spatial.distance import squareform
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from tqdm import trange

from make_embedding import *

%load_ext ipycache

  from IPython.utils.traitlets import Unicode


In [2]:
embedding = load_pickle(r'data/general_embedding.pkl').toarray()

In [3]:
vectorizer = load_pickle(r'data/general_vectorizer.pkl')
words = np.array(vectorizer.get_feature_names())
del vectorizer

In [4]:
data = load_pickle(r'data/clean.pkl')
categories = load_json(r'categories-general.json')
subset = select_categories(data, categories)
del data
data = subset

In [5]:
gc.collect()

0

In [6]:
%%cache anova.pkl anova_pvals
_selectors = [data.domain == group for group in categories]
_PVALUE_ID = 1
anova_pvals = np.array([
    f_oneway(*[
        embedding[selector, i] for selector in _selectors
    ])[_PVALUE_ID]
    for i in trange(embedding.shape[1])
])

[Skipped the cell's code and loaded variables anova_pvals from file '/home/gmrukwa/source/publication-domain-discernibility/anova.pkl'.]


100%|##########| 100000/100000 [00:46<00:00, 2169.13it/s]


In [7]:
def tukey(i):
    return pairwise_tukeyhsd(embedding[:, i], data.domain)

_n = embedding.shape[1]
_ends = _n // 4, _n // 2, 3 * (_n // 4), _n

In [8]:
%%cache tukey_1.pkl _tukey1
with Pool(maxtasksperchild=5) as pool:
    _tukey1 = list(pool.map(tukey, trange(_ends[0]), chunksize=1000))

[Skipped the cell's code and loaded variables _tukey1 from file '/home/gmrukwa/source/publication-domain-discernibility/tukey_1.pkl'.]


100%|##########| 25000/25000 [00:00<00:00, 944178.21it/s]


In [9]:
%%cache tukey_2.pkl _tukey2
with Pool(maxtasksperchild=5) as pool:
    _tukey2 = list(pool.map(tukey, trange(_ends[0], _ends[1]), chunksize=1000))

[Skipped the cell's code and loaded variables _tukey2 from file '/home/gmrukwa/source/publication-domain-discernibility/tukey_2.pkl'.]


100%|##########| 25000/25000 [00:00<00:00, 740833.69it/s]


In [10]:
%%cache tukey_3.pkl _tukey3
with Pool(maxtasksperchild=5) as pool:
    _tukey3 = list(pool.map(tukey, trange(_ends[1], _ends[2]), chunksize=1000))

[Skipped the cell's code and loaded variables _tukey3 from file '/home/gmrukwa/source/publication-domain-discernibility/tukey_3.pkl'.]


100%|##########| 25000/25000 [01:44<00:00, 238.75it/s]


In [11]:
%%cache tukey_4.pkl _tukey4
with Pool(maxtasksperchild=5) as pool:
    _tukey4 = list(pool.map(tukey, trange(_ends[2], _ends[3]), chunksize=1000))

[Skipped the cell's code and loaded variables _tukey4 from file '/home/gmrukwa/source/publication-domain-discernibility/tukey_4.pkl'.]


100%|##########| 25000/25000 [02:23<00:00, 174.49it/s]


In [12]:
tukey_results = _tukey1 + _tukey2 + _tukey3 + _tukey4

In [29]:
anova_alpha = 0.001
tukey_alpha = 0.001

In [30]:
# Bonferroni correction
anova_alpha /= embedding.shape[0]

In [31]:
np.sum(anova_pvals <= anova_alpha)

25716

In [32]:
n_unique = data.domain.nunique()
expected_for_marker = n_unique - 1

markers = [
    i for i, v in enumerate(tukey_results)
    if anova_pvals[i] <= anova_alpha
    and np.sum(v.pvalues <= tukey_alpha) == expected_for_marker
    and (squareform(v.pvalues <= tukey_alpha).sum(axis=0) == expected_for_marker).any()
]
upregulated_category = [
    categories[np.argmax(squareform(tukey_results[i].pvalues <= tukey_alpha).sum(axis=0) == expected_for_marker)]
    for i in markers
]
upregulated_counts = Counter(upregulated_category)

In [33]:
len(markers)

19368

In [34]:
tukey_results[markers[0]].summary()

group1,group2,meandiff,p-adj,lower,upper,reject
astronomy,computer science,0.0012,0.001,0.0007,0.0016,True
astronomy,medicine,0.0,0.9,-0.0004,0.0004,False
astronomy,physics,0.0,0.9,-0.0004,0.0005,False
astronomy,psychology,0.0001,0.9,-0.0003,0.0006,False
computer science,medicine,-0.0012,0.001,-0.0016,-0.0007,True
computer science,physics,-0.0011,0.001,-0.0016,-0.0006,True
computer science,psychology,-0.001,0.001,-0.0015,-0.0006,True
medicine,physics,0.0,0.9,-0.0005,0.0005,False
medicine,psychology,0.0001,0.9,-0.0004,0.0006,False
physics,psychology,0.0001,0.9,-0.0004,0.0006,False


In [35]:
words[markers[0]]

'aaai'

In [36]:
upregulated_counts

Counter({'computer science': 2536,
         'astronomy': 5260,
         'medicine': 7381,
         'psychology': 3021,
         'physics': 1170})