In [1]:
from collections import Counter
from functools import partial
import gc
from multiprocessing import Pool

import numpy as np
import pandas as pd
from scipy.stats import f_oneway
from scipy.spatial.distance import squareform
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from tqdm import trange

from make_embedding import *

%load_ext ipycache

  from IPython.utils.traitlets import Unicode


In [2]:
embedding = load_pickle(r'data/specialized_embedding.pkl').toarray()

In [3]:
vectorizer = load_pickle(r'data/specialized_vectorizer.pkl')
words = np.array(vectorizer.get_feature_names())
del vectorizer

In [4]:
data = load_pickle(r'data/clean.pkl')
categories = load_json(r'categories-specialized.json')
subset = select_categories(data, categories)
del data
data = subset

In [5]:
gc.collect()

0

In [6]:
%%cache anova_specialized.pkl anova_pvals
_selectors = [data.domain == group for group in categories]
_PVALUE_ID = 1
anova_pvals = np.array([
    f_oneway(*[
        embedding[selector, i] for selector in _selectors
    ])[_PVALUE_ID]
    for i in trange(embedding.shape[1])
])

[Skipped the cell's code and loaded variables anova_pvals from file '/home/gmrukwa/source/publication-domain-discernibility/anova_specialized.pkl'.]


100%|##########| 100000/100000 [00:40<00:00, 2471.19it/s]


In [7]:
def tukey(i):
    return pairwise_tukeyhsd(embedding[:, i], data.domain)

_n = embedding.shape[1]
_ends = _n // 4, _n // 2, 3 * (_n // 4), _n

In [8]:
%%cache tukey_1_specialized.pkl _tukey1
with Pool(maxtasksperchild=5) as pool:
    _tukey1 = list(pool.map(tukey, trange(_ends[0]), chunksize=1000))

[Skipped the cell's code and loaded variables _tukey1 from file '/home/gmrukwa/source/publication-domain-discernibility/tukey_1_specialized.pkl'.]


100%|##########| 25000/25000 [00:00<00:00, 615954.32it/s]


In [9]:
%%cache tukey_2_specialized.pkl _tukey2
with Pool(maxtasksperchild=5) as pool:
    _tukey2 = list(pool.map(tukey, trange(_ends[0], _ends[1]), chunksize=1000))

[Skipped the cell's code and loaded variables _tukey2 from file '/home/gmrukwa/source/publication-domain-discernibility/tukey_2_specialized.pkl'.]


100%|##########| 25000/25000 [00:00<00:00, 558519.67it/s]


In [10]:
%%cache tukey_3_specialized.pkl _tukey3
with Pool(maxtasksperchild=5) as pool:
    _tukey3 = list(pool.map(tukey, trange(_ends[1], _ends[2]), chunksize=1000))

[Skipped the cell's code and loaded variables _tukey3 from file '/home/gmrukwa/source/publication-domain-discernibility/tukey_3_specialized.pkl'.]


100%|##########| 25000/25000 [01:52<00:00, 221.25it/s]


In [11]:
%%cache tukey_4_specialized.pkl _tukey4
with Pool(maxtasksperchild=5) as pool:
    _tukey4 = list(pool.map(tukey, trange(_ends[2], _ends[3]), chunksize=1000))

[Skipped the cell's code and loaded variables _tukey4 from file '/home/gmrukwa/source/publication-domain-discernibility/tukey_4_specialized.pkl'.]


100%|##########| 25000/25000 [02:47<00:00, 149.14it/s]


In [12]:
tukey_results = _tukey1 + _tukey2 + _tukey3 + _tukey4

In [21]:
anova_alpha = 0.001
tukey_alpha = 0.001

In [22]:
# Bonferroni correction
anova_alpha /= embedding.shape[0]

In [23]:
np.sum(anova_pvals <= anova_alpha)

16545

In [24]:
n_unique = data.domain.nunique()
expected_for_marker = n_unique - 1

markers = [
    i for i, v in enumerate(tukey_results)
    if anova_pvals[i] <= anova_alpha
    and np.sum(v.pvalues <= tukey_alpha) == expected_for_marker
    and (squareform(v.pvalues <= tukey_alpha).sum(axis=0) == expected_for_marker).any()
]
upregulated_category = [
    categories[np.argmax(squareform(tukey_results[i].pvalues <= tukey_alpha).sum(axis=0) == expected_for_marker)]
    for i in markers
]
upregulated_counts = Counter(upregulated_category)

In [25]:
len(markers)

14287

In [26]:
tukey_results[markers[0]].summary()

group1,group2,meandiff,p-adj,lower,upper,reject
allergies,cancer,-0.0014,0.001,-0.0018,-0.001,True
allergies,cardiovascular,-0.0014,0.001,-0.0018,-0.001,True
allergies,neurodegenerative,-0.0014,0.001,-0.0019,-0.001,True
allergies,psychiatric,-0.0014,0.001,-0.0019,-0.001,True
cancer,cardiovascular,0.0,0.9,-0.0004,0.0004,False
cancer,neurodegenerative,0.0,0.9,-0.0004,0.0004,False
cancer,psychiatric,0.0,0.9,-0.0004,0.0004,False
cardiovascular,neurodegenerative,0.0,0.9,-0.0004,0.0004,False
cardiovascular,psychiatric,0.0,0.9,-0.0004,0.0004,False
neurodegenerative,psychiatric,0.0,0.9,-0.0004,0.0004,False


In [27]:
words[markers[0]]

'aaaai'

In [28]:
upregulated_counts

Counter({'allergies': 3396,
         'cancer': 2543,
         'cardiovascular': 3322,
         'neurodegenerative': 3657,
         'psychiatric': 1369})