# Load the data

In [1]:
import pandas as pd
import sys
# Put the parent directory on the module search path before importing the
# project-local modules below (presumably where `utilities` and `ontology`
# live -- confirm against the repository layout).
sys.path.append("..")
import utilities, ontology

## Brain activation coordinates

In [2]:
# Activation coordinates; the summary below labels rows as documents and
# columns as structures.
act_bin = utilities.load_coordinates()
print("Document N={}, Structure N={}".format(*act_bin.shape[:2]))

Document N=18155, Structure N=118


## Terms for mental functions

In [3]:
# Version identifier passed to the document-term matrix loader
version = 190325
# Load the document-term matrix with binarization requested
dtm_bin = utilities.load_doc_term_matrix(binarize=True, version=version)

In [4]:
# Keep only the "cogneuro" lexicon entries that are also columns of the
# document-term matrix, in sorted order.
lexicon = sorted(set(utilities.load_lexicon(["cogneuro"])) & set(dtm_bin.columns))
len(lexicon)

1683

In [5]:
# Restrict the document-term matrix to the lexicon columns
dtm_bin = dtm_bin.loc[:, lexicon]
print("Document N={}, Term N={}".format(*dtm_bin.shape[:2]))

Document N=18155, Term N=1683


## Document splits

In [6]:
def _load_split(split):
    """Return the PMIDs listed one per line in ../data/splits/<split>.txt as ints.

    The file is opened in a `with` block so its handle is closed promptly,
    and blank lines (e.g. a trailing newline at end of file) are skipped
    instead of crashing the int() conversion.
    """
    with open("../data/splits/{}.txt".format(split)) as split_file:
        return [int(line.strip()) for line in split_file if line.strip()]


train, val = [_load_split(split) for split in ["train", "validation"]]
print("Training N={}, Validation N={}".format(len(train), len(val)))

Training N=12708, Validation N=3631


# Most strongly linked terms by PMI

Links between brain structures and terms are computed as co-occurrences weighted by pointwise mutual information (PMI) across the training set.

In [7]:
import numpy as np

In [8]:
# Build the structure-by-term link matrix from the training documents only
act_train = act_bin.loc[train]
dtm_train = dtm_bin.loc[train]
stm = ontology.load_stm(act_train, dtm_train)
print("Structure N={}, Term N={}".format(*stm.shape[:2]))

Structure N=118, Term N=1634


### Terms most strongly linked to the left amygdala

In [9]:
# Rank every term by its link to the left amygdala and keep the 25 strongest
amyg_links = stm.loc["left_amygdala"]
amyg_terms = amyg_links.sort_values(ascending=False).iloc[:25]
amyg_terms

olfactory_stimulus_transduction                4.286718
auditory_system_function                       3.082745
letter_naming_task                             2.900424
eye_puff                                       2.494958
face_identification_task                       2.440891
pavlovian_conditioning_task                    2.035426
emotion_expression_identification              2.018034
waisinformation                                2.018034
social_norm_processing_task                    1.984133
offensive_aggression                           1.951343
counterconditioning                            1.829982
time_wall                                      1.747744
category_based_induction                       1.696451
wais_picture_completion                        1.671758
behavioral_inhibition_temperament              1.624130
mixed_gamble_task                              1.589841
passive_attention                              1.514129
face_maintenance                               1

In [10]:
# Percentage of documents (over the whole matrix, not only the training
# split) in which each of the selected terms occurs.
freqs = dtm_bin.loc[:, amyg_terms.index].mean().mul(100)
freqs

olfactory_stimulus_transduction                0.005508
auditory_system_function                       0.005508
letter_naming_task                             0.016524
eye_puff                                       0.005508
face_identification_task                       0.022032
pavlovian_conditioning_task                    0.033049
emotion_expression_identification              0.005508
waisinformation                                0.016524
social_norm_processing_task                    0.005508
offensive_aggression                           0.005508
counterconditioning                            0.038557
time_wall                                      0.011016
category_based_induction                       0.005508
wais_picture_completion                        0.016524
behavioral_inhibition_temperament              0.033049
mixed_gamble_task                              0.016524
passive_attention                              0.027541
face_maintenance                               0

In [11]:
# Highest document frequency (in percent) among the selected terms.
# Series.max() is vectorized and skips NaN, whereas the builtin max() iterates
# element by element and gives order-dependent results if a NaN is present.
freqs.max()

0.1817681079592399

In [12]:
# Convert the highest percentage back into a number of documents
max(freqs) * dtm_bin.shape[0] / 100

33.00000000000001

### Structures most strongly linked to *face_identification_task*

In [13]:
# Ten structures with the strongest links to the face identification task
face_links = stm["face_identification_task"]
face_links.sort_values(ascending=False).iloc[:10]

right_parahippocampal_gyrus_anterior_division    2.540766
left_frontal_medial_cortex                       2.467574
right_amygdala                                   2.465049
left_parahippocampal_gyrus_anterior_division     2.448795
right_frontal_medial_cortex                      2.442356
left_amygdala                                    2.440891
right_hippocampus                                2.129478
left_temporal_pole                               2.026462
right_temporal_pole                              2.001289
left_cingulate_gyrus_anterior_division           1.582475
Name: face_identification_task, dtype: float64

# Most strongly associated terms by *r<sub>pb</sub>*

In [14]:
from scipy.stats import pointbiserialr

In [15]:
# NOTE(review): the saved run of this cell failed with FileNotFoundError for
# 'lists/lists_k07_oplen.csv', a path relative to the working directory.
# Confirm how ontology.load_ontology locates its files (or where the notebook
# must be run from) before relying on the cells below.
lists, circuits = ontology.load_ontology(7)
# Show the circuit row(s) whose structure is the left amygdala
circuits[circuits["STRUCTURE"].eq("left_amygdala")]

FileNotFoundError: [Errno 2] File b'lists/lists_k07_oplen.csv' does not exist: b'lists/lists_k07_oplen.csv'

In [None]:
# Candidate word-list lengths; only the largest one is used in this cell
list_lens = range(5, 26)

# Structures assigned to cluster 6 (hard-coded cluster id)
structures = list(circuits.loc[circuits["CLUSTER"] == 6, "STRUCTURE"])

# Select the training rows once instead of re-indexing the full matrix for every term
dtm_train = dtm_bin.loc[train]

# Per-document mean activation over the cluster's structures
centroid = np.mean(act_bin.loc[train, structures], axis=1)

# Point-biserial correlation of each binary term column with the centroid
r_by_term = pd.Series(
    [pointbiserialr(dtm_train[word], centroid)[0] for word in dtm_train.columns],
    index=dtm_train.columns,
)

# Keep positive correlations only (the comparison also drops NaN values),
# strongest first, truncated to the longest list length.
top_r = r_by_term[r_by_term > 0].sort_values(ascending=False)[:max(list_lens)]

R = pd.DataFrame({"TOKEN": top_r.index, "R": top_r.values})
R.head()

In [None]:
# Percentage of documents (over the whole matrix) containing each of the
# top-correlated tokens.
top_tokens = R["TOKEN"].tolist()
freqs = dtm_bin[top_tokens].mean() * 100
freqs

In [None]:
# Highest document frequency (in percent) among the top-correlated tokens.
# Series.max() is vectorized and skips NaN, whereas the builtin max() iterates
# element by element and gives order-dependent results if a NaN is present.
freqs.max()

In [None]:
# Convert the highest percentage back into a number of documents
max(freqs) * dtm_bin.shape[0] / 100