# Load the data

In [1]:
import pandas as pd
from utilities import *

## Brain activation coordinates

In [2]:
act_bin = load_coordinates()
print("Document N={}, Structure N={}".format(act_bin.shape[0], act_bin.shape[1]))

Document N=18155, Structure N=114


## Terms for mental functions

In [3]:
version = 190124
dtm_bin = load_doc_term_matrix(version=version, binarize=True)

In [4]:
lexicon = load_lexicon(["cogneuro"])
lexicon = sorted(list(set(lexicon).intersection(dtm_bin.columns)))
len(lexicon)

2208

In [5]:
dtm_bin = dtm_bin[lexicon]
print("Document N={}, Term N={}".format(dtm_bin.shape[0], dtm_bin.shape[1]))

Document N=18155, Term N=2208


## Document splits

In [6]:
train, val = [[int(pmid.strip()) for pmid in open("../data/splits/{}.txt".format(split))] for split in ["train", "validation"]]
print("Training N={}, Validation N={}".format(len(train), len(val)))

Training N=12708, Validation N=3631


# Most strongly linked terms by PMI

Links are computed as PMI-weighted co-occurrences across the training set

In [7]:
import numpy as np

In [8]:
stm = np.dot(act_bin.loc[train].transpose(), dtm_bin.loc[train])
stm = pd.DataFrame(stm, columns=dtm_bin.columns, index=act_bin.columns)
stm = pmi(stm, positive=False)
stm = stm.dropna(axis=1, how="all") # Drop terms with no co-occurrences
print("Structure N={}, Term N={}".format(stm.shape[0], stm.shape[1]))

Structure N=114, Term N=1637


### Terms most strongly linked to the left amygdala

In [9]:
amyg_terms = stm.loc["left_amygdala"].sort_values(ascending=False)[:25]
amyg_terms

olfactory_stimulus_transduction                4.273268
auditory_system_function                       3.174656
letter_naming_task                             2.886974
eye_puff                                       2.481509
face_identification_task                       2.427441
waisinformation                                2.039676
emotion_expression_identification              2.039676
pavlovian_conditioning_task                    2.021976
social_norm_processing_task                    1.970683
offensive_aggression                           1.937893
counterconditioning                            1.816532
time_wall                                      1.734294
category_based_induction                       1.708319
wais_picture_completion                        1.658308
behavioral_inhibition_temperament              1.646187
mixed_gamble_task                              1.587691
passive_attention                              1.500679
face_maintenance                               1

In [10]:
freqs = dtm_bin[amyg_terms.index].mean() * 100
freqs

olfactory_stimulus_transduction                0.005508
auditory_system_function                       0.005508
letter_naming_task                             0.016524
eye_puff                                       0.005508
face_identification_task                       0.027541
waisinformation                                0.016524
emotion_expression_identification              0.005508
pavlovian_conditioning_task                    0.033049
social_norm_processing_task                    0.005508
offensive_aggression                           0.005508
counterconditioning                            0.038557
time_wall                                      0.011016
category_based_induction                       0.005508
wais_picture_completion                        0.016524
behavioral_inhibition_temperament              0.033049
mixed_gamble_task                              0.016524
passive_attention                              0.027541
face_maintenance                               0

In [11]:
max(freqs)

0.1817681079592399

In [12]:
max(freqs) * len(dtm_bin) / 100

33.00000000000001

### Structures most strongly linked to *face_identification_task*

In [13]:
stm["face_identification_task"].sort_values(ascending=False)[:10]

right_parahippocampal_gyrus_anterior_division    2.526130
left_frontal_medial_cortex                       2.453793
right_amygdala                                   2.450999
left_parahippocampal_gyrus_anterior_division     2.435121
right_frontal_medial_cortex                      2.429052
left_amygdala                                    2.427441
right_hippocampus                                2.116207
left_temporal_pole                               2.013696
right_temporal_pole                              1.987670
left_cingulate_gyrus_anterior_division           1.569111
Name: face_identification_task, dtype: float64

# Most strongly associated terms by *r<sub>pb</sub>*

In [14]:
from scipy.stats import pointbiserialr

In [15]:
lists, circuits = load_ontology(7)
circuits.loc[circuits["STRUCTURE"] == "left_amygdala"]

Unnamed: 0,CLUSTER,STRUCTURE
84,6,left_amygdala


In [16]:
list_lens = range(5, 26)
structures = list(circuits.loc[circuits["CLUSTER"] == 6, "STRUCTURE"])
centroid = np.mean(act_bin.loc[train, structures], axis=1)
R = pd.Series([pointbiserialr(dtm_bin.loc[train, word], centroid)[0] 
               for word in dtm_bin.columns], index=dtm_bin.columns)
R = R[R > 0].sort_values(ascending=False)[:max(list_lens)]
R = pd.DataFrame({"TOKEN": R.index, "R": R.values})
R.head()

  r = r_num / r_den


Unnamed: 0,TOKEN,R
0,emotion,0.20432
1,fear,0.202945
2,memory,0.181393
3,valence,0.173991
4,facial_expression,0.17111


In [17]:
freqs = dtm_bin[list(R["TOKEN"])].mean() * 100
freqs

emotion                                   18.253925
fear                                      10.862021
memory                                    17.796750
valence                                   11.936106
facial_expression                          9.969705
declarative_memory                         5.023410
arousal                                   19.476728
emotional_memory                           4.026439
episodic_memory                           16.331589
recognition_memory                         9.909116
face                                      12.360231
remembering                               10.724318
emotional_expression                       6.383916
picture                                   13.742771
negative_emotion                           9.666758
recognition                               14.827871
encoding                                  12.448361
international_affective_picture_system     3.916277
memory_process                             4.874690
anxiety     

In [18]:
max(freqs)

19.476728174056735

In [19]:
max(freqs) * len(dtm_bin) / 100

3536.0