# Load the data

In [1]:
import pandas as pd
import sys
sys.path.append("..")
import utilities, ontology

## Brain activation coordinates

In [2]:
act_bin = utilities.load_coordinates()
print("Document N={}, Structure N={}".format(act_bin.shape[0], act_bin.shape[1]))

Document N=18155, Structure N=114


## Terms for mental functions

In [3]:
version = 190325
dtm_bin = utilities.load_doc_term_matrix(version=version, binarize=True)

In [4]:
lexicon = utilities.load_lexicon(["cogneuro"])
lexicon = sorted(list(set(lexicon).intersection(dtm_bin.columns)))
len(lexicon)

1683

In [5]:
dtm_bin = dtm_bin[lexicon]
print("Document N={}, Term N={}".format(dtm_bin.shape[0], dtm_bin.shape[1]))

Document N=18155, Term N=1683


## Document splits

In [6]:
train, val = [[int(pmid.strip()) for pmid in open("../data/splits/{}.txt".format(split))] for split in ["train", "validation"]]
print("Training N={}, Validation N={}".format(len(train), len(val)))

Training N=12708, Validation N=3631


# Most strongly linked terms by PMI

Links are computed as PMI-weighted co-occurrences across the training set

In [7]:
import numpy as np

In [9]:
stm = ontology.load_stm(act_bin.loc[train], dtm_bin.loc[train]) 
print("Structure N={}, Term N={}".format(stm.shape[0], stm.shape[1]))

Structure N=114, Term N=1634


### Terms most strongly linked to the left amygdala

In [10]:
amyg_terms = stm.loc["left_amygdala"].sort_values(ascending=False)[:25]
amyg_terms

olfactory_stimulus_transduction                4.273455
auditory_system_function                       3.174843
letter_naming_task                             2.887161
eye_puff                                       2.481696
face_identification_task                       2.427629
waisinformation                                2.039863
emotion_expression_identification              2.039863
pavlovian_conditioning_task                    2.022164
social_norm_processing_task                    1.970870
offensive_aggression                           1.938081
counterconditioning                            1.816720
time_wall                                      1.734482
category_based_induction                       1.708506
wais_picture_completion                        1.658496
behavioral_inhibition_temperament              1.646374
mixed_gamble_task                              1.587878
face_maintenance                               1.500867
passive_attention                              1

In [11]:
freqs = dtm_bin[amyg_terms.index].mean() * 100
freqs

olfactory_stimulus_transduction                0.005508
auditory_system_function                       0.005508
letter_naming_task                             0.016524
eye_puff                                       0.005508
face_identification_task                       0.022032
waisinformation                                0.016524
emotion_expression_identification              0.005508
pavlovian_conditioning_task                    0.033049
social_norm_processing_task                    0.005508
offensive_aggression                           0.005508
counterconditioning                            0.038557
time_wall                                      0.011016
category_based_induction                       0.005508
wais_picture_completion                        0.016524
behavioral_inhibition_temperament              0.033049
mixed_gamble_task                              0.016524
face_maintenance                               0.022032
passive_attention                              0

In [12]:
max(freqs)

0.1817681079592399

In [13]:
max(freqs) * len(dtm_bin) / 100

33.00000000000001

### Structures most strongly linked to *face_identification_task*

In [14]:
stm["face_identification_task"].sort_values(ascending=False)[:10]

right_parahippocampal_gyrus_anterior_division    2.527504
left_frontal_medial_cortex                       2.454311
right_amygdala                                   2.451786
left_parahippocampal_gyrus_anterior_division     2.435533
right_frontal_medial_cortex                      2.429094
left_amygdala                                    2.427629
right_hippocampus                                2.116216
left_temporal_pole                               2.013200
right_temporal_pole                              1.988026
left_cingulate_gyrus_anterior_division           1.569213
Name: face_identification_task, dtype: float64

# Most strongly associated terms by *r<sub>pb</sub>*

In [15]:
from scipy.stats import pointbiserialr

In [16]:
lists, circuits = ontology.load_ontology(7)
circuits.loc[circuits["STRUCTURE"] == "left_amygdala"]

Unnamed: 0,STRUCTURE,CLUSTER
83,left_amygdala,6


In [17]:
list_lens = range(5, 26)
structures = list(circuits.loc[circuits["CLUSTER"] == 6, "STRUCTURE"])
centroid = np.mean(act_bin.loc[train, structures], axis=1)
R = pd.Series([pointbiserialr(dtm_bin.loc[train, word], centroid)[0] 
               for word in dtm_bin.columns], index=dtm_bin.columns)
R = R[R > 0].sort_values(ascending=False)[:max(list_lens)]
R = pd.DataFrame({"TOKEN": R.index, "R": R.values})
R.head()

  r = r_num / r_den


Unnamed: 0,TOKEN,R
0,fear,0.192291
1,emotion,0.188638
2,memory,0.185362
3,declarative_memory,0.168019
4,facial_expression,0.162843


In [18]:
freqs = dtm_bin[list(R["TOKEN"])].mean() * 100
freqs

fear                                      10.817956
emotion                                   18.121730
memory                                    17.736161
declarative_memory                         5.006885
facial_expression                          9.903608
valence                                   11.897549
episodic_memory                           16.293032
emotional_memory                           4.009915
recognition_memory                         9.876067
arousal                                   19.410631
remembering                               10.674745
face                                      12.338199
recognition                               14.811347
picture                                   13.715230
encoding                                  12.453869
emotional_expression                       6.367392
negative_emotion                           9.617185
memory_process                             4.841641
recall                                    18.198843
retrieval   

In [19]:
max(freqs)

19.410630680253373

In [20]:
max(freqs) * len(dtm_bin) / 100

3524.0