# Load the lexicons

In [1]:
import pandas as pd
import preproc

In [2]:
# Data-driven lexicon table. Its TOKEN column feeds lexicon_psyc in the next cell.
# With index_col=None the CSV's unnamed first column is kept as an ordinary
# column (it shows up as "Unnamed: 0" in the preview).
ontol = pd.read_csv("../lexicon/lists_data-driven_lr.csv", index_col=None)
ontol.head()

Unnamed: 0,ORDER,CLUSTER,DOMAIN,TOKEN,R,ROC_AUC
0,1,6,MEMORY,memory,0.199004,0.655879
1,1,6,MEMORY,episodic_memory,0.174621,0.655879
2,1,6,MEMORY,declarative_memory,0.172933,0.655879
3,1,6,MEMORY,recognition_memory,0.16235,0.655879
4,1,6,MEMORY,remembering,0.157813,0.655879


In [3]:
# Unique entries of the TOKEN column, in sorted order.
lexicon_psyc = sorted(set(ontol["TOKEN"]))
len(lexicon_psyc)

90

In [4]:
# Anatomical lexicon table: each row pairs a TERMS entry with the HARVARD_OXFORD
# structure it is listed under, plus its SOURCE and TYPE (see the preview).
anat = pd.read_csv("../lexicon/lexicon_brain.csv", index_col=None)
anat.head()

Unnamed: 0,HARVARD_OXFORD,TERMS,SOURCE,TYPE
0,accumbens,accumbens,Harvard-Oxford,term
1,accumbens,acb,NeuroNames,acronym
2,accumbens,nucleus accumbens,NeuroNames,term
3,accumbens,accumbens nucleus,NeuroNames,term
4,accumbens,nucleus accumbens septi,NeuroNames,term


In [5]:
# From "synonyms" notebook
# Maps a structure name to extra synonym tokens. The values are added to
# lexicon_anat below, and sum_synonyms() adds them to the terms listed for the
# matching HARVARD_OXFORD structure in `anat`.
struct2synonyms = { 
                    'amygdala': ['amygdalar'],
                    'hippocampus': ['hippocampal', 'hippocampal_formation'],
                    'cerebellum': ['cerebellar'],
                    'thalamus': ['thalamic'],
                    'middle_frontal_gyrus': ['mfg'],
                    'caudate': ['caudate_nucleus', 'caudate_head', 'caudate_nucleus_putamen'],
                    'precentral_gyrus': ['precentral'],
                    'superior_frontal_gyrus': ['sfg'],
                    'postcentral_gyrus': ['postcentral'],
                    'lingual_gyrus': ['lingual'],
                    'superior_parietal_lobule': ['spl', 'superior_parietal_gyrus'],
                    'brainstem': ['brain_stem'],
                    'insular_cortex': ['insula', 'insular', 'insular_region'],
                    # NOTE(review): 'frontal_operculum' is listed under frontal_pole rather than
                    # frontal_operculum_cortex -- confirm against the synonyms notebook.
                    'frontal_pole': ['frontopolar', 'frontal_operculum'],
                    'heschls_gyrus': ['heschl', 'hg', 'heschls'],
                    # NOTE(review): 'planum' is listed for both planum_temporale and planum_polare,
                    # so sum_synonyms() credits any 'planum' counts to both structures -- confirm intended.
                    'planum_temporale': ['planum'],
                    'supplementary_motor_cortex': ['supplementary_motor_area', 'sma'],
                    # NOTE(review): 'paracentral' listed as a paracingulate synonym -- confirm.
                    'paracingulate_gyrus': ['paracingulate', 'paracentral'],
                    'planum_polare': ['planum'],
                    'subcallosal_cortex': ['subcallosal'],
                    'frontal_medial_cortex': ['fmc'],
                    'precuneous_cortex': ['precuneous'],
                    # NOTE(review): 'frontal_operculumanterior' looks like two fused tokens -- confirm.
                    'frontal_operculum_cortex': ['frontal_operculumanterior']
                  }

In [6]:
# Seed the anatomical lexicon with every hand-curated synonym, flattened in
# dictionary order (duplicates are kept at this stage).
lexicon_anat = [
    synonym
    for synonym_list in struct2synonyms.values()
    for synonym in synonym_list
]
len(lexicon_anat)

34

In [7]:
# Add every TERMS entry after preprocessing and replacing spaces with
# underscores, then de-duplicate and sort the combined lexicon.
lexicon_anat = sorted(
    set(lexicon_anat)
    | {preproc.preprocess_text(raw_term).replace(" ", "_") for raw_term in anat["TERMS"]}
)
len(lexicon_anat)

281

# Load the corpus PMIDs

In [8]:
# Article metadata table, read as Latin-1. Only the PMID column is used in the
# cells that follow.
df = pd.read_csv("../metadata/metadata.csv", encoding="latin-1")
df.head()

Unnamed: 0,PMID,DOI,KEY,SOURCE,AUTHORS,YEAR,MONTH,JOURNAL,TITLE,PAGES,...,NUM_COORDINATES,MNI_COORDINATES,BRAINMAP_ID,BEHAVIORAL_DOMAIN,EXPERIMENT,DESCRIPTION,IMPACT_FACTOR_2018,CITATIONS,CITATIONS_PER_YEAR,N_SUBJECTS
0,1402966,,"Dolan R J, 1992",BrainMap,Dolan R J|Bench C J|Brown R G|Scott L C|Fristo...,1992,Sep,JOURNAL OF NEUROLOGY NEUROSURGERY & PSYCHIATRY,Regional cerebral blood flow abnormalities in ...,768-773,...,7.0,"-7.66,51.87,-8.33;-5.51,56.46,-4.28;-5.48,58.9...",6030020.0,"['Action.Rest', 'Action.Rest']","['Unimpaired > Impaired', 'Impaired > Unimpair...",Patients with depression who were cognitively ...,8.327,21,0.75,10.0
1,1410086,,"Bench C J, 1992",BrainMap,Bench C J|Friston K J|Brown R G|Scott L C|Frac...,1992,Aug,PSYCHOLOGICAL MEDICINE,The anatomy of melancholia - focal abnormaliti...,607-615,...,10.0,"-13.68,29.54,33.76;-15.76,23.53,38.75;-39.5,38...",6030018.0,"['Action.Rest', 'Action.Rest', 'Action.Rest']","['Normals > Depressed', 'Unimpaired > Impaired...",Patients with depression and normal subjects u...,5.641,94,3.357143,10.0
2,1448149,10.1038/360339a0,"Zatorre R J, 1992",BrainMap,Zatorre R J|Jones-Gotman M|Evans A C|Meyer E,1992,Nov,NATURE,Functional localization and lateralization of ...,339-340,...,6.0,"-21.56,5.58,-27.71;22.3,6.36,-21.58;18.98,41.4...",8110311.0,['Perception.Olfaction'],['Odor vs. Odorless'],Subjects underwent 2 conditions in which they ...,43.07,84,3.0,11.0
3,1486459,,"Demonet J F, 1992",BrainMap,Demonet J F|Chollet F|Ramsay S|Cardebat D|Nesp...,1992,Dec,BRAIN,The anatomy of phonological and semantic proce...,1753-1768,...,30.0,"-58.78,13.84,-0.09;-56.57,-13.98,-2.25;-56.6,-...",30434.0,"['Cognition.Language.Phonology', 'Cognition.La...","['Phonemes - Tones', 'Words - Tones', 'Words -...",Subjects underwent 3 conditions in which they ...,11.814,201,7.178571,9.0
4,1486460,,"Howard D, 1992",BrainMap,Howard D|Patterson K|Wise R J S|Brown W D|Fris...,1992,Dec,BRAIN,The cortical localization of the lexicons: Pos...,1769-1782,...,14.0,"-52.17,-30.19,7.98;56.78,-16.85,5.61;-52.1,-49...",4020008.0,"['Perception.Audition', 'Cognition.Language.Sp...","['Hear and Say - See and Say', 'Word Reading v...",Subjects underwent 4 conditions. Condition 1: ...,11.814,75,2.678571,12.0


In [9]:
# PMIDs cast to integers. They are used below to build the per-article text
# file names and are passed to preproc.fit_dtm alongside the records.
pmids = list(df["PMID"].astype(int))
len(pmids)

18155

# Fit document-term matrices

## Corpus of full texts

In [10]:
# Read one full-text file per PMID, in the same order as `pmids`.
# Each file is opened in a `with` block so its handle is closed as soon as it
# has been read, instead of leaving thousands of handles to the garbage collector.
records = []
for pmid in pmids:
    with open("fulltexts/{}.txt".format(pmid), "r") as text_file:
        records.append(text_file.read())

In [11]:
# Document-term matrix of the full texts over the psychological lexicon.
# preproc.fit_dtm is project code not shown here; its last argument looks like
# an output label (dtm_fulltexts_anat.csv.gz is read back later for the
# analogous anatomical call) -- TODO confirm.
dtm = preproc.fit_dtm(records, pmids, lexicon_psyc, "fulltexts_psyc")
dtm.sum()

action                      80133
anticipation                15714
arousal                     17789
articulation                 2806
attention                   84622
audition                     1041
auditory_perception           760
autobiographical_memory      5413
cognitive                   96273
cognitive_control           23911
cognitive_process            9113
context                     44732
covert                       6007
decision                    36045
decision_making             10841
declarative_memory           2021
emotion                     68157
emotional_memory             2190
encoding                    60711
encoding_task                2188
episodic_memory             11975
episodic_simulation           400
error_detection              2479
execution                   15072
face                       128325
face_perception              4451
facial_expression           18354
familiarity                 14862
fear                        25387
goal          

In [12]:
# Document-term matrix of the full texts over the anatomical lexicon.
# Rebinds `dtm`, replacing the psychological matrix from the previous cell.
# Presumably this call writes dtm_fulltexts_anat.csv.gz, which is read back in
# the synonym-summing section -- TODO confirm in preproc.fit_dtm.
dtm = preproc.fit_dtm(records, pmids, lexicon_anat, "fulltexts_anat")
dtm.sum()

acb                                                81
accumbens                                         715
accumbens_nucleus                                  30
acgg                                                6
amg                                               594
amygdala                                       116973
amygdalar                                        2324
amygdaloid_body                                    19
amygdaloid_nuclear_complex                          2
amygdaloid_nucleus                                130
ang                                               748
anterior_central_gyrus                              3
anterior_cingulate                              52242
anterior_cingulate_gyrus                         5596
anterior_inferior_temporal_gyrus                  126
anterior_middle_temporal_gyrus                    333
anterior_mtg                                      196
anterior_nucleus                                  162
anterior_parahippocampal_gyr

## Corpus of abstracts

In [10]:
# Read one abstract file per PMID, in the same order as `pmids`.
# Each file is opened in a `with` block so its handle is closed as soon as it
# has been read, instead of leaving thousands of handles to the garbage collector.
# Note: this rebinds `records`, replacing the full texts loaded earlier.
records = []
for pmid in pmids:
    with open("abstracts/{}.txt".format(pmid), "r") as text_file:
        records.append(text_file.read())

In [11]:
# Document-term matrix of the abstracts over the psychological lexicon.
# preproc.fit_dtm is project code not shown here; its last argument looks like
# an output label (dtm_abstracts_anat.csv.gz is read back later for the
# analogous anatomical call) -- TODO confirm.
dtm = preproc.fit_dtm(records, pmids, lexicon_psyc, "abstracts_psyc")
dtm.sum()

action                     3843
anticipation                795
arousal                     628
articulation                 86
attention                  3429
audition                     40
auditory_perception          32
autobiographical_memory     223
cognitive                  5647
cognitive_control          1370
cognitive_process           407
context                    2001
covert                      269
decision                   1578
decision_making             563
declarative_memory           95
emotion                    2585
emotional_memory             97
encoding                   2838
encoding_task                73
episodic_memory             586
episodic_simulation          16
error_detection              70
execution                   694
face                       5428
face_perception             138
facial_expression           768
familiarity                 502
fear                       1108
goal                        847
                           ... 
rehearsa

In [12]:
# Document-term matrix of the abstracts over the anatomical lexicon.
# Rebinds `dtm`, replacing the psychological matrix from the previous cell.
# Presumably this call writes dtm_abstracts_anat.csv.gz, which is read back in
# the synonym-summing section -- TODO confirm in preproc.fit_dtm.
dtm = preproc.fit_dtm(records, pmids, lexicon_anat, "abstracts_anat")
dtm.sum()

acb                                               0
accumbens                                        25
accumbens_nucleus                                 0
acgg                                              0
amg                                              12
amygdala                                       5274
amygdalar                                       101
amygdaloid_body                                   0
amygdaloid_nuclear_complex                        0
amygdaloid_nucleus                                0
ang                                              12
anterior_central_gyrus                            0
anterior_cingulate                             3052
anterior_cingulate_gyrus                        216
anterior_inferior_temporal_gyrus                  4
anterior_middle_temporal_gyrus                   13
anterior_mtg                                      5
anterior_nucleus                                  3
anterior_parahippocampal_gyrus                    9
anterior_smg

# Sum across anatomical synonyms

In [13]:
# Unique structure names from the HARVARD_OXFORD column, in sorted order.
lexicon_harvox = sorted(set(anat["HARVARD_OXFORD"]))
len(lexicon_harvox)

57

In [14]:
def sum_synonyms(dtm, lexicon):
    """Collapse synonym columns of a document-term matrix into one column per structure.

    For each structure in ``lexicon`` the synonyms are the ``TERMS`` entries of the
    notebook-level ``anat`` table whose ``HARVARD_OXFORD`` value equals the
    structure, plus any extra terms listed for it in the notebook-level
    ``struct2synonyms`` dict. Every synonym is passed through
    ``preproc.preprocess_text`` (project code, not shown here) and has its spaces
    replaced by underscores before being matched against ``dtm.columns``.

    Parameters
    ----------
    dtm : pandas.DataFrame
        Document-term matrix; rows are documents, columns are term tokens.
    lexicon : iterable of str
        Structure names; each becomes one column of the result.

    Returns
    -------
    pandas.DataFrame
        Same index as ``dtm``, one column per structure holding the row-wise sum
        of that structure's synonym columns. A structure with no synonym column
        in ``dtm`` gets a column of zeros.
    """
    dtm_sums = pd.DataFrame(index=dtm.index)
    for struct in lexicon:
        # Synonyms from the anatomical lexicon table (e.g. NeuroNames entries)...
        synonyms = list(anat.loc[anat["HARVARD_OXFORD"] == struct, "TERMS"])
        # ...plus the hand-curated extras, if any (copying avoids mutating the dict).
        synonyms += struct2synonyms.get(struct, [])
        synonyms = [preproc.preprocess_text(term).replace(" ", "_") for term in synonyms]
        # Different raw terms can normalize to the same token, and a curated extra
        # may repeat a table entry. Selecting a column label twice would add that
        # column twice, so de-duplicate (order-preserving) before summing.
        synonyms = list(dict.fromkeys(synonyms))
        synonyms = [term for term in synonyms if term in dtm.columns]
        dtm_sums[struct] = dtm[synonyms].sum(axis=1)
    return dtm_sums

## Corpus of full texts

In [18]:
# Reload the full-text anatomical DTM from disk, collapse its synonym columns
# into one column per Harvard-Oxford structure, and save the result.
# `dtm` and `dtm_sums` are rebound by the abstracts cell further down, so their
# contents depend on which of the two cells ran last.
dtm = pd.read_csv("dtm_fulltexts_anat.csv.gz", index_col=0, header=0)
dtm_sums = sum_synonyms(dtm, lexicon_harvox)
dtm_sums.to_csv("dtm_fulltexts_anat-syns.csv.gz")
dtm_sums.head()

Unnamed: 0,accumbens,amygdala,angular_gyrus,brainstem,caudate,central_opercular_cortex,cerebellum,cingulate_gyrus_anterior_division,cingulate_gyrus_posterior_division,cuneal_cortex,...,superior_temporal_gyrus_posterior_division,supplementary_motor_cortex,supracalcarine_cortex,supramarginal_gyrus_anterior_division,supramarginal_gyrus_posterior_division,temporal_fusiform_cortex_anterior_division,temporal_fusiform_cortex_posterior_division,temporal_occipital_fusiform_cortex,temporal_pole,thalamus
1402966,0,1,0,0,3,0,26,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1410086,0,2,0,2,1,0,13,11,5,0,...,0,0,0,0,0,0,0,0,0,0
1448149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1486459,0,0,0,0,2,0,0,2,5,0,...,0,2,0,0,0,0,0,0,0,0
1486460,0,0,0,0,5,0,0,4,0,2,...,1,0,0,0,0,0,0,0,0,0


In [19]:
# Per-structure totals over all documents (full-text corpus).
dtm_sums.sum()

accumbens                                          8989
amygdala                                         120420
angular_gyrus                                       799
brainstem                                         30074
caudate                                           49447
central_opercular_cortex                           4629
cerebellum                                        91889
cingulate_gyrus_anterior_division                 60535
cingulate_gyrus_posterior_division                26127
cuneal_cortex                                     12848
frontal_medial_cortex                             12001
frontal_operculum_cortex                           3409
frontal_orbital_cortex                             5202
frontal_pole                                      16757
heschls_gyrus                                     12754
hippocampus                                       91985
inferior_frontal_gyrus_pars_opercularis            7221
inferior_frontal_gyrus_pars_triangularis        

## Corpus of abstracts

In [15]:
# Reload the abstracts anatomical DTM from disk, collapse its synonym columns
# into one column per Harvard-Oxford structure, and save the result.
# Rebinds `dtm` and `dtm_sums`, replacing the full-text versions.
dtm = pd.read_csv("dtm_abstracts_anat.csv.gz", index_col=0, header=0)
dtm_sums = sum_synonyms(dtm, lexicon_harvox)
dtm_sums.to_csv("dtm_abstracts_anat-syns.csv.gz")

In [16]:
# Per-structure totals over all documents (abstracts corpus).
dtm_sums.sum()

accumbens                                         386
amygdala                                         5389
angular_gyrus                                      12
brainstem                                        1402
caudate                                          1748
central_opercular_cortex                          111
cerebellum                                       3710
cingulate_gyrus_anterior_division                3296
cingulate_gyrus_posterior_division               1331
cuneal_cortex                                     254
frontal_medial_cortex                             281
frontal_operculum_cortex                          120
frontal_orbital_cortex                             91
frontal_pole                                      426
heschls_gyrus                                     352
hippocampus                                      3934
inferior_frontal_gyrus_pars_opercularis           173
inferior_frontal_gyrus_pars_triangularis          106
inferior_temporal_gyrus_ante