In [1]:
import os
import pandas as pd
import numpy as np

# Corpus with coordinates

## Number of documents

In [2]:
# Load the article metadata, keep only rows that have a PMID, and store the
# PMID column as integers.
meta = (
    pd.read_csv("../data/metadata.csv", index_col=None, encoding="latin1")
    .dropna(subset=["PMID"])
    .astype({"PMID": int})
)
meta.head(1)

Unnamed: 0,PMID,DOI,KEY,SOURCE,AUTHORS,YEAR,MONTH,JOURNAL,TITLE,PAGES,VOLUME,ABSTRACT_URL,NUM_COORDINATES,MNI_COORDINATES,BRAINMAP_ID,BEHAVIORAL_DOMAIN,EXPERIMENT,DESCRIPTION
0,1402966,,"Dolan R J, 1992",BrainMap,Dolan R J|Bench C J|Brown R G|Scott L C|Fristo...,1992.0,Sep,"Journal of Neurology, Neurosurgery, and Psychi...",Regional cerebral blood flow abnormalities in ...,768-773,55,http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?...,7.0,"-7.66,51.87,-8.33;-5.51,56.46,-4.28;-5.48,58.9...",6030020.0,"['Action.Rest', 'Action.Rest']","['Unimpaired > Impaired', 'Impaired > Unimpair...",Patients with depression who were cognitively ...


In [3]:
# Unique PMIDs of every article listed in the metadata table
coord = {pmid for pmid in meta["PMID"]}
len(coord)

18155

In [4]:
# Unique PMIDs whose SOURCE column is "Neurosynth"
from_neurosynth = meta["SOURCE"] == "Neurosynth"
ns = set(meta.loc[from_neurosynth, "PMID"])
len(ns)

12676

In [5]:
# Unique PMIDs whose SOURCE column is "BrainMap"
from_brainmap = meta["SOURCE"] == "BrainMap"
bm = set(meta.loc[from_brainmap, "PMID"])
len(bm)

3346

In [6]:
# Unique PMIDs whose SOURCE column is "ACE"
from_ace = meta["SOURCE"] == "ACE"
ace = set(meta.loc[from_ace, "PMID"])
len(ace)

2133

In [7]:
# Sanity check: the per-source counts should add up to the total count
sum(map(len, (ns, bm, ace))) == len(coord)

True

## Number of words

In [8]:
def compute_word_count(corpus, corpus_dir="../../nlp/corpus"):
    """Count whitespace-delimited words in the text file of each article.

    Underscores are replaced with spaces before splitting, so an
    underscore-joined token such as "working_memory" counts as two words.

    Parameters
    ----------
    corpus : iterable
        Article identifiers (PMIDs); each is expected to have a file
        named ``<id>.txt`` in ``corpus_dir``.
    corpus_dir : str, optional
        Directory holding the text files. Defaults to the path that was
        previously hard-coded, so existing calls behave the same.

    Returns
    -------
    list of int
        One word count per identifier, in the iteration order of ``corpus``.

    Raises
    ------
    OSError
        If a text file is missing or cannot be opened
        (e.g. ``FileNotFoundError``).
    """
    word_counts = []
    for pmid in corpus:
        path = os.path.join(corpus_dir, "{}.txt".format(pmid))
        # Close each handle as soon as it is read: this runs over thousands
        # of files, and relying on garbage collection to close them can
        # exhaust the process's file-descriptor limit.
        with open(path, "r") as text_file:
            words = text_file.read().replace("_", " ").split()
        word_counts.append(len(words))
    return word_counts

In [9]:
# Per-article word counts for the coordinate corpus, then their total
coord_words = compute_word_count(coord)
coord_total_words = np.sum(coord_words)
coord_total_words

128170267

In [10]:
# Mean number of words per article in the coordinate corpus
coord_mean_words = np.mean(coord_words)
coord_mean_words

7059.777857339576

In [11]:
# Standard deviation of words per article (np.std default, i.e. ddof=0)
coord_std_words = np.std(coord_words)
coord_std_words

2203.8275671295714

# General neuroimaging corpus

## Number of documents

In [12]:
# PMIDs that have a text file in the corpus directory (dot-prefixed entries are skipped)
corpus_pmids = {
    int(name.replace(".txt", ""))
    for name in os.listdir("../../nlp/corpus")
    if not name.startswith(".")
}

# PMIDs listed in the query results, one per line; the context manager
# closes the file handle instead of leaving it to the garbage collector
with open("../../pubmed/query_190428/pmids.txt") as pmid_file:
    query_pmids = {int(line.strip()) for line in pmid_file}

# General corpus = (texts on disk AND in the query) OR articles with coordinates
gen = (corpus_pmids & query_pmids) | coord
len(gen)

29828

In [13]:
# Members of the general corpus that appear in the query results.
# The context manager closes the PMID list file once it has been read.
with open("../../pubmed/query_190428/pmids.txt") as pmid_file:
    gen_pm = {int(line.strip()) for line in pmid_file}
gen_pm = gen_pm.intersection(gen)
len(gen_pm)

20423

In [14]:
# Articles found both in Neurosynth and in the PubMed query
len(gen_pm & ns)

6146

In [15]:
# Neurosynth articles that the PubMed query did not return
len(ns - gen_pm)

6530

In [16]:
# Articles found both in BrainMap and in the PubMed query
len(gen_pm & bm)

1706

In [17]:
# BrainMap articles that the PubMed query did not return
len(bm - gen_pm)

1640

In [18]:
# Articles found both in ACE and in the PubMed query
len(gen_pm & ace)

898

In [19]:
# ACE articles that the PubMed query did not return
len(ace - gen_pm)

1235

In [20]:
# General-corpus articles that are not in the coordinate corpus
len(gen - coord)

11673

## Number of words

In [21]:
# Per-article word counts for the general corpus, then their total
gen_words = compute_word_count(gen)
gen_total_words = np.sum(gen_words)
gen_total_words

203261251

In [22]:
# Mean number of words per article in the general corpus
gen_mean_words = np.mean(gen_words)
gen_mean_words

6814.444515220598

In [23]:
# Standard deviation of words per article (np.std default, i.e. ddof=0)
gen_std_words = np.std(gen_words)
gen_std_words

2548.5354720450955

# Psychiatric neuroimaging corpus

## Number of documents

In [24]:
# PMIDs that have a text file in the psychiatric corpus directory
# (dot-prefixed entries are skipped). An earlier assignment built from
# "../../nlp/corpus" was overwritten before use and has been removed.
psy = {
    int(name.replace(".txt", ""))
    for name in os.listdir("../../dsm/corpus")
    if not name.startswith(".")
}

# Keep only PMIDs listed in the query results; the context manager closes
# the file handle once the list has been read.
# NOTE(review): this reads query_190428, while the cell that builds `psy_pm`
# reads query_190214 — confirm which query directory is intended.
with open("../../dsm/query_190428/pmids.txt") as pmid_file:
    psy = psy.intersection(int(line.strip()) for line in pmid_file)

# Add every article that has coordinates
psy = psy.union(coord)
len(psy)

26070

In [25]:
# Members of the psychiatric corpus that appear in the query results.
# The context manager closes the PMID list file once it has been read.
# NOTE(review): this reads query_190214, while `psy` is built from
# query_190428 — confirm which query directory is intended. The path is
# left unchanged here so the recorded counts stay reproducible.
with open("../../dsm/query_190214/pmids.txt") as pmid_file:
    psy_pm = {int(line.strip()) for line in pmid_file}
psy_pm = psy_pm.intersection(psy)
len(psy_pm)

13954

In [26]:
# Articles found both in Neurosynth and in the psychiatric PubMed query
len(psy_pm & ns)

3794

In [27]:
# Neurosynth articles that the psychiatric PubMed query did not return
len(ns - psy_pm)

8882

In [28]:
# Articles found both in BrainMap and in the psychiatric PubMed query
len(psy_pm & bm)

1156

In [29]:
# BrainMap articles that the psychiatric PubMed query did not return
len(bm - psy_pm)

2190

In [30]:
# Articles found both in ACE and in the psychiatric PubMed query
len(psy_pm & ace)

1089

In [31]:
# ACE articles that the psychiatric PubMed query did not return
len(ace - psy_pm)

1044

In [32]:
# Psychiatric-corpus articles that are not in the coordinate corpus
len(psy - coord)

7915

# Corpus diameters

In [33]:
from math import pi

In [34]:
# For each corpus, print the diameter of a circle whose area equals the
# document count divided by `scalar`.
labels = ["Neurosynth", "BrainMap", "ACE", "PubMed (General)", "PubMed (Psychiatric)"]
corpora = [ns, bm, ace, gen_pm, psy_pm]
scalar = 2500.0
row_template = "{:22s} {:8.3f}\t(n = {})"
for position, label in enumerate(labels):
    corpus = corpora[position]
    n_documents = len(corpus)
    area = n_documents / scalar
    # area = pi * (d / 2) ** 2  =>  d = 2 * sqrt(area / pi)
    diameter = 2 * (area / pi) ** 0.5
    print(row_template.format(label, diameter, n_documents))

Neurosynth                2.541	(n = 12676)
BrainMap                  1.305	(n = 3346)
ACE                       1.042	(n = 2133)
PubMed (General)          3.225	(n = 20423)
PubMed (Psychiatric)      2.666	(n = 13954)
