In [2]:
import itertools, os
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split

# Combine metadata from sources

In [15]:
# Load the article metadata tables from the three sources (BrainMap, Neurosynth, ACE).
# The BrainMap export is decoded as cp858; the other two use the pandas default encoding.
bm = pd.read_csv("brainmap/brainmap_metadata_180809.csv", header=0, index_col=None, encoding="cp858")
ns = pd.read_csv("neurosynth/neurosynth_180805.csv", header=0, index_col=None)
ac = pd.read_csv("ace/ace_180805.csv", header=0, index_col=None)

In [16]:
# Start the combined table from a copy of the BrainMap metadata, with the PMID
# column converted to numbers and downcast to the smallest fitting integer dtype.
df = bm.assign(PMID=pd.to_numeric(bm["PMID"], downcast="integer"))

In [17]:
# Convert the "id" column of the Neurosynth and ACE tables to a compact integer dtype (in place)
for source_table in (ns, ac):
    source_table["id"] = pd.to_numeric(source_table["id"], downcast="integer")

In [18]:
# Tag every row present so far as coming from BrainMap and add an empty coordinate column
for column_name, fill_value in (("SOURCE", "BrainMap"), ("MNI_COORDINATES", "")):
    df[column_name] = fill_value

In [19]:
# Load BrainMap coordinates
# Index the PMIDs by study KEY once, instead of rescanning the KEY column for every study
pmids_by_key = {}
for bm_key, bm_pmid in zip(bm["KEY"], bm["PMID"]):
    pmids_by_key.setdefault(bm_key, []).append(bm_pmid)

# Read the whole file up front so the handle is closed before parsing starts
with open("brainmap/coordinates_180803.txt") as coord_file:
    coord_lines = coord_file.readlines()

coord = {}
# Blank lines separate studies; groupby alternates between separator runs and study runs
for splitter, study in itertools.groupby(coord_lines, lambda line: line == "\n"):
    if splitter:
        continue
    study = list(study)
    # Drop a leading "// " marker and keep the text before the first ": " as the study KEY
    key = study[0].replace("// ", "").split(": ")[0]
    if key not in pmids_by_key:
        continue
    try:
        # Exactly one metadata row must carry this KEY and its PMID must convert to int
        (matched_pmid,) = pmids_by_key[key]
        pmid = int(matched_pmid)
    except (TypeError, ValueError, OverflowError):
        # Report the study and skip it. Falling through here would reuse the PMID of the
        # previously parsed study (or fail with NameError on the first one) and attach
        # these coordinates to the wrong article.
        print(key)
        continue
    # Comment lines start with "//"; every other line is a tab-separated coordinate
    coord.setdefault(pmid, []).extend(
        line.replace("\t", ",").strip() for line in study if not line.startswith("//"))

In [20]:
# Store each study's coordinates on its metadata row as a single ";"-separated string
for i, row in df.iterrows():
    pmid = row["PMID"]
    if pmid in coord:
        # DataFrame.set_value was removed in pandas 1.0; .at is the scalar setter that replaces it
        df.at[i, "MNI_COORDINATES"] = ";".join(coord[pmid])

In [21]:
# Add Neurosynth data
# Snapshot the PMIDs already in the table so each membership test is a set lookup.
# The loop visits every Neurosynth id once, so rows added here can never collide
# with a later id and the snapshot does not need updating.
known_pmids = set(df["PMID"])
new_rows = []
for pmid in sorted(set(ns["id"])):
    if pmid in known_pmids:
        continue
    # One Neurosynth row per coordinate; bibliographic fields are taken from the first row
    rows = ns[ns["id"] == pmid]
    row = rows.iloc[0]
    # Fields with no Neurosynth equivalent are given as empty lists and end up as missing values
    dic = {"BRAINMAP_ID": [], "PMID": [pmid], "KEY": [row["authors"].split(",")[0] + ", " + str(row["year"])], 
           "1st_AUTHOR": [row["authors"].split(",")[0]], "AUTHORS": [row["authors"]], 
           "YEAR": [row["year"]], "TITLE": [row["title"]], "JOURNAL": [row["journal"]], 
           "VOLUME": [], "MONTH": [], "PAGES": [], "BEHAVIORAL_DOMAIN": [], "EXPERIMENT": [], "DESCRIPTION": [], 
           "ABSTRACT_URL": ["http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids={}&dopt=Abstract".format(row["id"])], 
           "NUM_COORDINATES": [], "NUM_SUBJECTS": [], "DOI": [row["doi"]], "SOURCE": ["Neurosynth"], "MNI_COORDINATES": [";".join(list(rows["mni_coord"]))]}
    new_rows.append(pd.DataFrame.from_dict(dic, orient="index").transpose())
# DataFrame.append was removed in pandas 2.0 and copied the whole table on every call;
# concatenate all new rows in a single pass instead.
if new_rows:
    df = pd.concat([df] + new_rows, ignore_index=True)

In [22]:
# Add ACE data
# Snapshot the PMIDs already in the table so each membership test is a set lookup.
# The loop visits every ACE id once, so rows added here can never collide with a
# later id and the snapshot does not need updating.
known_pmids = set(df["PMID"])
new_rows = []
for pmid in sorted(set(ac["id"])):
    if pmid in known_pmids:
        continue
    # One ACE row per coordinate; bibliographic fields are taken from the first row
    rows = ac[ac["id"] == pmid]
    row = rows.iloc[0]
    # Fields with no ACE equivalent are given as empty lists and end up as missing values
    dic = {"BRAINMAP_ID": [], "PMID": [pmid], "KEY": [row["authors"].split(",")[0] + ", " + str(row["year"])], 
           "1st_AUTHOR": [row["authors"].split(",")[0]], "AUTHORS": [row["authors"]], 
           "YEAR": [row["year"]], "TITLE": [row["title"]], "JOURNAL": [row["journal"]], 
           "VOLUME": [], "MONTH": [], "PAGES": [], "BEHAVIORAL_DOMAIN": [], "EXPERIMENT": [], "DESCRIPTION": [], 
           "ABSTRACT_URL": ["http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids={}&dopt=Abstract".format(row["id"])], 
           "NUM_COORDINATES": [], "NUM_SUBJECTS": [], "DOI": [row["doi"]], "SOURCE": ["ACE"], "MNI_COORDINATES": [";".join(list(rows["mni_coord"]))]}
    new_rows.append(pd.DataFrame.from_dict(dic, orient='index').transpose())
# DataFrame.append was removed in pandas 2.0 and copied the whole table on every call;
# concatenate all new rows in a single pass instead.
if new_rows:
    df = pd.concat([df] + new_rows, ignore_index=True)

In [23]:
# Number of distinct PMID values in the combined table
len(set(list(df["PMID"])))

18197

In [37]:
# Columns written to the raw metadata file, in output order.
# The rows built for Neurosynth and ACE also carry "1st_AUTHOR" and "NUM_SUBJECTS";
# those two are not listed here, so they are left out of the export.
columns = ["PMID", "DOI", "KEY", "SOURCE", "AUTHORS", "YEAR", "MONTH", "JOURNAL", 
           "TITLE", "PAGES", "VOLUME", "ABSTRACT_URL", "NUM_COORDINATES", "MNI_COORDINATES", 
           "BRAINMAP_ID", "BEHAVIORAL_DOMAIN", "EXPERIMENT", "DESCRIPTION"]

In [26]:
# Write the combined table to disk, keeping only the columns listed in `columns`
df.to_csv("metadata_raw_180811.csv", index=None, columns=columns)

# Filter by data availability

In [3]:
# Reload the combined metadata from disk so this section can run on its own.
# NOTE(review): the file is written by to_csv without an explicit encoding but read
# here as ISO-8859-1 — confirm non-ASCII author/title text survives the round trip.
df = pd.read_csv("metadata_raw_180811.csv", index_col=None, header=0, encoding="ISO-8859-1")

In [7]:
# IDs for which a coordinate file exists: file names with ".txt" removed, parsed as
# integers; entries starting with "." are skipped
coords_avail = []
for entry in os.listdir("../coordinates/0mm"):
    if entry.startswith("."):
        continue
    coords_avail.append(int(entry.replace(".txt", "")))

# Same listing for the text corpus directory
texts_avail = []
for entry in os.listdir("../../nlp/corpus"):
    if entry.startswith("."):
        continue
    texts_avail.append(int(entry.replace(".txt", "")))

In [8]:
# Keep only the articles that have both a coordinate file and a text file
has_coords = df["PMID"].isin(coords_avail)
has_text = df["PMID"].isin(texts_avail)
df = df[has_coords & has_text]
len(df)

18155

In [9]:
# Check for duplicates: list every PMID that occurs on more than one row
pmid_counts = Counter(df["PMID"])
duplicates = [pmid for pmid in pmid_counts if pmid_counts[pmid] > 1]
duplicates

[]

In [10]:
# Save the availability-filtered metadata; later sections reload it from this file
df.to_csv("metadata_filt_180811.csv", index=None)

# Split into train, validation, and test sets

In [25]:
# Target share of articles in each split.
# Only val_prop and test_prop are used by the split below; the training share is
# whatever remains after both are held out.
train_prop = 0.7
val_prop = 0.2
test_prop = 0.1

In [28]:
# Shuffle the PMIDs into train / validation / test with a fixed seed:
# first hold out validation+test together, then divide that remainder between the two.
ids = [int(pmid) for pmid in df["PMID"]]
holdout_prop = val_prop+test_prop
train, rest = train_test_split(ids, test_size=holdout_prop, random_state=42)
val, test = train_test_split(rest, test_size=test_prop/holdout_prop, random_state=42)

# Report the size of each split and confirm that every ID was assigned
n_assigned = len(train)+len(val)+len(test)
for template, size in (("Length of training set:   {:5d} ({:.2f}%)", len(train)),
                       ("Length of validation set: {:5d} ({:.2f}%)", len(val)),
                       ("Length of test set:       {:5d} ({:.2f}%)", len(test)),
                       ("Assigned IDs:             {:5d} ({:.2f}%)", n_assigned)):
    print(template.format(size, 100*size/len(ids)))

Length of training set:   12708 (70.00%)
Length of validation set:  3631 (20.00%)
Length of test set:        1816 (10.00%)
Assigned IDs:             18155 (100.00%)


In [29]:
# Persist each split as a newline-separated list of PMIDs
for split_path, split_ids in (("splits/train.txt", train),
                              ("splits/validation.txt", val),
                              ("splits/test.txt", test)):
    with open(split_path, "w+") as handle:
        handle.write("\n".join(map(str, split_ids)))

# Compile document-coordinate matrix

In [4]:
# Reload the filtered metadata so this section can run without the cells above.
# NOTE(review): read as ISO-8859-1 although the file is written without an explicit
# encoding — confirm the two agree.
df = pd.read_csv("metadata_filt_180811.csv", index_col=None, header=0, encoding="ISO-8859-1")

In [12]:
# Read the atlas label table without its first (header) line; the context manager
# closes the handle as soon as the lines are in memory
with open("../labels/harvard-oxford_148struct.csv", "r") as label_file:
    inlab = label_file.readlines()[1:]
# Distinct values of the third comma-separated field, in sorted order
labels_bilateral = sorted(set([line.split(",")[2] for line in inlab]))

In [13]:
def gen_label_bilateral(label):
    """Normalize an atlas structure label to the naming used for bilateral labels.

    A label that ends in one of the cerebellar lobule suffixes has every occurrence
    of that suffix replaced by "_cerebellum". The parenthetical fragments listed
    below are then deleted and surrounding whitespace is stripped.

    NOTE(review): each "_vermis_*" suffix is listed after its shorter counterpart
    (e.g. "_vi" before "_vermis_vi"), so the shorter suffix matches first and
    "x_vermis_vi" becomes "x_vermis_cerebellum" — confirm this is intended.
    """
    cerebellar_suffixes = (
        "_iiv", "_v", "_vi", "_vermis_vi", "_crus_i", "_vermis_crus_i",
        "_crus_ii", "_vermis_crus_ii", "_viib", "_vermis_viib", "_viiia",
        "_vermis_viiia", "_viiib", "_vermis_viiib", "_ix", "_vermis_ix",
        "_x", "_vermis_x",
    )
    for suffix in cerebellar_suffixes:
        if not label.endswith(suffix):
            continue
        label = label.replace(suffix, "_cerebellum")

    # Removal order matters: ")" is deleted before "_(includes_h1_and_h2" is looked for
    for fragment in ("juxtapositional_lobule_cortex_(formerly_", ")", "_(includes_h1_and_h2"):
        label = "".join(label.split(fragment))

    return label.strip()

In [18]:
def prob_thres_dcm(df, prob, sigma=0, atlas="bilateral", labs=[]):
    """Write a binary document-by-structure matrix thresholded on atlas probability.

    For every PMID in ``df`` the file ``../coordinates/{sigma}mm/{pmid}.txt`` is read.
    Each line is split on commas, and every entry made of exactly two
    whitespace-separated tokens is taken as ``<label> <value>``; other entries are
    ignored. A structure counts as hit when at least one of its entries has a value
    strictly greater than ``prob``.

    Args:
        df: table with a "PMID" column.
        prob: threshold compared against the raw value found in the file.
        sigma: value substituted into the ``{sigma}mm`` input directory and output name.
        atlas: "bilateral" or "unilateral", selecting the label normalizer. Any other
            value records no hits. gen_label_unilateral is not defined in the visible
            part of this notebook.
        labs: structure labels that become the output columns.

    Writes ``../coordinates/dcm/dcm_{sigma}mm_thres{prob}.csv`` (with "." in ``prob``
    replaced by "p"): one row per PMID, columns "PMID" followed by ``labs``, values 0/1.
    """
    labs = list(labs)  # private copy; also lets callers pass any iterable
    dcm = {}
    for pmid in sorted(list(df["PMID"])):
        # Close each input file right after reading instead of leaking one handle per article
        with open("../coordinates/{}mm/{}.txt".format(sigma, int(pmid)), "r") as coord_file:
            lines = coord_file.readlines()
        # Only presence matters here, so a set replaces the list + list.count() scan
        hits = set()
        for line in lines:
            for struct in line.split(","):
                fields = struct.split()
                if len(fields) != 2:
                    continue
                label, p = fields
                if float(p) > prob:
                    if atlas == "unilateral":
                        hits.add(gen_label_unilateral(label))
                    elif atlas == "bilateral":
                        hits.add(gen_label_bilateral(label))
        record = {"PMID": int(pmid)}
        for label in labs:
            record[label] = 1 if label in hits else 0
        dcm[pmid] = record
    outfile = "../coordinates/dcm/dcm_{}mm_thres{}.csv".format(sigma, str(prob).replace(".", "p"))
    out = pd.DataFrame(dcm).transpose()
    # to_csv opens the path itself. Pre-opening it with "w+" would leave an empty file
    # behind if the export failed, and a later os.path.isfile() check would then
    # mistake that file for a finished result.
    out.to_csv(outfile, index=False, quoting=1, columns=["PMID"] + labs)

In [21]:
# A sweep over range(0,100,5) is disabled; a single threshold is used for both sigma values
prob = 0.01
for sigma in [0, 5]:
    thres_path = "../coordinates/dcm/dcm_{}mm_thres{}.csv".format(sigma, str(prob).replace(".", "p"))
    # Skip the computation when the output file is already on disk
    if os.path.isfile(thres_path):
        continue
    prob_thres_dcm(df, prob, sigma=sigma, atlas="bilateral", labs=labels_bilateral)

In [21]:
# Average probability by structure
def prob_mean_dcm(df, sigma=0, atlas="bilateral", labs=[]):
    """Write a document-by-structure matrix of mean atlas values.

    For every PMID in ``df`` the file ``../coordinates/{sigma}mm/{pmid}.txt`` is read.
    Each line is split on commas, and every entry made of exactly two
    whitespace-separated tokens is taken as ``<label> <value>``; other entries are
    ignored. For each label in ``labs`` the cell holds 0.01 times the mean of the
    values recorded for that label, or 0 when it has none. The 0.01 factor presumably
    converts percentages to proportions — confirm against the input files.

    Args:
        df: table with a "PMID" column.
        sigma: value substituted into the ``{sigma}mm`` input directory and output name.
        atlas: "bilateral" or "unilateral", selecting the label normalizer. Any other
            value records no hits. gen_label_unilateral is not defined in the visible
            part of this notebook.
        labs: structure labels that become the output columns.

    Writes ``../coordinates/dcm/dcm_{sigma}mm_mean.csv``: one row per PMID, columns
    "PMID" followed by ``labs``.
    """
    labs = list(labs)  # private copy; also lets callers pass any iterable
    dcm = {}
    for pmid in sorted(list(df["PMID"])):
        # Close each input file right after reading instead of leaking one handle per article
        with open("../coordinates/{}mm/{}.txt".format(sigma, int(pmid)), "r") as coord_file:
            lines = coord_file.readlines()
        # Group the raw value strings by normalized label in file order, so each
        # label is looked up once instead of rescanning every hit per label
        values_by_label = {}
        for line in lines:
            for struct in line.split(","):
                fields = struct.split()
                if len(fields) != 2:
                    continue
                label, p = fields
                if atlas == "unilateral":
                    values_by_label.setdefault(gen_label_unilateral(label), []).append(p)
                elif atlas == "bilateral":
                    values_by_label.setdefault(gen_label_bilateral(label), []).append(p)
        record = {"PMID": int(pmid)}
        for label in labs:
            probs = [float(p) for p in values_by_label.get(label, [])]
            if len(probs) > 0:
                record[label] = 0.01 * sum(probs) / len(probs)
            else:
                record[label] = 0
        dcm[pmid] = record
    outfile = "../coordinates/dcm/dcm_{}mm_mean.csv".format(sigma)
    out = pd.DataFrame(dcm).transpose()
    # to_csv opens the path itself; pre-opening it with "w+" would leave an empty
    # file behind if the export failed.
    out.to_csv(outfile, index=False, quoting=1, columns=["PMID"] + labs)

In [22]:
# Write the mean-value matrix for each sigma setting (0 and 5), using the bilateral labels
for sigma in [0, 5]:
    prob_mean_dcm(df, sigma=sigma, atlas="bilateral", labs=labels_bilateral)

In [23]:
# Count by structure
def count_dcm(df, sigma=0, atlas="bilateral", labs=[]):
    """Write a document-by-structure matrix of entry counts.

    For every PMID in ``df`` the file ``../coordinates/{sigma}mm/{pmid}.txt`` is read.
    Each line is split on commas, and every entry made of exactly two
    whitespace-separated tokens is taken as ``<label> <value>``; other entries are
    ignored. For each label in ``labs`` the cell holds the number of entries
    recorded for that label.

    Args:
        df: table with a "PMID" column.
        sigma: value substituted into the ``{sigma}mm`` input directory and output name.
        atlas: "bilateral" or "unilateral", selecting the label normalizer. Any other
            value records no hits. gen_label_unilateral is not defined in the visible
            part of this notebook.
        labs: structure labels that become the output columns.

    Writes ``../coordinates/dcm/dcm_{sigma}mm_count.csv``: one row per PMID, columns
    "PMID" followed by ``labs``.
    """
    labs = list(labs)  # private copy; also lets callers pass any iterable
    dcm = {}
    for pmid in sorted(list(df["PMID"])):
        # Close each input file right after reading instead of leaking one handle per article
        with open("../coordinates/{}mm/{}.txt".format(sigma, int(pmid)), "r") as coord_file:
            lines = coord_file.readlines()
        # Group the raw value strings by normalized label so each label is looked
        # up once instead of rescanning every hit per label
        values_by_label = {}
        for line in lines:
            for struct in line.split(","):
                fields = struct.split()
                if len(fields) != 2:
                    continue
                label, p = fields
                if atlas == "unilateral":
                    values_by_label.setdefault(gen_label_unilateral(label), []).append(p)
                elif atlas == "bilateral":
                    values_by_label.setdefault(gen_label_bilateral(label), []).append(p)
        record = {"PMID": int(pmid)}
        for label in labs:
            # float() is kept so a non-numeric value for a requested label still raises
            probs = [float(p) for p in values_by_label.get(label, [])]
            record[label] = len(probs)
        dcm[pmid] = record
    outfile = "../coordinates/dcm/dcm_{}mm_count.csv".format(sigma)
    out = pd.DataFrame(dcm).transpose()
    # to_csv opens the path itself; pre-opening it with "w+" would leave an empty
    # file behind if the export failed.
    out.to_csv(outfile, index=False, quoting=1, columns=["PMID"] + labs)

In [24]:
# Write the count matrix for each sigma setting (0 and 5), using the bilateral labels
for sigma in [0, 5]:
    count_dcm(df, sigma=sigma, atlas="bilateral", labs=labels_bilateral)

# Article sources

In [5]:
# Number of filtered articles whose SOURCE is BrainMap
len(df[df["SOURCE"] == "BrainMap"])

3346

In [6]:
# Number of filtered articles whose SOURCE is ACE
len(df[df["SOURCE"] == "ACE"])

2133

In [7]:
# Number of filtered articles whose SOURCE is Neurosynth
len(df[df["SOURCE"] == "Neurosynth"])

12676