In [1]:
import os, shutil
import numpy as np
import pandas as pd
# Fix the state of NumPy's global random number generator so that the
# random operations in this notebook give the same result on every full run
SEED = 42
np.random.seed(SEED)

# Aggregate the corpus

## Load PMIDs from articles with coordinates

In [2]:
# Read the article metadata table (the file is Latin-1 encoded)
meta = pd.read_csv("data/metadata.csv", encoding="latin1")

# Keep only rows that have a PMID and cast those identifiers to integers
pmid_series = meta["PMID"].dropna().astype(int)
coord_pmids = list(pmid_series.values)

# Report how many articles were loaded, label left-aligned in a 26-character column
count_row = "{:26s} {}"
print(count_row.format("Articles with Coordinates", len(coord_pmids)))

Articles with Coordinates  18155


## Load PMIDs from PubMed search for neuroimaging articles

In [3]:
# Read one PMID per line from the PubMed search export.
# The context manager closes the file handle once reading is done, and blank
# lines (such as a trailing empty line) are skipped instead of crashing int("").
with open("data/pubmed_pmids.txt", "r") as pmid_file:
    pub_pmids = [int(line.strip()) for line in pmid_file if line.strip()]
print("{:26s} {}".format("Articles from PubMed", len(pub_pmids)))

Articles from PubMed       30132


## Copy over available full texts to corpus directory

In [4]:
# Source directory holding one "<pmid>.txt" full text per article. The default is the
# original author's local path; set the CORPUS_SOURCE_DIR environment variable to
# point at a copy of the texts on another machine.
in_dir = (os.environ.get("CORPUS_SOURCE_DIR")
          or "/Users/ehbeam/Dropbox/Stanford/Research/Projects/Thesis/program/nlp/corpus")
corpus_dir = "data/text/corpus"

# shutil.copy raises if the destination directory is missing, so create it up front
os.makedirs(corpus_dir, exist_ok=True)

if not os.path.isdir(in_dir):
    # Without this notice the loop below would silently copy nothing
    print("Warning: source directory not found: {}".format(in_dir))

# Union of both PMID collections. Building the two sets separately works whether
# coord_pmids is still a list or has been rebound to a NumPy array by the shuffle
# step (list + array would not concatenate).
for pmid in set(coord_pmids) | set(pub_pmids):
    in_file = "{}/{}.txt".format(in_dir, pmid)
    out_file = "{}/{}.txt".format(corpus_dir, pmid)
    # Copy only texts that exist at the source and are not already in the corpus
    if os.path.isfile(in_file) and not os.path.isfile(out_file):
        shutil.copy(in_file, out_file)

In [5]:
# Collect every entry of the corpus directory except hidden dot-files
corpus_files = []
for entry in os.listdir("data/text/corpus"):
    if entry.startswith("."):
        continue
    corpus_files.append(entry)

print("{:26s} {}".format("Articles in Corpus", len(corpus_files)))

Articles in Corpus         29828


# Split documents with coordinates into sets

## Shuffle the PMIDs

In [6]:
# Sampling without replacement at full size draws every PMID exactly once,
# i.e. it returns the PMIDs in a random order (as a NumPy array) using
# NumPy's global random number generator
n_coord_pmids = len(coord_pmids)
coord_pmids = np.random.choice(coord_pmids, replace=False, size=n_coord_pmids)

## Split randomly into train, dev, and test sets

Train: 90%, Dev: 5%, Test: 5%

In [7]:
n_docs = len(coord_pmids)

# Slice boundaries: the first 90% of the PMIDs form the training set,
# the next 5% the dev set, and whatever remains the test set
train_end = int(n_docs*0.9)
dev_end = int(n_docs*0.95)
splits = {
    "train": coord_pmids[:train_end],
    "dev": coord_pmids[train_end:dev_end],
    "test": coord_pmids[dev_end:],
}

# Print the size of each split, then their combined size as a sanity check
size_row = "{:7s}{}"
for split, split_ids in splits.items():
    print(size_row.format(split.title(), len(split_ids)))
print(size_row.format("Total", sum(len(ids) for ids in splits.values())))

Train  16339
Dev    908
Test   908
Total  18155


## Export the PMID lists

In [8]:
splits_dir = "data/splits"
# Create the destination directory if it is missing so open() below cannot fail on it
os.makedirs(splits_dir, exist_ok=True)

for split, split_ids in splits.items():
    # Converting element by element works for NumPy arrays and plain lists alike
    split_ids = [str(pmid) for pmid in split_ids]
    # Write one PMID per line with no trailing newline; plain "w" is enough
    # because the file is only written, never read back
    with open("{}/{}.txt".format(splits_dir, split), "w") as outfile:
        outfile.write("\n".join(split_ids))