In [1]:
import os, shutil
import numpy as np
import pandas as pd
# Seed NumPy's global random generator so the np.random-based shuffle of PMIDs below is repeatable
np.random.seed(42)

# Split documents with coordinates into sets

## Load PMIDs from articles with coordinates

In [2]:
# Read the article metadata table (Latin-1 encoded CSV)
meta = pd.read_csv("data/metadata.csv", encoding="latin1")

# Keep only rows that have a PMID, cast to integers, and collect them in a list
pmids_present = meta["PMID"].dropna()
coord_pmids = list(pmids_present.astype(int).values)

print("{:26s} {}".format("Documents with Coordinates", len(coord_pmids)))

Documents with Coordinates 18155


## Shuffle the PMIDs

In [3]:
# Sampling every PMID without replacement yields a random permutation of the full list.
# NOTE: this rebinds `coord_pmids`, so re-running the cell without re-running the load
# cell shuffles the already-shuffled values again.
coord_pmids = np.random.choice(coord_pmids, size=len(coord_pmids), replace=False)

## Split randomly into train, dev, and test sets

Train: 90%, Dev: 5%, Test: 5%

In [4]:
# Cut the shuffled PMIDs at the 90% and 95% marks to form train / dev / test
n_docs = len(coord_pmids)
train_end = int(n_docs*0.9)
dev_end = int(n_docs*0.95)
splits = {
    "train": coord_pmids[:train_end],
    "dev": coord_pmids[train_end:dev_end],
    "test": coord_pmids[dev_end:],
}

# Report the size of each set, then the combined size as a sanity check
for split, split_ids in splits.items():
    print("{:7s}{}".format(split.title(), len(split_ids)))
n_total = sum(len(splits[name]) for name in ("train", "dev", "test"))
print("{:7s}{}".format("Total", n_total))

Train  16339
Dev    908
Test   908
Total  18155


## Export the PMID lists

In [5]:
# open() does not create missing directories, so make sure the target exists first
os.makedirs("data/splits", exist_ok=True)

# Write one file per split, with one PMID per line (no trailing newline)
for split, split_ids in splits.items():
    # Convert the PMIDs to strings so they can be joined into text
    split_ids = list(split_ids.astype(str))
    with open("data/splits/{}.txt".format(split), "w+") as outfile:
        outfile.write("\n".join(split_ids))

# Aggregate the corpus

## Load PMIDs from PubMed search for neuroimaging articles

In [6]:
# Read one PMID per line. The context manager closes the file handle, and blank
# lines (e.g. a trailing empty line) are skipped instead of crashing int("").
with open("data/pubmed_pmids.txt", "r") as infile:
    pub_pmids = [int(line.strip()) for line in infile if line.strip()]
print("{:26s} {}".format("Documents from PubMed", len(pub_pmids)))

Documents from PubMed      30132


## Copy over available full texts to corpus directory

In [7]:
# NOTE(review): absolute, machine-specific source directory; the copy below is a
# no-op for any PMID whose text file is not found there.
in_dir = "/Users/ehbeam/Dropbox/Stanford/Research/Projects/Thesis/program/nlp/corpus"

# Every PMID from either source, de-duplicated
all_pmids = set(list(coord_pmids) + pub_pmids)

for pmid in all_pmids:
    in_file = "{}/{}.txt".format(in_dir, pmid)
    out_file = "data/text/corpus/{}.txt".format(pmid)
    # Nothing to copy if the source text is unavailable
    if not os.path.isfile(in_file):
        continue
    # Leave texts that were already copied untouched
    if os.path.isfile(out_file):
        continue
    shutil.copy(in_file, out_file)

In [8]:
# Collect the corpus directory entries, ignoring hidden files (names starting with ".")
corpus_files = []
for entry in os.listdir("data/text/corpus"):
    if entry.startswith("."):
        continue
    corpus_files.append(entry)
print("{:26s} {}".format("Documents in Corpus", len(corpus_files)))

Documents in Corpus        29828


## Specify the lexicon

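The lexicon is the intersection of the terms in the vector space model (VSM) and the document-term matrix (DTM), plus the special tokens `<eos>` and `<unk>`.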

### Vector space model

In [9]:
# Load the space-delimited embedding table: the first column becomes the index
# and the first row is treated as the header
vsm = pd.read_csv(
    "data/text/glove_gen_n100_win15_min5_iter500_190428.txt",
    sep=" ",
    index_col=0,
    header=0,
)

# Rows are vocabulary entries, columns are embedding dimensions
n_vocab, n_emb = vsm.shape
print("{:21s}{}".format("Embedding Dimension", n_emb))
print("{:21s}{}".format("Terms in VSM", n_vocab))

Embedding Dimension  100
Terms in VSM         350543


### Document-term matrix

In [10]:
from utilities import load_dtm

In [11]:
# Load the document-term matrix via the project helper and report its dimensions
dtm_bin = load_dtm()
dtm_shape = dtm_bin.shape
n_terms = dtm_shape[1]
print("{:21s}{}".format("Documents", dtm_shape[0]))
print("{:21s}{}".format("Terms in DTM", n_terms))

Documents            18155
Terms in DTM         1683


### Lexicon of mental function terms

In [12]:
# Terms that appear both as DTM columns and as VSM index entries
shared_terms = dtm_bin.columns.intersection(vsm.index)

# Append the two special tokens after the shared terms
lexicon = list(shared_terms) + ["<eos>", "<unk>"]

# Save the lexicon, one term per line
with open("data/text/lexicon.txt", "w+") as file:
    file.write("\n".join(lexicon))
print("{:21s}{}".format("Terms in Lexicon", len(lexicon)))

Terms in Lexicon     1544


## Compile the LSTM corpora

**Training set:**  (1) The training split of the documents with coordinates, plus (2) documents from the PubMed search that have no coordinates, restricted to documents whose full text is available in the corpus.

**Dev and test sets:**  Splits from above consisting of documents with coordinates.

In [13]:
# PubMed search results that are not among the documents with coordinates
coord_set = set(coord_pmids)
pm_only = set(pub_pmids).difference(coord_set)

# PMIDs recovered from the corpus file names (the ".txt" suffix is removed)
texts_avail = [int(name.replace(".txt", "")) for name in corpus_files]

# Training candidates are the train split plus the PubMed-only documents;
# keep only those with a text file in the corpus
train_candidates = set(splits["train"]).union(pm_only)
lstm_set = train_candidates.intersection(texts_avail)
print("{:28s} {}".format("Documents in LSTM Train Set", len(lstm_set)))

Documents in LSTM Train Set  28012


In [14]:
# NOTE: plain assignment binds a second name to the same dict object, so
# `lstm_splits` and `splits` are one dictionary; the item assignment below
# therefore also replaces splits["train"] with the LSTM training set.
lstm_splits = splits
lstm_splits["train"] = lstm_set

In [31]:
# Save the LSTM training PMIDs, one per line, each followed by a newline
with open("data/text/lstm_train_pmids.txt", "w+") as fout:
    fout.writelines(str(pmid) + "\n" for pmid in lstm_splits["train"])

In [26]:
# Build the set once: testing membership against the lexicon list would scan
# the whole list for every token of every document
lexicon_set = set(lexicon)

# For each split, write one corpus file in which every line is a document
# reduced to its in-lexicon words (lines are joined without a trailing newline)
for split, split_ids in lstm_splits.items():
    print("Processing {} set".format(split))
    docs = []
    # Iterate over the ids yielded for this split rather than looking them up
    # again in a separate dictionary
    for i, pmid in enumerate(split_ids):
        if i % 100 == 0:
            print("   Processing {}th document".format(i))
        # Context manager closes each document file as soon as it has been read
        with open("data/text/corpus/{}.txt".format(pmid), "r") as text_file:
            words = text_file.read().split()
        words = [word for word in words if word in lexicon_set]
        docs.append(" ".join(words))
    # Open the output only once the documents are ready, and always close it
    with open("data/text/corpus_{}.txt".format(split), "w+") as corpus_file:
        corpus_file.write("\n".join(docs))

Processing train set
   Processing 0th document
   Processing 100th document
   Processing 200th document
   Processing 300th document
   Processing 400th document
   Processing 500th document
   Processing 600th document
   Processing 700th document
   Processing 800th document
   Processing 900th document
   Processing 1000th document
   Processing 1100th document
   Processing 1200th document
   Processing 1300th document
   Processing 1400th document
   Processing 1500th document
   Processing 1600th document
   Processing 1700th document
   Processing 1800th document
   Processing 1900th document
   Processing 2000th document
   Processing 2100th document
   Processing 2200th document
   Processing 2300th document
   Processing 2400th document
   Processing 2500th document
   Processing 2600th document
   Processing 2700th document
   Processing 2800th document
   Processing 2900th document
   Processing 3000th document
   Processing 3100th document
   Processing 3200th document
 

   Processing 26800th document
   Processing 26900th document
   Processing 27000th document
   Processing 27100th document
   Processing 27200th document
   Processing 27300th document
   Processing 27400th document
   Processing 27500th document
   Processing 27600th document
   Processing 27700th document
   Processing 27800th document
   Processing 27900th document
   Processing 28000th document
Processing dev set
   Processing 0th document
   Processing 100th document
   Processing 200th document
   Processing 300th document
   Processing 400th document
   Processing 500th document
   Processing 600th document
   Processing 700th document
   Processing 800th document
   Processing 900th document
Processing test set
   Processing 0th document
   Processing 100th document
   Processing 200th document
   Processing 300th document
   Processing 400th document
   Processing 500th document
   Processing 600th document
   Processing 700th document
   Processing 800th document
   Processi

In [None]:
for split, split_ids in lstm_splits.items():