# 1. Scrape abstracts from PubMed

In [19]:
import requests
import pickle
import xml.etree.ElementTree as ET
import csv
import sys
from typing import List
import math
import os

# Settings shared by the retrieval cells below.
# `toolname` and `email` are sent as the tool/email parameters of each E-utilities URL;
# `task` is used in the name of the pickled abstracts file.
task = 'mortality'
toolname = 'pubmed_retrieval'
email = 'qkrcogns2222@gmail.com'

# Create the output directory up front so later cells can write into it.
os.makedirs('./../data/abstracts/', exist_ok=True)

In [15]:
# Helper: run one ESearch request and return (total hit count, IDs on this page)
def _fetch_esearch_page(url, timeout=60):
    """Fetch a single ESearch result page.

    Args:
        url: Fully built ESearch URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Tuple ``(count, ids)`` where ``count`` is the integer in the response's
        ``Count`` element and ``ids`` is the list of ``Id`` texts in ``IdList``.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        RuntimeError: If the XML has no ``Count`` or ``IdList`` element.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    root = ET.fromstring(response.content)
    count_node = root.find("Count")
    id_list_node = root.find("IdList")
    # Compare against None explicitly: an Element without children is falsy.
    if count_node is None or id_list_node is None:
        raise RuntimeError(
            "Unexpected ESearch response: {!r}".format(response.content[:500]))
    return int(count_node.text), [x.text for x in id_list_node.findall("Id")]


# Function to retrieve articles from a specified database using a provided query string
# Query string can be a single word/phrase or a list of words/phrase separated using '_'
# Note that if a list of words/phrases is provided, this search will require every term
# to be present in any articles it retrieves (i.e., 'AND' operation for multiple-term lists)
# The tool name and email ID sent with each request come from the module-level
# `toolname` and `email` variables
def db_extract(db, query) -> set:
    """Collect the IDs of all articles in `db` matching the MeSH-term query.

    Args:
        db: Entrez database name placed in the ``db=`` URL parameter
            (the notebook uses ``"pmc"`` and ``"pubmed"``).
        query: One phrase, or several phrases separated by ``'_'``. Each phrase
            is quoted and tagged ``[MeSH Terms]``; phrases are joined with ``+``.

    Returns:
        A set of article ID strings (the text of each ``Id`` element).

    Raises:
        requests.HTTPError: If a request returns an HTTP error status.
        RuntimeError: If a response lacks the ``Count``/``IdList`` elements.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db={}&tool={}&email={}&retmax=100000&term=".format(db,toolname,email)

    terms = query.split('_')
    query = '+'.join(["%22"+x.replace(" ", "%20")+"%22[MeSH Terms]" for x in terms])
    print("Running query: {}".format(query))
    query_url = base_url + query

    count, first_page = _fetch_esearch_page(query_url)
    article_ids = set(first_page)
    print(len(article_ids))
    if count > 100000:
        # More hits than one page can hold: keep requesting with retstart.
        cur = 100000
        while cur < count:
            print("Running additional query: {}".format(query))
            _, id_list = _fetch_esearch_page(query_url + "&retstart={}".format(cur))
            if not id_list:
                # An empty page would leave `cur` unchanged and loop forever.
                print('No IDs returned at offset {}; stopping early'.format(cur))
                break
            cur += len(id_list)
            article_ids.update(id_list)
            print(len(article_ids))
        print('Retrieved {}/{} results'.format(cur, count))
    else:
        print('Retrieved {} results'.format(count))
    return article_ids

In [4]:
# Example querying procedure for mortality outcome
# TODO: Specify query terms according to your outcomes of interest

# PMC database: union of the two mortality-related queries
pmc_ids = (
    db_extract("pmc", "hospital mortality")
    | db_extract("pmc", "mortality_risk factors_humans")
)

# PubMed database: same two queries; both results are sets of ID strings
pubmed_ids = (
    db_extract("pubmed", "hospital mortality")
    | db_extract("pubmed", "mortality_risk factors_humans")
)


Running query: %22hospital%20mortality%22[MeSH Terms]
11454
Retrieved 11454 results
Running query: %22mortality%22[MeSH Terms]+%22risk%20factors%22[MeSH Terms]+%22humans%22[MeSH Terms]
13825
Retrieved 13825 results
Running query: %22hospital%20mortality%22[MeSH Terms]
48755
Retrieved 48755 results
Running query: %22mortality%22[MeSH Terms]+%22risk%20factors%22[MeSH Terms]+%22humans%22[MeSH Terms]
62570
Retrieved 62570 results


In [6]:
# Procedure to combine articles retrieved from both PMC and PubMed databases
# To do this combination, PMC article IDs need to be mapped to their corresponding PubMed IDs first
# to avoid double-counting of articles included in both databases
def combine_ids(pmc, pubmed, id_map_path='../data/PMC_id_map.csv'):
    """Merge PMC and PubMed ID collections into one de-duplicated set.

    Args:
        pmc: Iterable of PMC ID strings without the ``PMC`` prefix.
        pubmed: Iterable of PubMed ID strings.
        id_map_path: CSV mapping file with a header row. The fourth-from-last
            column is read as the PMC ID (its first three characters are
            stripped) and the third-from-last column as the PubMed ID.

    Returns:
        A set containing every PubMed ID, plus for each PMC ID either its
        mapped PubMed ID or, when no non-empty mapping exists, the PMC ID
        prefixed with ``'PMC'``.
    """
    id_dict = {}
    # Context manager guarantees the mapping file is closed after reading.
    with open(id_map_path, newline='') as map_file:
        reader = csv.reader(map_file)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) < 4:
                # Blank or malformed line: nothing to index.
                continue
            id_dict[row[-4][3:]] = row[-3]

    correct_pmc = set()
    for pmc_id in pmc:
        mapped = id_dict.get(pmc_id)
        if mapped:
            correct_pmc.add(mapped)
        else:
            # Missing or empty mapping: keep the PMC ID, marked by its prefix.
            correct_pmc.add('PMC' + pmc_id)
    return correct_pmc.union(pubmed)

In [8]:
# Merge the PMC and PubMed ID sets and persist the result for later cells.
FILENAME = 'mortality.ids.pck'
article_ids = combine_ids(pmc_ids, pubmed_ids)
print("Final collection for {} has {} articles".format("mortality", len(article_ids)))
# Use a context manager so the pickle file is flushed and closed deterministically.
with open(f'../data/abstracts/{FILENAME}', 'wb') as ids_file:
    pickle.dump(article_ids, ids_file)

Final collection for mortality has 99301 articles


In [10]:
# Partition IDs by the database they must be fetched from
# IDs carrying a 'PMC' prefix belong to PMC; everything else is treated as a PubMed ID
def split_abstracts(abstracts):
    """Return ``(pubmed_ids, pmc_ids)`` as two lists.

    PMC IDs are returned with their ``'PMC'`` prefix stripped, since the two
    lists already keep the databases apart.
    """
    pubmed_ids, pmc_ids = [], []
    for article_id in abstracts:
        if not article_id.startswith('PMC'):
            pubmed_ids.append(article_id)
            continue
        pmc_ids.append(article_id[3:])
    return pubmed_ids, pmc_ids

In [16]:
# Function to retrieve complete data for a batch of abstract IDs from a provided database
# Results will be retrieved in XML format
# The tool name and email ID sent with the request come from the module-level
# `toolname` and `email` variables
def retrieve_abstract_batch(id_batch, database, timeout=60):
    """Fetch the EFetch XML for one batch of IDs.

    Args:
        id_batch: Iterable of ID strings; joined with commas into ``id=``.
        database: Database name placed in the ``db=`` URL parameter.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang a long retrieval run.

    Returns:
        The raw response body (bytes) as returned by the server.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={}&id={}&retmode=xml&tool={}&email={}"
    query = base_url.format(database, ','.join(id_batch), toolname,email)
    response = requests.get(query, timeout=timeout)
    # Fail here with a clear HTTP error rather than later inside the XML parser.
    response.raise_for_status()
    return response.content

In [12]:
# Function to pull the per-article elements out of a retrieved XML document
def parse_abstract_xml(xml, database):
    """Parse `xml` and return the root's direct children that are articles.

    The article tag depends on `database`: ``'PubmedArticle'`` for ``'pubmed'``
    and ``'article'`` for ``'pmc'``. For ``'pmc'`` the whole parsed document is
    also printed.
    """
    root = ET.fromstring(xml)
    if database == 'pmc':
        print(ET.tostring(root))
    # PubMed and PMC name their article elements differently.
    article_tag = {'pubmed': 'PubmedArticle', 'pmc': 'article'}[database]
    return root.findall(article_tag)

In [21]:
# Helper: extract (pmid, abstract text, publication year) from one parsed article element
def _extract_article_fields(article):
    """Walk one article element and collect its PMID, abstract text and year.

    Returns:
        Tuple ``(pmid, text, year)``. ``pmid`` is the text of the first
        ``PMID`` element (``-99999`` if none is found); ``text`` is every
        ``AbstractText`` section followed by a newline; ``year`` is the integer
        from a ``Year`` element under ``PubDate`` (``-1000`` if none parses).
    """
    pmid = -99999
    sections = []
    year = -1000
    for element in article.iter():
        if element.tag == 'PMID':
            if pmid == -99999:
                pmid = element.text
        elif element.tag == 'AbstractText':
            # itertext() also collects text nested inside inline child elements,
            # which a plain `.text` would drop.
            section = ''.join(element.itertext())
            if section:
                sections.append(section + '\n')
        elif element.tag == 'PubDate':
            for subelement in element.iter('Year'):
                try:
                    year = int(subelement.text)
                except (TypeError, ValueError):
                    # Empty or non-numeric Year: keep the previous value.
                    pass
    return pmid, ''.join(sections), year


# Procedure that takes a large set of IDs, breaks it into manageable batches,
# queries the provided database and extracts abstracts from retrieved XMLs
# TODO: Change file path for storage if needed
def retrieve_all_abstracts(id_list, database):
    """Fetch, parse and pickle abstracts for every ID in `id_list`.

    Relies on the module-level names `task` (used in the output file name) and
    `error_log` (an open, writable text file) being defined by the caller.

    Args:
        id_list: Sliceable sequence of ID strings.
        database: Database name passed to the fetch and parse helpers.

    Returns:
        None. The ``{pmid: {'text': ..., 'year': ...}}`` dictionary is written
        to ``./../data/abstracts/<task>.<database>.texts_and_dates.pkl`` after
        every batch, so an interrupted run keeps what was already fetched.
    """
    max_query_size = 200  # PubMed only accepts 200 IDs at a time when retrieving abstract text
    print('Retrieval will require {} queries'.format(math.ceil(len(id_list)/float(max_query_size))))
    output_path = './../data/abstracts/{}.{}.texts_and_dates.pkl'.format(task, database)
    texts = {}
    for start in range(0, len(id_list), max_query_size):
        end = min(len(id_list), start+max_query_size)
        cur_ids = id_list[start:end]
        cur_abstracts = retrieve_abstract_batch(cur_ids, database)
        cur_parsed_abstracts = parse_abstract_xml(cur_abstracts, database)
        if len(cur_parsed_abstracts) != (end-start):
            error_log.write('Missing abstracts:\n')
            error_log.write(','.join(cur_ids)+'\n')
        # Only the current batch is processed; earlier batches are already in `texts`.
        for article in cur_parsed_abstracts:
            pmid, abstract_text, year = _extract_article_fields(article)
            texts[pmid] = {'text': abstract_text, 'year': year}
        # Report by number of IDs processed so that missing abstracts cannot
        # silence the progress output.
        if end % 1000 == 0 or end == len(id_list):
            print('Retrieved {} abstracts'.format(end))
        # Checkpoint after each batch; the context manager closes the file each time.
        with open(output_path, 'wb') as checkpoint:
            pickle.dump(texts, checkpoint)
    return

In [23]:
# Running text retrieval for IDs retrieved by outcome-specific queries
# `error_log` is opened with a context manager so it is closed even if retrieval
# raises part-way through; retrieve_all_abstracts reads it as a module-level name.
with open('retrieval_errors.txt', 'w') as error_log:
    pubmed_abs, pmc_abs = split_abstracts(article_ids)
    print('{} abstracts will be scraped from PubMed'.format(len(pubmed_abs)))
    print('{} abstracts will be scraped from PMC'.format(len(pmc_abs)))
    retrieve_all_abstracts(pubmed_abs, 'pubmed') #97094 for mortality
    # retrieve_all_abstracts(pmc_abs, 'pmc') # 2207 for mortality

97094 abstracts will be scraped from PubMed
2207 abstracts will be scraped from PMC
Retrieval will require 486 queries
Retrieved 1000 abstracts
Retrieved 2000 abstracts
Retrieved 3000 abstracts
Retrieved 4000 abstracts
Retrieved 5000 abstracts
Retrieved 6000 abstracts
Retrieved 7000 abstracts
Retrieved 8000 abstracts
Retrieved 9000 abstracts
Retrieved 10000 abstracts
Retrieved 11000 abstracts
Retrieved 12000 abstracts
Retrieved 13000 abstracts
Retrieved 14000 abstracts
Retrieved 15000 abstracts
Retrieved 16000 abstracts
Retrieved 17000 abstracts
Retrieved 18000 abstracts
Retrieved 19000 abstracts
Retrieved 20000 abstracts
Retrieved 21000 abstracts
Retrieved 22000 abstracts
Retrieved 23000 abstracts
Retrieved 24000 abstracts
Retrieved 25000 abstracts
Retrieved 26000 abstracts
Retrieved 27000 abstracts
Retrieved 28000 abstracts
Retrieved 29000 abstracts
Retrieved 30000 abstracts
Retrieved 31000 abstracts
Retrieved 32000 abstracts
Retrieved 33000 abstracts
Retrieved 34000 abstracts
Retrie

In [24]:
import torch

In [32]:
# Scratch tensor of uniform random values with shape (1, 512, 768)
a = torch.rand(1, 512, 768)

In [33]:
# View the vector at position [0, 0] of `a` as a single-row 2-D NumPy array
# (transposing a 1-D array is a no-op, so the reshape alone is enough)
a[0, 0].numpy().reshape(1, -1)

array([[1.91601276e-01, 3.28029037e-01, 3.31625104e-01, 1.40011311e-04,
        8.07687581e-01, 6.47511661e-01, 8.35479975e-01, 4.81885314e-01,
        4.84279513e-01, 7.26099908e-01, 1.71241462e-01, 8.74027491e-01,
        1.42327547e-02, 4.86201286e-01, 9.26299691e-01, 2.31533587e-01,
        5.05117595e-01, 7.43281960e-01, 1.34374678e-01, 5.88095188e-02,
        5.36613524e-01, 9.21352446e-01, 8.70655775e-02, 9.91213560e-01,
        2.27103531e-01, 6.59547865e-01, 7.06470728e-01, 5.58162868e-01,
        7.49839544e-02, 2.54178047e-01, 5.91780007e-01, 5.33542991e-01,
        9.43664551e-01, 6.63102746e-01, 5.26661038e-01, 7.24485874e-01,
        7.96494782e-01, 2.87002265e-01, 2.00895905e-01, 9.67690170e-01,
        8.11074555e-01, 7.22173154e-01, 5.55595458e-01, 5.11919439e-01,
        3.37115228e-01, 9.80640113e-01, 5.06812334e-03, 3.62220049e-01,
        2.70380795e-01, 4.07463610e-01, 3.55592787e-01, 9.70121324e-01,
        4.93561208e-01, 8.50137591e-01, 6.87897205e-01, 4.189478