## Scraping PubMed Central

In [1]:
import os
import re
from json import dump
from time import sleep

import numpy as np
import pandas as pd
import requests
import spacy
from Bio import Entrez
from requests.exceptions import ConnectionError
from spacy import displacy

### Setup

In [2]:
# NCBI credentials are read from the environment so that the API key is not
# stored in (or leaked through) the notebook. Set ENTREZ_API_KEY before
# starting the kernel; when it is unset the key stays None, which Entrez
# accepts (requests are then subject to the lower anonymous rate limit).
Entrez.email = os.environ.get('ENTREZ_EMAIL', 'david.hobson@mila.quebec')
Entrez.api_key = os.environ.get('ENTREZ_API_KEY')

In [12]:
# Shared spaCy pipeline: used below to split text into sentences and to build
# the Doc objects that the BERN2 disease entities are attached to.
nlp = spacy.load("en_core_web_sm")

### Function Definitions

In [3]:
def get_title(article_obj):
    """Return the article title stored under front -> article-meta -> title-group."""
    article_meta = article_obj['front']['article-meta']
    title_group = article_meta['title-group']
    return title_group['article-title']

def get_keywords(article_obj):
    """Return the keywords of the article's single keyword group.

    Returns an empty list when the article has no keyword group, and np.nan
    (after printing a warning) when it has more than one group, since that
    layout is not handled here.
    """
    kwd_groups = article_obj['front']['article-meta']['kwd-group']
    if len(kwd_groups) > 1:
        print('Length of keyword group is larger than 1 (probably need to re-implement this function)!')
        return np.nan
    if kwd_groups == []:
        return []
    return kwd_groups[0]['kwd']

def get_authors(article_obj):
    """Return the authors as a list of "given-names surname" strings.

    Contributors whose 'name' list is empty are skipped. np.nan is returned
    (after printing a warning) when the article has more than one contribution
    group or a contributor has more than one name entry.
    """
    contrib_groups = article_obj['front']['article-meta']['contrib-group']
    if len(contrib_groups) > 1:
        print('Length of author contribution group is larger than 1 (probably need to re-implement this function)!')
        return np.nan

    full_names = []
    for contributor in contrib_groups[0]['contrib']:
        names = contributor['name']
        if len(names) == 0:
            continue
        if len(names) > 1:
            print("Author has more than 1 name (need to re-implement this function)!")
            return np.nan
        name = names[0]
        full_names.append(f"{name['given-names']} {name['surname']}")
    return full_names

def get_abstract(article_obj):
    """Return the concatenated text of every abstract element, or np.nan if there is none."""
    abstract_sections = article_obj['front']['article-meta']['abstract']
    if len(abstract_sections) == 0:
        return np.nan
    return "".join(get_section_text(section) for section in abstract_sections)

def get_pmc_id(article_obj):
    """Return the PMC ID (PubMed Central ID) of the article object as a string.

    Scans the article-id elements for the one whose 'pub-id-type' attribute is
    'pmc'. Returns "N/A" when no such element exists.
    """
    for article_id in article_obj['front']['article-meta']['article-id']:
        # An article-id may carry no 'pub-id-type' attribute (or no attributes
        # at all); treat that as "not a PMC id" instead of raising.
        attributes = getattr(article_id, 'attributes', None) or {}
        if attributes.get('pub-id-type') == 'pmc':
            return str(article_id)
    return "N/A"

def get_pm_id(article_obj):
    """Return the PubMed ID of the article object as a string.

    Scans the article-id elements for the one whose 'pub-id-type' attribute is
    'pmid'. Returns "N/A" when no such element exists.
    """
    for article_id in article_obj['front']['article-meta']['article-id']:
        # An article-id may carry no 'pub-id-type' attribute (or no attributes
        # at all); treat that as "not a PubMed id" instead of raising.
        attributes = getattr(article_id, 'attributes', None) or {}
        if attributes.get('pub-id-type') == 'pmid':
            return str(article_id)
    return "N/A"

def get_full_text(article_obj):
    """Return the text of the article body, or the string "N/A" when the article has no 'body'."""
    has_body = 'body' in article_obj
    return get_section_text(article_obj['body']) if has_body else "N/A"

def get_section_text(text_body):
    """Flatten a section into text by recursively walking its sub-sections.

    The section title (if any) is emitted as a tab-prefixed line, followed by
    one line per paragraph in 'p', followed by the text of each nested 'sec'
    with a newline appended after it.
    """
    pieces = []
    if 'title' in text_body:
        pieces.append("\t" + text_body['title'] + '\n')
    if 'p' in text_body:
        pieces.extend(paragraph + '\n' for paragraph in text_body['p'])
    if 'sec' in text_body:
        pieces.extend(get_section_text(subsection) + '\n' for subsection in text_body['sec'])
    return "".join(pieces)

Process
- query ESearch and EFetch
    - get articles in batches with the PMC ids
    - if this fails, save those PMC IDs and skip to next ones
- Process those articles using BERN2
    - Handle errors as they come
- Save articles to a CSV
- Repeat

PubMed Functions

In [2]:
def query_pubmed(search_query, db='pmc', rettype='xml', retstart=0, retmax=100):
    """Fetch one page of article objects for `search_query`.

    Runs ESearch (sorted by relevance) to get up to `retmax` IDs starting at
    `retstart`, then EFetch for those IDs, retrying EFetch up to 20 times while
    it returns fewer records than IDs requested.

    Returns:
        (article_objs, ids_): the parsed EFetch records (possibly incomplete
        after the last retry, or [] when nothing could be fetched) and the
        list of IDs returned by ESearch.
    """
    print(f"Restart = {retstart}")

    # get the article IDs
    handle_esearch = Entrez.esearch(db=db, term=search_query, sort='relevance', retstart=retstart, retmax=retmax)
    try:
        results_esearch = Entrez.read(handle_esearch)
    finally:
        handle_esearch.close()

    ids_ = results_esearch['IdList']
    if len(ids_) == 0:
        # Nothing to fetch (e.g. retstart is past the last hit); do not send an empty EFetch.
        print("\t\tESearch returned no IDs")
        return [], ids_
    efetch_query = ",".join(ids_)

    # The last page of a search can hold fewer than retmax IDs, so success is
    # judged against the number of IDs actually requested.
    num_expected = len(ids_)

    # get the full text articles
    num_tries = 20
    article_objs = []
    for i in range(num_tries):
        try:
            handle_efetch = Entrez.efetch(id=efetch_query, db=db, rettype=rettype, retmax=retmax)
            try:
                article_objs = Entrez.read(handle_efetch, validate=False)
            finally:
                handle_efetch.close()
            if len(article_objs) == num_expected:
                break
            print(f"\tTry {i+1}) EFetch returned {len(article_objs)} records... Retrying")
        except ValueError:
            print("\t\tIDs could not be found")
            article_objs = []
            break
    return article_objs, ids_

# filter bad article objects returned from PubMed
def filter_article_objects(article_objs, ids_):
    """Split fetched articles into usable ones and the IDs that went wrong.

    An article is kept when its PMC ID equals the requested ID at the same
    position. Otherwise the requested ID is reported as bad. Requested IDs for
    which no record came back at all (fewer articles than IDs) are reported as
    bad too, so that every requested ID is accounted for.

    Returns:
        (good_articles, bad_ids)
    """
    good_articles, bad_ids = [], []
    for article_obj, requested_id in zip(article_objs, ids_):
        if get_pmc_id(article_obj) == requested_id:
            good_articles.append(article_obj)
        else:
            bad_ids.append(requested_id)
    # IDs past the last returned record were never matched against anything.
    bad_ids.extend(ids_[len(article_objs):])
    print(f"\tNumber bad articles: {len(bad_ids)}")
    return good_articles, bad_ids

BERN2 functions

In [3]:
# bern2_query_plain (defined below) sends a POST request to the given URL.
# BERN2_CHAR_LIMIT is the 5000 character limit on the text sent in one request.

BERN2_CHAR_LIMIT = 5000

def bern2_query_plain(text, url="http://bern2.korea.ac.kr/plain"):
    """POST `text` as a JSON payload to the BERN2 endpoint at `url` and return the decoded JSON reply."""
    payload = {'text': text}
    response = requests.post(url, json=payload)
    return response.json()

def get_mesh_id(disease_obj):
    """Return the first MeSH identifier of a BERN2 annotation, without its "mesh:" prefix.

    Returns 'N/A' when none of the annotation's ids starts with "mesh".
    """
    mesh_ids = (candidate for candidate in disease_obj['id'] if candidate.startswith('mesh'))
    first_mesh = next(mesh_ids, None)
    if first_mesh is None:
        return 'N/A'
    return first_mesh.replace("mesh:", "")

Search criteria function

In [4]:
def simple_search(doc, diseases):
    """Find adjacent disease pairs with the word "comorbid" between them.

    Walks the entities of every sentence in order. `disease_idx` counts the
    DISEASE entities seen so far, so that `diseases[disease_idx]` is the BERN2
    annotation matching the current DISEASE entity. A hit is recorded when a
    DISEASE entity is immediately followed (within the sentence's entity list)
    by another DISEASE entity, the text from the start of the first to the end
    of the second contains "comorbid", and the two resolve to different MeSH ids.

    Returns:
        list of dicts with keys 'span' (the sentence), 'comorbidity' (the two
        entity texts) and 'comorbidity_mesh' (the two MeSH ids).
    """
    hits = []
    if len(diseases) == 0:
        return hits

    disease_idx = 0
    for sent in doc.sents:
        ents = list(sent.ents)
        last_pos = len(ents) - 1
        for pos, ent in enumerate(ents):
            if ent.label_ != 'DISEASE':
                continue
            if pos < last_pos:
                following = ents[pos + 1]
                between = doc.text[ent.start_char:following.end_char]
                if following.label_ == 'DISEASE' and 'comorbid' in between:
                    mesh_1 = get_mesh_id(diseases[disease_idx])
                    mesh_2 = get_mesh_id(diseases[disease_idx + 1])
                    if mesh_1 != mesh_2:
                        hits.append({
                            'span': sent,
                            'comorbidity': (ent.text, following.text),
                            'comorbidity_mesh': (mesh_1, mesh_2)
                        })
            # every DISEASE entity consumes one entry of `diseases`
            disease_idx += 1
    return hits

Finding comorbidities and processing functions

In [5]:
def find_comorbidities_text(text, finding_scheme=simple_search):
    """Find comorbidity mentions in raw article text.

    The text is cleaned, reduced to the sentences that contain "comorbid",
    annotated with BERN2 disease entities, and finally passed to
    `finding_scheme(doc, diseases)`, whose result is returned.

    Returns an empty list without contacting BERN2 when no sentence mentions
    "comorbid".
    """
    text = clean_text(text)

    comorbid_sentences = get_comorbid_sentences(text)
    if not comorbid_sentences:
        # Nothing to annotate: avoid a pointless BERN2 request with empty text.
        return []

    doc, diseases = get_bern2_docs(" ".join(comorbid_sentences))

    return finding_scheme(doc, diseases)
    
def clean_text(text):
    """Clean article text for further processing.

    Steps:
      1. drop <xref>...</xref> references together with their content;
      2. drop remaining markup tags (opening, closing, self-closing);
      3. drop non-ASCII characters (BERN2 requires ASCII input);
      4. collapse every run of whitespace into one space and strip the ends.

    Only tag-shaped spans (a '<' immediately followed by an optional '/' and a
    letter) are removed, so comparisons such as "p < 0.05 ... > 2" in running
    text are left intact.
    """
    # references: element with content; self-closing <xref .../> is left to the tag pass
    text = re.sub(r"<xref\b[^>]*(?<!/)>.*?</xref>", "", text, flags=re.DOTALL)
    # formatting tags
    text = re.sub(r"</?[A-Za-z][^<>]*>", "", text)
    # convert to ASCII (BERN2 requires this)
    text = text.encode('ascii', errors='ignore').decode()
    # remove any extraneous formatting characters
    text = re.sub(r"\s+", " ", text).strip()
    return text

def get_comorbid_sentences(text):
    """Return the text of every sentence of `text` that contains "comorbid".

    Only sentence boundaries are needed here, so the attribute ruler,
    lemmatizer and named-entity recognizer are skipped. The NER pipe is named
    'ner' (lowercase); a differently-cased name would not disable it.
    """
    article_doc = nlp(text, disable=['attribute_ruler', 'lemmatizer', 'ner'])
    return [sent.text for sent in article_doc.sents if 'comorbid' in sent.text]
 
def get_bern2_docs(text):
    """Annotate `text` with BERN2 disease entities.

    Texts longer than BERN2_CHAR_LIMIT are split into sentence batches that are
    annotated one by one; the character offsets of each batch's annotations are
    shifted so they refer to the full text.

    Returns:
        (doc, diseases): the spaCy Doc with DISEASE entities set, and the list
        of BERN2 disease annotations that were kept. On any BERN2 error the Doc
        is returned unchanged together with an empty list.
    """
    doc = nlp(text)

    # split the doc if the text size is too large
    if len(doc.text) > BERN2_CHAR_LIMIT:
        bern2_result = {'annotations': []}
        idx_offset = 0
        for batch in batchify_doc(doc):
            if not batch:
                # an empty batch covers no text, so it must not move the offset
                continue
            if len(batch) <= BERN2_CHAR_LIMIT:
                _, batch_diseases = get_bern2_docs(batch)
                for disease in batch_diseases:
                    disease['span']['begin'] += idx_offset
                    disease['span']['end'] += idx_offset
                    bern2_result['annotations'].append(disease)
            else:
                # sometimes the spaCy Doc doesn't parse the sentences correctly and still gives
                # sentences greater than 5000 characters so we skip those
                print("Caution here!")
            # Advance past this batch and the separator that follows it. The
            # length comes from the batch itself, so it is also defined when the
            # batch was skipped.
            # NOTE(review): assumes batches are separated by exactly one
            # character in doc.text, which holds for whitespace-normalised text.
            idx_offset += len(batch) + 1
    else:
        # query BERN2 and error check
        try:
            bern2_result = bern2_query_plain(doc.text)
        except ValueError:
            print("Error in querying BERN2: BERN2 got a JSON decoding error")
            return doc, []
        if 'error_message' in bern2_result:
            print(f"Error in querying BERN2: {bern2_result['error_message']}")
            return doc, []
        elif 'annotations' not in bern2_result:
            print("Error in querying BERN2: No error but no annotations given.")
            return doc, []

    doc, diseases = add_diseases_to_doc(doc, bern2_result)

    return doc, diseases

def add_diseases_to_doc(doc, bern2_result):
    """Attach the BERN2 disease annotations to `doc` as DISEASE entities.

    Keeps annotations whose 'obj' is 'disease', whose mention does not contain
    "comorbid" and is not one of the excluded mentions. Annotations whose
    character span cannot be turned into a Span (char_span returns None) are
    dropped. Existing entities of the doc are left unmodified.

    Returns:
        (doc, diseases): the updated doc and the annotations that were kept,
        in the same order as the entities that were set.
    """
    excluded_mentions = ['medical', 'violent crimes']
    candidates = [
        annotation for annotation in bern2_result['annotations']
        if annotation['obj'] == 'disease'
        and 'comorbid' not in annotation['mention']
        and annotation['mention'] not in excluded_mentions
    ]

    # pair each annotation with its Span so both can be filtered together
    paired = [
        (annotation, doc.char_span(annotation['span']['begin'], annotation['span']['end'], label='DISEASE'))
        for annotation in candidates
    ]

    # sometimes BERN2 mis-identifies a disease. Remove those
    diseases = [annotation for annotation, span in paired if span is not None]
    disease_ents = [span for _, span in paired if span is not None]

    # update the entities with the diseases
    doc.set_ents(disease_ents, default='unmodified')

    return doc, diseases
    
def batchify_doc(doc, limit=BERN2_CHAR_LIMIT):
    """Group the sentences of `doc` into space-joined batches of at most `limit` characters.

    Sentences are never split: a single sentence longer than `limit` becomes a
    batch of its own that exceeds the limit, and the caller has to deal with it.
    No empty batches are returned.
    """
    batches = []
    curr_batch = ""
    for sent in doc.sents:
        candidate = f"{curr_batch} {sent.text}" if curr_batch else sent.text
        # Only flush when there is something to flush; otherwise a first
        # sentence longer than the limit would produce an empty batch.
        if curr_batch and len(candidate) > limit:
            batches.append(curr_batch)
            candidate = sent.text
        curr_batch = candidate
    if curr_batch:
        batches.append(curr_batch)
    return batches

In [6]:
def find_comorbidities(article_obj, sleep_time=0.1):
    """Find comorbidities in one article object.

    Args:
        article_obj: parsed article record.
        sleep_time: seconds to sleep after the text has been searched.

    Returns:
        A list with one dict per hit (title, PMC ID, sentence, the two disease
        mentions, their MeSH ids, and 'comorbidity_mesh': both MeSH ids sorted
        and joined by a space). Empty when nothing was found or when the
        article text is too long to process.
    """
    # extract the text once; it is needed for both the length check and the search
    full_text = get_full_text(article_obj)
    # texts above 1,000,000 characters are reported as too long for spaCy and skipped
    if len(full_text) > 1000000:
        print(f'\tArticle PMC ID {get_pmc_id(article_obj)} is too long for spaCy')
        return []

    comorbid_results = find_comorbidities_text(full_text)
    sleep(sleep_time)
    if len(comorbid_results) == 0:
        return []

    print(f"\t\tGot {len(comorbid_results)} results!")
    # the same for every hit of this article
    title = get_title(article_obj)
    pmc_id = get_pmc_id(article_obj)

    article_results = []
    for result in comorbid_results:
        mesh_1, mesh_2 = result['comorbidity_mesh'][0], result['comorbidity_mesh'][1]
        article_results.append({
            'title': title,
            'pmc_id': pmc_id,
            'sentence': result['span'].text,
            'disease_1': result['comorbidity'][0],
            'disease_2': result['comorbidity'][1],
            'mesh_1': mesh_1,
            'mesh_2': mesh_2,
            # order-independent key for the pair
            'comorbidity_mesh': " ".join(sorted([mesh_1, mesh_2]))
        })
    return article_results

In [7]:
# Default location of the raw result files (kept for backward compatibility).
_DEFAULT_OUTPUT_DIR = "/Users/David/Desktop/McGill Project/Datasets/Comorbidity/elderly_full_text_search/raw"


def save_results(results, missed_ids, bad_article_ids, no_comorbids_ids, retstart, output_dir=_DEFAULT_OUTPUT_DIR):
    """Write one chunk of results and its ID bookkeeping to disk.

    Args:
        results: list of hit dicts (one row each); written to results_{retstart}.csv.
        missed_ids: IDs that EFetch could not retrieve.
        bad_article_ids: IDs whose EFetch record could not be used.
        no_comorbids_ids: PMC IDs of articles without any hit.
        retstart: number used in the two file names.
        output_dir: directory to write into; created when it does not exist,
            so that a finished chunk is not lost to a missing folder.

    The bookkeeping is written to ids_{retstart}.json.
    """
    df = pd.DataFrame(results)
    good_ids = list(df['pmc_id'].unique()) if len(results) != 0 else []

    ids_scraped = {
        'Successfully Scraped': {
            'Comorbidities Found': good_ids,
            'No Comorbidities Found': no_comorbids_ids
        },
        'Errors': {
            'EFetch Retrieval Errors': missed_ids,
            'EFetch Parsing Errors': bad_article_ids,
        }
    }

    os.makedirs(output_dir, exist_ok=True)
    results_filepath = os.path.join(output_dir, f"results_{retstart}.csv")
    pmc_ids_filepath = os.path.join(output_dir, f"ids_{retstart}.json")

    df.to_csv(results_filepath, index=False)
    with open(pmc_ids_filepath, 'w') as f:
        dump(ids_scraped, f)

## Run the Search

In [11]:
# DEMO
def find_comorbidities_text(text, finding_scheme=simple_search):
    """Demo variant: clean `text` and return `get_bern2_docs`'s (doc, diseases) for it.

    NOTE(review): this redefinition replaces the earlier function of the same
    name for the rest of the kernel session. It skips the "comorbid" sentence
    filter, ignores `finding_scheme` and returns a (doc, diseases) tuple rather
    than a list of hits, so `find_comorbidities` must not be run while this
    version is active. Re-run the cell with the original definition first.
    """
    cleaned = clean_text(text)
    return get_bern2_docs(cleaned)

In [13]:
# Demo input: the first line mentions several diseases around the word "comorbidly",
# the second mentions a person, an organisation and one disease.
s = """
CVD often occurs comorbidly with diabetes, COPD, and other potential diseases 
Steve Jobs, the co-founder of Apple, had pancreatic cancer.
"""

In [14]:
# Annotate the demo text, render its entities with displaCy, and show the
# BERN2 disease annotations that were kept as the cell's output.
doc, diseases = find_comorbidities_text(s)
displacy.render(doc, style='ent')
diseases

[{'id': ['mesh:D002318'],
  'is_neural_normalized': False,
  'mention': 'CVD',
  'obj': 'disease',
  'prob': 0.9999243021011353,
  'span': {'begin': 0, 'end': 3}},
 {'id': ['mesh:D003920'],
  'is_neural_normalized': True,
  'mention': 'diabetes',
  'obj': 'disease',
  'prob': 0.9999961853027344,
  'span': {'begin': 33, 'end': 41}},
 {'id': ['mim:606963', 'mesh:D029424'],
  'is_neural_normalized': False,
  'mention': 'COPD',
  'obj': 'disease',
  'prob': 0.9999788999557495,
  'span': {'begin': 43, 'end': 47}},
 {'id': ['mim:260350', 'mesh:D010190'],
  'is_neural_normalized': False,
  'mention': 'pancreatic cancer',
  'obj': 'disease',
  'prob': 0.999997615814209,
  'span': {'begin': 119, 'end': 136}}]

In [29]:
# Search configuration: the query, and the range [start, num_articles) of
# result positions (retstart values) to process in this run.
search_query = 'open access[filter] elderly comorbidities'
num_articles = 10000
start = 9000

# Articles are requested batch_size at a time; results are written to disk
# whenever retstart reaches a multiple of save_interval.
batch_size = 100
save_interval = batch_size*10
# number of attempts per article when the connection fails during processing
max_tries_bern2 = 5

In [30]:
# Main scraping loop: fetch the search results page by page, look for
# comorbidities in every usable article, and write the accumulated results to
# disk every `save_interval` positions (and once more at the end).
missed_ids = []             # EFetch couldn't fetch articles
bad_article_ids = []        # EFetch results couldn't be parsed
no_comorbids_ids = []       # articles had no comorbidity information
results = []                # comorbidity results
for retstart in range(start, num_articles, batch_size):

    # save file if enough articles have been processed
    if retstart != start and (retstart % save_interval == 0):
        print(f"Saving file: {retstart}")
        save_results(results, missed_ids, bad_article_ids, no_comorbids_ids, retstart)
        results, missed_ids, bad_article_ids, no_comorbids_ids = [], [], [], []

    # the last batch may be smaller than batch_size
    retmax = min(batch_size, num_articles - retstart)
    article_objs, ids_ = query_pubmed(search_query, retstart=retstart, retmax=retmax)

    # skip if pubmed query returned nothing
    if article_objs == []:
        missed_ids += ids_
        continue

    # filter through article objects which aren't of good quality
    article_objs, bad_ids = filter_article_objects(article_objs, ids_)
    bad_article_ids += bad_ids

    # process the articles
    for i, article_obj in enumerate(article_objs):
        print(f"\tProcessing article {i + retstart}")
        for j in range(max_tries_bern2):
            try:
                comorbid_results = find_comorbidities(article_obj, sleep_time=0)
                if comorbid_results != []:
                    results += comorbid_results
                else:
                    no_comorbids_ids.append(get_pmc_id(article_obj))
                break
            except (ConnectionError, ConnectionResetError):
                print(f"\t\tRemote Disconnect Error\n\t\tSleeping and trying again (tries = {j+1})...")
                sleep(10)
        else:
            # Every attempt failed: the article appears in none of the saved ID
            # lists, so at least say so instead of dropping it silently.
            print(f"\t\tGiving up on article PMC ID {get_pmc_id(article_obj)} after {max_tries_bern2} failed tries")
save_results(results, missed_ids, bad_article_ids, no_comorbids_ids, num_articles)

Restart = 9000
		IDs could not be found
Restart = 9100
	Number bad articles: 2
	Processing article 9100
	Processing article 9101
		Got 1 results!
	Processing article 9102
	Processing article 9103
		Got 5 results!
	Processing article 9104
		Got 2 results!
	Processing article 9105
	Processing article 9106
	Processing article 9107
		Got 2 results!
	Processing article 9108
	Processing article 9109
		Got 1 results!
	Processing article 9110
		Got 1 results!
	Processing article 9111
	Processing article 9112
	Processing article 9113
	Processing article 9114
	Processing article 9115
		Got 6 results!
	Processing article 9116
		Got 4 results!
	Processing article 9117
		Got 2 results!
	Processing article 9118
		Got 2 results!
	Processing article 9119
	Processing article 9120
	Processing article 9121
		Got 3 results!
	Processing article 9122
	Processing article 9123
		Got 1 results!
	Processing article 9124
	Processing article 9125
	Processing article 9126
		Got 2 results!
	Processing article 9127


	Processing article 9557
	Processing article 9558
	Processing article 9559
	Processing article 9560
	Processing article 9561
		Got 1 results!
	Processing article 9562
		Got 1 results!
	Processing article 9563
	Processing article 9564
	Processing article 9565
		Got 1 results!
	Processing article 9566
	Processing article 9567
		Got 2 results!
	Processing article 9568
	Processing article 9569
	Processing article 9570
	Processing article 9571
	Processing article 9572
	Processing article 9573
		Got 6 results!
	Processing article 9574
		Got 2 results!
	Processing article 9575
	Processing article 9576
		Got 2 results!
	Processing article 9577
	Processing article 9578
	Processing article 9579
		Got 2 results!
	Processing article 9580
	Processing article 9581
		Got 9 results!
	Processing article 9582
	Processing article 9583
		Got 1 results!
	Processing article 9584
	Processing article 9585
		Got 2 results!
	Processing article 9586
	Processing article 9587
	Processing article 9588
	Processing a