## Downloading the PubMed Open Access Case Reports for corpus building

In [None]:
import pandas as pd
import os
import urllib, sys
from time import sleep
# for downloading files (works with ftp too)
import wget
# import the Entrez (pubmed API) from Biopython module
# see biopython reference here: http://biopython.org/DIST/docs/tutorial/Tutorial.html
from Bio import Entrez

## make original query for pubmed case reports

In [None]:
# running list of PMIDs; starts empty and is extended by the query cell
case_report_pmids = list()

In [None]:
##### set up my email in case they email me an alert
# (left blank here -- fill it in before running)
Entrez.email = ""
# build search in pubmed: English-language records indexed under the "humans"
# MeSH term with a publication date between 2007/01/01 and 2017/12/31
# NOTE(review): the term has no publication-type filter, so despite the
# variable names it is not limited to case reports -- confirm this is intended.
# NOTE(review): retstart is non-zero, so this call only returns a later page of
# the result set; presumably the cell was first run with retstart = 0 and the
# pages accumulated in case_report_pmids. retstart appears to be a zero-based
# offset, in which case the second page should start at 5000000 -- confirm.
query = Entrez.esearch(db="pubmed", retmax=5000000, retstart=5000001,
                       term='"2007/01/01"[PDat] : "2017/12/31"[PDat] AND English[lang] AND "humans"[MeSH Terms]')
try:
    result = Entrez.read(query)
finally:
    # always release the handle returned by esearch, even if parsing fails
    query.close()
# de-duplicate this page of IDs and append it to whatever was collected so far
case_report_pmids = case_report_pmids + list(set(result['IdList']))
print('Retrieved PMIDs for',len(case_report_pmids),'case reports')

## download the open access file lists (reference files for licensing)

In [None]:
# now get the index files of all papers in the open access subset
# just run this once because the files are big
# for research purposes we can use both the regular and commercial use datasets
# see details of the open access dataset here: https://www.ncbi.nlm.nih.gov/pmc/tools/openftlist/
url_oa_flist = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.csv'
url_oa_cu_flist = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_comm_use_file_list.csv'

# save the indices where the loading step reads them from ('reference_files/'),
# creating that directory first if it does not exist yet
reference_dir = 'reference_files/'
os.makedirs(reference_dir, exist_ok=True)

# download the file indices
# should just have to do this part once
print('Downloading the open acess file list...')
oa_file_dl = wget.download(url_oa_flist, reference_dir + 'oa_file_list.csv')
print('Downloading the commercial-use open acess file list...')
oa_cu_file_dl = wget.download(url_oa_cu_flist, reference_dir + 'oa_comm_use_file_list.csv')

## load reference files

In [None]:
# read the two PMC file lists from disk
# PMID is read as a generic object column rather than a numeric one
# (presumably so the IDs stay as text for the later merge -- confirm)
pmid_dtype = {'PMID': object}
df_oa_flist = pd.read_csv('reference_files/oa_file_list.csv', dtype=pmid_dtype)
df_oa_cu_flist = pd.read_csv('reference_files/oa_comm_use_file_list.csv', dtype=pmid_dtype)

In [None]:
# stack the open access and commercial use file lists into one table, row-wise;
# both are fair game for this research project
oa_frames = [df_oa_flist, df_oa_cu_flist]
pmc_oa_flist = pd.concat(oa_frames, axis=0)

## keep only the queried PMIDs that appear in the open access file lists

In [None]:
# keep only the file-list rows whose PMID was returned by the PubMed query
df_case_reports = pd.DataFrame({'PMID': case_report_pmids})
df_file_loc_case_reports = df_case_reports.merge(pmc_oa_flist, how='inner', on='PMID')
# collapse repeated accession IDs, then drop the first three characters of each
# (presumably the "PMC" prefix -- confirm against the file list)
unique_accessions = set(df_file_loc_case_reports['Accession ID'])
list_accession_ids_case_reports = list(unique_accessions)
list_pmc_id_case_reports = [accession[3:] for accession in list_accession_ids_case_reports]
print('Found',len(list_pmc_id_case_reports))

In [None]:
import pickle
# save the de-duplicated accession IDs to disk for later use
# NOTE(review): the file name says "pmc_ids" but the list written is the
# accession-ID list, not list_pmc_id_case_reports -- confirm which is wanted
with open("OA_ALL_pmc_ids.pkl", "wb") as pkl_file:
    # the with-block closes the file even if pickling raises
    pickle.dump(list_accession_ids_case_reports, pkl_file)

## extract the case reports as xml files

In [None]:
# urllib.error is imported explicitly: a bare "import urllib" does not
# guarantee the submodule used in the except clause below is loaded
import urllib.error

# make sure we have the right directory to put them in
file_dir = 'pmc_files_full/'
os.makedirs(file_dir, exist_ok=True)
# now get the files
# the API key is read from the NCBI_API_KEY environment variable instead of
# being hard-coded in the notebook; when it is unset the api_key parameter is
# left off the request entirely
my_ncbi_api_key = os.environ.get('NCBI_API_KEY', '')
api_key_param = ('&api_key=' + my_ncbi_api_key) if my_ncbi_api_key else ''
base_efetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=' # gets the full text xml
total_reports = len(list_pmc_id_case_reports)
print('Now downloading',total_reports,'manuscripts from PMC Open Access Dataset')
for i, pmc_id in enumerate(list_pmc_id_case_reports):
    # report progress every 100 manuscripts, overwriting the same output line
    if ((i+1) % 100 == 0):
        sys.stdout.write('Completed: %d of %d report downloads \r' % (i+1, total_reports))
        sys.stdout.flush()
    out_path = file_dir + pmc_id + '.xml'
    # skip manuscripts already on disk so re-running the cell resumes the
    # batch instead of fetching everything again
    if os.path.exists(out_path):
        continue
    sleep(2) # be respectful of NIH resources
    try:
        wget.download(base_efetch_url + pmc_id + api_key_param, out_path)
    except urllib.error.URLError as e:
        # URLError also covers HTTPError, so one failed request (HTTP error or
        # connection problem) is reported and the loop moves on
        print("Error: ", e, pmc_id)