In [1]:
import time
import logging
from Bio import Entrez
import numpy as np
import pandas as pd
from redcap import Project
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

### Get access keys from the setup file - config.py
import config
import pub_comp_lib

# ## !!** For DEV
from importlib import reload
# reload(name_of_module)
# ## !!** For DEV

# Wall-clock reference; later cells report elapsed minutes relative to it.
start_time = time.time()

# Write DEBUG-and-above records to test.log as "timestamp:level:message".
logging.basicConfig(
    format="%(asctime)s:%(levelname)s:%(message)s",
    level=logging.DEBUG,
    filename="test.log",
)

# Logger object handed to the pub_comp_lib helpers in the cells below.
logger = logging.getLogger(__name__)
# Alternative configuration kept for reference (filemode='w' rewrites app.log):
#logger = logging.basicConfig(filename='app.log', filemode='w',
#                             format='%(name)s - %(levelname)s - %(message)s')


In [2]:
# import pub_comp_lib.py libraries
from Bio import Entrez
from Bio.Entrez import efetch
from Bio.Entrez import read
import regex as re
import datetime
import time
import logging
import pandas as pd
import time
from bs4 import BeautifulSoup
import unicodedata

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [3]:
# Clean every configured grant in place. Per the original note,
# pub_comp_lib.clean removes whitespace and leading/trailing hyphens
# (clean_grant.py).
for x, raw_grant in enumerate(config.grant_list):
    config.grant_list[x] = pub_comp_lib.clean(raw_grant)

### Expand each cleaned grant into its search variations (34 per grant
### according to the original note) - grant_vari.py
variations = []
for grant in config.grant_list:
    variations += pub_comp_lib.variety(grant)

### Get pmids from pubmed for all grant variations
# Entrez client settings used by every query below.
# NOTE(review): the e-mail is still a placeholder address - confirm whether a
# real contact address should be configured before a production run.
Entrez.email = "Your.Name.Here@example.org"
Entrez.api_key = config.ncbi_api

# unique pmids collected across all grant variations
pmids = set()
# raw ESearch records for every variation that returned at least one hit
pubmed_results = []

# query pubmed for pmids associated with each grant variation
logger.info("Starting pubmed queries...")

for grant in variations:
    # up to three tries per variation, pausing 2 seconds after each failure
    for attempt in range(1, 4):
        try:
            handle = Entrez.esearch(db='pubmed', term=grant,
                                    field='grant', retmax=5000,
                                    usehistory='y', retmode='xml')
            try:
                record = Entrez.read(handle)
            finally:
                # release the handle even when parsing the response fails
                handle.close()
            hits = int(record['Count'])
            if hits > 0:
                pubmed_results.append(record)
                pmids.update(record['IdList'])
                logger.info('Entrez ESearch returns %i Ids for %s' % (hits, str(grant)))
                if hits > len(record['IdList']):
                    # Count is the total number of matches; IdList holds fewer
                    # ids than that, so part of the result set was not returned.
                    logger.warning('Only %i of %i Ids were returned for %s.'
                                   % (len(record['IdList']), hits, str(grant)))
        except Exception as err:
            logger.warning('Received error from server: %s' % str(err))
            logger.warning('Attempt %i of 3 for grant %s.' % (attempt,
                                                              str(grant)))
            time.sleep(2)
        else:
            # only report the grant as queried when a query actually succeeded
            logger.debug('Grant %s queried.' % str(grant))
            break
    else:
        # every attempt failed: make the gap visible instead of passing silently
        logger.error('Grant %s could not be queried after 3 attempts; '
                     'its pmids are missing from this run.' % str(grant))

logger.info('All grant queries complete.')

##### To test for PubMed downtime or blocked access...
#handle = Entrez.esearch(db='pubmed', term=grant[0], field='grant')
#record = Entrez.read(handle)
#handle.close()
#print(record)

### Update pmid set if a REDCap project is being used to track publications
if config.rc_token is not None and config.rc_uri is not None:
    # get the full pmid list from the REDCap project
    project = Project(config.rc_uri, config.rc_token)
    rc_pmids = project.export_records(fields=['pmid'], format='json')
    old_pmids = [rc_pmid['pmid'] for rc_pmid in rc_pmids]
    new_pmids = list(pmids.difference(old_pmids))   # newly discovered pmids
    pmids.update(old_pmids)
    if len(new_pmids) > 0:
        # Date of first discovery as local YYYY-MM-DD. time.strftime is used
        # because this notebook binds the name `datetime` twice (once to the
        # class, once to the module), so `datetime.date.today()` only works
        # for one of the two import orders.
        first_disc = [time.strftime("%Y-%m-%d")]*len(new_pmids)
        # create data frame of new_pmids with date of first discovery and
        # import into REDCap project
        first_discovered_frame = pd.DataFrame(np.column_stack([new_pmids, first_disc]),
                            columns=['pmid', 'first_discovered'])
        response = project.import_records(first_discovered_frame)
        # keep a trace of what REDCap answered instead of discarding it
        logger.info('Imported %i newly discovered pmids into REDCap: %s'
                    % (len(new_pmids), str(response)))


In [4]:
###################### PubMed Summary Section
# Build the publication-details table for every collected pmid.
pubs_frame = pub_comp_lib.summary(pmids, config.ncbi_api, variations)

# Publications that already have a pmcid get the compliant status '1';
# the others keep an empty status.
has_no_pmcid = pubs_frame['pmcid'] == ''
pubs_frame['pmc_status'] = np.where(has_no_pmcid, '', '1')

# Write the table to disk before blanks are converted.
pubs_frame.to_csv('batch_pubmed_frame.csv', index=False)

# Blank strings become NaN - makes column merging easier later on.
blank_cells = pubs_frame == ''
pubs_frame[blank_cells] = np.nan

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# Rename the id columns to the pmc_id / nihms_id names used by later cells.
pubs_frame = pubs_frame.rename(columns=dict(pmcid='pmc_id', nihmsid='nihms_id'))
###################### END PubMed Summary Section

In [5]:
##### To test for PubMed downtime or blocked access...
#handle = Entrez.esearch(db='pubmed', term=grant[0], field='grant')
#record = Entrez.read(handle)
#handle.close()
#print(record)

In [6]:
#reload(pub_comp_lib)

In [7]:
####################### Development section
# Parse the YYYY-MM-DD publication dates into datetimes.
pubs_frame['pub_date'] = pd.to_datetime(pubs_frame['pub_date'], format='%Y-%m-%d')

# Every pmid is sent to the PMC check; dump the list for inspection.
status_pmc = pubs_frame['pmid']
status_pmc.to_csv('pmids_to_check_in_pmc.csv', index=False)

# Pauses (delay is used with time.sleep, i.e. seconds) and scraper state
# shared with the pub_comp_lib helpers in the following cells.
delay, long_delay = 2, 7
start, scrape_more = 0, 1
pmc_rows = []
#test_pmc = ['31443893', '31280053', '30968993', '31568479', '31390231', '31161938']
####################### Development section

In [8]:
##########!!!!!!***** Dev interrupt and jump off point
# Log in, open My Bibliography, clear it, then add the first 51 pmids.
driver = pub_comp_lib.ncbi_login(config.ncbi_login, config.ncbi_pass)
my_bib_url = 'https://www.ncbi.nlm.nih.gov/myncbi/collections/mybibliography/'
driver.get(my_bib_url)
pub_comp_lib.clear_my_bib(driver, delay, logger)
pub_comp_lib.add_to_my_bib(driver, status_pmc[:51], delay, long_delay, logger)

# Reload the page, pausing before and after the reload.
time.sleep(delay)
driver.get(my_bib_url)
time.sleep(delay)

# Parse the rendered page and collect its citation blocks.
soup = BeautifulSoup(driver.page_source, 'lxml')
cites = soup.find_all('div', class_='citation-wrap')


#status_pmc = pubs_frame.pmid[0:1100]
##########!!!!!!*****

In [9]:
# Start from an empty result list, then show how many rows are stored and
# how many citations were parsed from the current page.
pmc_rows = []
print(len(pmc_rows), len(cites), sep='\n')

0
50


In [10]:
#pmc_rows.append(pub_comp_lib.scrape_citations(cites[0], 0, variations, driver, delay, long_delay, logger, start))

In [11]:
# Scrape every citation parsed from the current page; x is the citation's
# position on that page.
for x, cite in enumerate(cites):
    row = pub_comp_lib.scrape_citations(cite, x, variations, driver,
                                        delay, long_delay, logger, start)
    pmc_rows.append(row)
## pause before checking if there's another page of citations to scrape
time.sleep(delay)

In [12]:
# Inspect the scraped rows: container type, first four rows, total count.
for shown in (type(pmc_rows), pmc_rows[:4], len(pmc_rows)):
    print(shown)

<class 'list'>
[['31688933', '1', '', ['K76 AG060005'], 0], ['31919777', '1', 'UL1 TR002373', ['R01 HD047516', 'UL1 TR002373'], 1], ['29112024', '4', '', [], 2], ['32034257', '1', '', [], 3]]
50


In [13]:
# Read the onclick attribute of the pager link used to move to the next page.
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which no longer
# exists in Selenium 4.3+; with the broad except below, the resulting
# AttributeError would have been mistaken for "no next page".
next_link_xpath = '//*[@id="pager1"]/ul/li[4]/a'
try:
    next_button = driver.find_element(By.XPATH, next_link_xpath).get_attribute('onclick')
except Exception as err:
    # best effort: treat any lookup failure as "no next page", but log why
    logger.debug('Next-page link lookup failed: %s' % str(err))
    next_button = 'return false;'
print(next_button)

None


In [14]:
# Stop paging when the next link is disabled or the pager2 span reads '1'
# (presumably a single page of results - confirm against the page markup).
# Otherwise click through to the next page.
# find_element(By.XPATH, ...) is used because the find_element_by_xpath
# helper no longer exists in Selenium 4.3+.
on_last_page = (
    next_button == 'return false;'
    or driver.find_element(By.XPATH, '//*[@id="pager2"]/ul/li/span').get_attribute('innerText') == '1'
)
if on_last_page:
    scrape_more = 0
else:
    driver.find_element(By.XPATH, '//*[@id="pager1"]/ul/li[4]/a').click()
print(scrape_more)

1


In [15]:
# Re-check the scraped rows after the pager step: type, first four, count.
for shown in (type(pmc_rows), pmc_rows[:4], len(pmc_rows)):
    print(shown)

<class 'list'>
[['31688933', '1', '', ['K76 AG060005'], 0], ['31919777', '1', 'UL1 TR002373', ['R01 HD047516', 'UL1 TR002373'], 1], ['29112024', '4', '', [], 2], ['32034257', '1', '', [], 3]]
50


In [16]:
# Re-parse whatever page the browser currently shows and collect its
# citation blocks.
page_html = driver.page_source
soup = BeautifulSoup(page_html, 'lxml')
cites = soup.find_all('div', class_='citation-wrap')

In [17]:
# Rows scraped so far, then citations parsed from the current page.
print(len(pmc_rows), len(cites), sep='\n')

50
1


In [18]:
# Scrape the citations of the page parsed above and add them to pmc_rows;
# x is the citation's position on that page.
for x, cite in enumerate(cites):
    row = pub_comp_lib.scrape_citations(cite, x, variations, driver,
                                        delay, long_delay, logger, start)
    pmc_rows.append(row)
## pause before checking if there's another page of citations to scrape
time.sleep(delay)

In [19]:
# Inspect the rows again: type, first four rows, count, and the row at
# index 50 (the first one added after the first 50).
for shown in (type(pmc_rows), pmc_rows[:4], len(pmc_rows), pmc_rows[50]):
    print(shown)

<class 'list'>
[['31688933', '1', '', ['K76 AG060005'], 0], ['31919777', '1', 'UL1 TR002373', ['R01 HD047516', 'UL1 TR002373'], 1], ['29112024', '4', '', [], 2], ['32034257', '1', '', [], 3]]
51
['18788956', '1', 'KL2 RR025012', ['KL2 RR025012'], 0]


In [20]:
# Read the onclick attribute of the pager link used to move to the next page.
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which no longer
# exists in Selenium 4.3+; with the broad except below, the resulting
# AttributeError would have been mistaken for "no next page".
next_link_xpath = '//*[@id="pager1"]/ul/li[4]/a'
try:
    next_button = driver.find_element(By.XPATH, next_link_xpath).get_attribute('onclick')
except Exception as err:
    # best effort: treat any lookup failure as "no next page", but log why
    logger.debug('Next-page link lookup failed: %s' % str(err))
    next_button = 'return false;'
print(next_button)

return false;


In [21]:
# Stop paging when the next link is disabled or the pager2 span reads '1'
# (presumably a single page of results - confirm against the page markup).
# Otherwise click through to the next page.
# find_element(By.XPATH, ...) is used because the find_element_by_xpath
# helper no longer exists in Selenium 4.3+.
on_last_page = (
    next_button == 'return false;'
    or driver.find_element(By.XPATH, '//*[@id="pager2"]/ul/li/span').get_attribute('innerText') == '1'
)
if on_last_page:
    scrape_more = 0
else:
    driver.find_element(By.XPATH, '//*[@id="pager1"]/ul/li[4]/a').click()
print(scrape_more)

0


In [22]:
# Dev check: displays whether the scrape_more flag is still 1 (keep paging).
scrape_more == 1

False

In [24]:
# Dev check: number of pmids queued in status_pmc.
len(status_pmc)

2864

In [None]:
##!!!!!!!!! DEV ONLY 

##*** Dev only variables
start = count = 0
#status_pmc = pubs_frame.pmid[0:249]

##*** Dev only variables


# Batch bookkeeping for a single manual pass (the batch loop header below is
# disabled, so only the batch beginning at `start` is prepared).
pmc_rows = []
batch_size = 250
count = len(status_pmc)
delay, long_delay = 2, 7

#for start in range(0, count, batch_size):
end = min(start + batch_size, count)

# reload my bib, clear all publications and load pmids in status_pmc
my_bib_url = 'https://www.ncbi.nlm.nih.gov/myncbi/collections/mybibliography/'
driver.get(my_bib_url)
pub_comp_lib.clear_my_bib(driver, delay, logger)
pub_comp_lib.add_to_my_bib(driver, status_pmc[start:end], delay, long_delay, logger)
# reload my bib and begin scraping each page of citations
time.sleep(delay)
driver.get(my_bib_url)
time.sleep(delay)
scrape_more = 1

##!!!!!!!!! DEV ONLY 

In [None]:
##!!!!!!!!! DEV ONLY 

#### one pass of the 'next page' loop (the while header is disabled)
#while scrape_more == 1:
soup = BeautifulSoup(driver.page_source, 'lxml')
cites = soup.find_all('div', 'citation-wrap')
for x, cite in enumerate(cites):
    pmc_rows.append(pub_comp_lib.scrape_citations(cite, x, variations, driver, delay, long_delay, logger, start))
## check if there's another page of citations to scrape
time.sleep(delay)
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which no longer
# exists in Selenium 4.3+; with the broad except below, the resulting
# AttributeError would have been mistaken for "no next page".
next_link_xpath = '//*[@id="pager1"]/ul/li[4]/a'
try:
    next_button = driver.find_element(By.XPATH, next_link_xpath).get_attribute('onclick')
except Exception as err:
    # best effort: treat any lookup failure as "no next page", but log why
    logger.debug('Next-page link lookup failed: %s' % str(err))
    next_button = 'return false;'
# stop when the next link is disabled or the pager2 span reads '1'
# (presumably a single page of results - confirm against the page markup)
if next_button == 'return false;' or driver.find_element(By.XPATH, '//*[@id="pager2"]/ul/li/span').get_attribute('innerText') == '1':
    scrape_more = 0
else:
    driver.find_element(By.XPATH, next_link_xpath).click()
if count > 1000:
    # long pause plus a progress line for large pmid lists
    time.sleep(90)
    print('Working on ' + str(start) + ' : ' + str(end) + '    minutes-{0:0.1f}' .format((time.time()-start_time)/60))
##!!!!!!!!! DEV ONLY 

In [25]:
# Close the browser window used by the development cells above.
driver.close()

In [None]:
##!!!!!!!!! DEV ONLY
# Snapshot of scraper state: citations on the page, rows scraped, pmids
# queued, and the keep-paging flag.
for shown in (len(cites), len(pmc_rows), len(status_pmc), scrape_more):
    print(shown)
##!!!!!!!!! DEV ONLY

In [38]:
###################### PMC Section

#####  For updated PMC interface
# Log in through pub_comp_lib.ncbi_login, trying up to three times with a
# 2 second pause after each failure.
login_error = None
for attempt in range(1, 4):
    try:
        driver = pub_comp_lib.ncbi_login(config.ncbi_login, config.ncbi_pass)
    except Exception as err:
        login_error = err
        logger.warning('Unable to log into ERA Commons, attempt %i; error: %s' % (attempt, str(err)))
        time.sleep(2)
    else:
        break
else:
    # Without a fresh session the scraping cell would run against an
    # undefined or already-closed driver, so stop here with the real cause.
    raise RuntimeError('Unable to log into ERA Commons after 3 attempts') from login_error

# get list of publications during the current grant cycle with no pmcid to
# check on nihms status
pubs_frame['pub_date'] = pd.to_datetime(pubs_frame['pub_date'], format='%Y-%m-%d')
#config.start = datetime.strptime(config.start, '%m/%d/%Y')

#!!!!!!! how much of the pubmed results are going to pmc to check for compliance
#status_pmc = pubs_frame.pmid[(pubs_frame.pub_date > config.start) & (pubs_frame.pmc_id.isnull())]
#status_pmc = pubs_frame.pmid[pubs_frame.pmc_id.isnull()]
#status_pmc = pubs_frame.pmid[(pubs_frame.pub_date > config.start)]
# currently every pmid is checked; the filtered alternatives are kept above
status_pmc = pubs_frame.pmid

##!!!!!!!!! DEV ONLY csv file since I can't tell if all pmids are being sent to pmc
status_pmc.to_csv('pmids_to_check_in_pmc.csv', index=False)

In [39]:
####################### scrape pmc information in batches
# Rows returned by pub_comp_lib.scrape_citations, one per scraped citation.
pmc_rows = []
batch_size = 200
count = len(status_pmc)
delay = 2
long_delay = 7
my_bib_url = 'https://www.ncbi.nlm.nih.gov/myncbi/collections/mybibliography/'
next_link_xpath = '//*[@id="pager1"]/ul/li[4]/a'
pager_span_xpath = '//*[@id="pager2"]/ul/li/span'

try:
    for start in range(0, count, batch_size):
        end = min(count, start+batch_size)
        # reload my bib, clear all publications and load pmids in status_pmc
        driver.get(my_bib_url)
        pub_comp_lib.clear_my_bib(driver, delay, logger)
        pub_comp_lib.add_to_my_bib(driver, status_pmc[start:end], delay, long_delay, logger)
        # reload my bib and begin scraping each page of citations
        time.sleep(delay)
        driver.get(my_bib_url)
        time.sleep(delay)
        scrape_more = 1
        #### loop for each 'next page' click
        while scrape_more == 1:
            soup = BeautifulSoup(driver.page_source, 'lxml')
            cites = soup.find_all('div', 'citation-wrap')
            for x, cite in enumerate(cites):
                pmc_rows.append(pub_comp_lib.scrape_citations(cite, x, variations, driver, delay, long_delay, logger, start))
            ## check if there's another page of citations to scrape
            time.sleep(delay)
            # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
            # no longer exists in Selenium 4.3+; with the broad except below,
            # the resulting AttributeError would have silently ended paging.
            try:
                next_button = driver.find_element(By.XPATH, next_link_xpath).get_attribute('onclick')
            except Exception as err:
                # best effort: treat a failed lookup as "no next page", but log why
                logger.debug('Next-page link lookup failed: %s' % str(err))
                next_button = 'return false;'
            # stop when the next link is disabled or the pager2 span reads '1'
            # (presumably a single page of results - confirm against the markup)
            if next_button == 'return false;' or driver.find_element(By.XPATH, pager_span_xpath).get_attribute('innerText') == '1':
                scrape_more = 0
            else:
                driver.find_element(By.XPATH, next_link_xpath).click()
                # let the next page render before page_source is parsed again
                time.sleep(delay)
        if count > 1000:
            # long pause plus a progress line for large pmid lists
            time.sleep(90)
            print('Working on ' + str(start) + ' : ' + str(end) + ' and rows-' + str(len(pmc_rows)) + '    minutes-{0:0.1f}' .format((time.time()-start_time)/60))
finally:
    # close the browser even when a batch fails part-way through
    driver.close()



Working on 0 : 200 and rows-0    minutes-1004.2
Working on 200 : 400 and rows-0    minutes-1009.9
May need to wait longer for awards to load, got 0 for pmid: 31910032
May need to wait longer for awards to load, got 0 for pmid: 31942683
May need to wait longer for awards to load, got 0 for pmid: 30610655
May need to wait longer for awards to load, got 0 for pmid: 30509852
May need to wait longer for awards to load, got 0 for pmid: 31446976
May need to wait longer for awards to load, got 0 for pmid: 30635978
May need to wait longer for awards to load, got 0 for pmid: 31258977
May need to wait longer for awards to load, got 0 for pmid: 30946739
May need to wait longer for awards to load, got 0 for pmid: 30367828
May need to wait longer for awards to load, got 0 for pmid: 30686507
Well maybe there are no awards for pmid: 30686507
Might have failed on -cancel association- button for pmid: 30686507
May need to wait longer for awards to load, got 0 for pmid: 30679213
May need to wait longer f

UnboundLocalError: local variable 'search_field' referenced before assignment

In [36]:
##########!!!!!!***** Dev interrupt and jump off point
# Progress snapshot: rows scraped, pmids queued, and the last batch bounds.
for shown in (len(pmc_rows), len(status_pmc), start, end):
    print(shown)

##########!!!!!!*****

910
2864
2800
2864


In [None]:
## package the pmc_rows into a data frame
pmc_frame = pd.DataFrame(pmc_rows, columns=['pmid', 'pmc_status', 'pmc_tags', 'all_awards', 'pub_num'])
pmc_frame.to_csv('DEV_batch_pmc_status.csv', index=False)
# change blank values to nan- makes column merging easier
pmc_frame[pmc_frame == ''] = np.nan

# drop the pub_num column after the data frame has been written to csv file
# (axis is passed by keyword; pandas 2.0 no longer accepts it positionally)
pmc_frame = pmc_frame.drop('pub_num', axis=1)

# get list of publications with non-compliant pmc status to check on
# nihms status. Blank statuses were turned into NaN above, so they are picked
# up with isnull(); matching '' alone would never select them.
needs_nihms_check = (pmc_frame['pmc_status'].isin(['2', '3', '4', ''])
                     | pmc_frame['pmc_status'].isnull())
status_nihms = pmc_frame.pmid[needs_nihms_check]
###################### END PMC Section

In [None]:
############# NEW NIHMS Section

# Look up NIHMS information for the pmids selected in status_nihms.
# NOTE(review): the trailing 1 and 5 are positional arguments of
# pub_comp_lib.get_nihms whose meaning is not visible here - confirm.
nihms_frame = pub_comp_lib.get_nihms(status_nihms, config.ncbi_login, config.ncbi_pass, 1, 5)

nihms_frame.to_csv('DEV_batch_nihms_status.csv', index=False)
# change blank values to nan- makes column merging easier
# (the mask is built from nihms_frame itself; a mask taken from pmc_frame
# describes a different table and never marks this frame's blanks)
nihms_frame[nihms_frame == ''] = np.nan

################# END NEW NIHMS Section

In [None]:
########## join pmids, pmc, and nihms tables and upload into REDCap
# Outer joins on pmid keep every publication present in any of the tables.
pub_comp = pubs_frame.merge(pmc_frame, on='pmid', how='outer')
pub_comp = pub_comp.merge(nihms_frame, on='pmid', how='outer')


# Collapse each pair of merge-suffixed columns (nihms ids, pmc ids, pmc
# status) into one final column: the `_y` value is used wherever it is
# present and the `_x` value fills the remaining gaps.
for merged_col in ('nihms_id', 'pmc_id', 'pmc_status'):
    from_left = pub_comp[merged_col + '_x']
    from_right = pub_comp[merged_col + '_y']
    pub_comp[merged_col] = from_right.combine_first(from_left)

# remove the suffixed columns now that their values have been merged
for merged_col in ('nihms_id', 'pmc_id', 'pmc_status'):
    pub_comp = pub_comp.drop([merged_col + '_x', merged_col + '_y'], axis=1)

pub_comp['nihms_comm'] = ''

### Update REDCap project if one is being used to track publications
redcap_configured = config.rc_token is not None and config.rc_uri is not None
if redcap_configured and len(pmids) < 5000:
    pub_comp = pub_comp_lib.RC_update_status(pub_comp)
    success = project.import_records(pub_comp)

# write a copy to a .csv file
pub_comp.to_csv('batch_comprehensive_status.csv', index=False)

elapsed_minutes = (time.time()-start_time)/60
print('Publication compliance status update process complete in {0:0.1f} minutes' .format(elapsed_minutes))


In [7]:
# Small fixed pmid sample plus scraper state for development runs.
status_pmc = [
    '31443893', '31280053', '30968993',
    '31568479', '31390231', '31161938',
]
count = len(status_pmc)
delay, long_delay = 2, 7
start, end = 0, 50
scrape_more = 1
pmc_rows = []
####################### Development section

In [15]:
# Dev check: show the sum of the current batch bounds.
batch_bound_sum = start + end
print('I got ' + str(batch_bound_sum))

I got 50


In [7]:
# Twelve pmids (the six-item sample repeated twice) used to exercise the
# batching arithmetic in steps of five.
status_pmc = ['31443893', '31280053', '30968993',
              '31568479', '31390231', '31161938'] * 2
count = len(status_pmc)
batch_size = 5
start, lap = 0, 1

In [8]:
# Walk the batch start offsets and print one lap report per batch; `end` is
# the exclusive upper bound of the batch, capped at count.
for start in range(0, count, batch_size):
    end = min(start + batch_size, count)
    print("Lap= " + str(lap), "Start= " + str(start), "Count= " + str(count), sep="\n")
    lap += 1

Lap= 1
Start= 0
Count= 12
Lap= 2
Start= 5
Count= 12
Lap= 3
Start= 10
Count= 12
