In [1]:
# import pub_comp_src.py libraries
import time
import logging
from Bio import Entrez
import numpy as np
import pandas as pd
from redcap import Project
from datetime import datetime

### Get access keys from the setup file - config.py
import config
import pub_comp_lib

# ## !!** For DEV
from importlib import reload
# reload(name_of_module)
# ## !!** For DEV

start_time = time.time()

logging.basicConfig(
    filename="test.log", 
    level=logging.DEBUG, 
    format="%(asctime)s:%(levelname)s:%(message)s"
    )

In [2]:
# import pub_comp_lib.py libraries
from Bio import Entrez
from Bio.Entrez import efetch
from Bio.Entrez import read
import regex as re
import datetime
import time
import logging
import pandas as pd
import time
from bs4 import BeautifulSoup
import unicodedata

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [3]:
logger = logging.getLogger(__name__)
#logger = logging.basicConfig(filename='app.log', filemode='w',
#                             format='%(name)s - %(levelname)s - %(message)s')

# Clean every configured grant number in place via pub_comp_lib.clean
# (original note: removes whitespace and leading/trailing hyphens).
for idx, raw_grant in enumerate(config.grant_list):
    config.grant_list[idx] = pub_comp_lib.clean(raw_grant)

### Create list for each grant with 34 grant variations - grant_vari.py
variations = []
for grant in config.grant_list:
    variations.extend(pub_comp_lib.variety(grant))

### Get pmids from pubmed for all grant variations
# create variables for pubmed queries
# NOTE(review): this is still the placeholder address; replace it with a real
# contact e-mail before running against NCBI.
Entrez.email = "Your.Name.Here@example.org"
Entrez.api_key = config.ncbi_api

# unique pmids collected across every grant variation
pmids = set()
# raw ESearch records that returned at least one hit
pubmed_results = []

logger.info("Starting pubmed queries...")

for grant in variations:
    # Up to three attempts per grant; `break` on success, `else` runs only
    # when every attempt raised.
    for attempt in range(1, 4):
        try:
            handle = Entrez.esearch(db='pubmed', term=grant,
                                    field='grant', retmax=5000,
                                    usehistory='y', retmode='xml')
            try:
                record = Entrez.read(handle)
            finally:
                # close the handle even when parsing fails
                handle.close()
            hit_count = int(record['Count'])
            if hit_count > 0:
                pubmed_results.append(record)
                pmids.update(record['IdList'])
                logger.info('Entrez ESearch returns %i Ids for %s' % (hit_count, str(grant)))
                if len(record['IdList']) < hit_count:
                    # retmax caps the returned list, so some ids were not collected
                    logger.warning('Only %i of %i Ids were returned for %s.'
                                   % (len(record['IdList']), hit_count, str(grant)))
            break
        except Exception as err:
            logger.warning('Received error from server: %s' % str(err))
            logger.warning('Attempt %i of 3 for grant %s.' % (attempt,
                                                              str(grant)))
            time.sleep(2)
    else:
        logger.error('All 3 attempts failed for grant %s; its pmids were not collected.' % str(grant))
    logger.debug('Grant %s queried.' % str(grant))

logger.info('All grant queries complete.')

In [4]:
##### To test for PubMed downtime or blocked access...
#handle = Entrez.esearch(db='pubmed', term=grant[0], field='grant')
#record = Entrez.read(handle)
#handle.close()
#print(record)

In [5]:
### Update pmid set if a REDCap project is being used to track publications
if config.rc_token is not None and config.rc_uri is not None:
    # get the full pmid list from the REDCap project
    project = Project(config.rc_uri, config.rc_token)
    rc_pmids = project.export_records(fields=['pmid'], format='json')
    old_pmids = [rc_pmid['pmid'] for rc_pmid in rc_pmids]

    # pmids found on PubMed that REDCap does not know about yet
    new_pmids = list(pmids.difference(old_pmids))
    # from here on `pmids` also contains everything already tracked in REDCap
    pmids.update(old_pmids)

    if len(new_pmids) > 0:
        # Date of first discovery. `time.strftime` is used instead of
        # `datetime.today()` because the later `import datetime` cell rebinds
        # the name `datetime` to the module, which has no `today` attribute.
        discovered_on = time.strftime("%Y-%m-%d")
        # one row per new pmid, stamped with the date of first discovery,
        # imported into the REDCap project
        first_discovered_frame = pd.DataFrame({
            'pmid': [str(new_pmid) for new_pmid in new_pmids],
            'first_discovered': [discovered_on] * len(new_pmids),
        }, columns=['pmid', 'first_discovered'])
        response = project.import_records(first_discovered_frame)

In [6]:
###################### PubMed Summary Section
# Build the publication table for the collected pmids via pub_comp_lib.summary.
pubs_frame = pub_comp_lib.summary(pmids, config.ncbi_api, variations)

# Rows whose pmcid is not the empty string get pmc_status '1'; the rest stay blank.
has_pmcid = pubs_frame['pmcid'] != ''
pubs_frame['pmc_status'] = np.where(has_pmcid, '1', '')

# Snapshot the table to disk before blanks are converted.
pubs_frame.to_csv('batch_pubmed_frame.csv', index=False)

# Empty strings become NaN so later column merging is easier.
blank_cells = pubs_frame == ''
pubs_frame[blank_cells] = np.nan

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# Align id column names with the ones used by the PMC / NIHMS tables.
id_column_names = {'pmcid': 'pmc_id', 'nihmsid': 'nihms_id'}
pubs_frame = pubs_frame.rename(columns=id_column_names)
###################### END PubMed Summary Section

In [7]:
# log in function for era_commons linked sites
#### might ditch this one...

def pacm_login(login, password):
    """Log into the NIH auth form and then the PMC PACM utility in Chrome.

    Steps, as driven below: submit `login`/`password` on the NIH login form,
    open the PACM page, follow its login link, switch into the 'loginframe'
    frame, pick the eRA option and submit the credentials again.

    Parameters:
        login: user name typed into the 'USER' fields.
        password: password typed into the 'PASSWORD' fields.

    Returns:
        The selenium Chrome driver left on the page reached after the final
        click. The caller is responsible for quitting it.

    Raises:
        Any selenium exception raised along the way; the browser is quit
        before the exception propagates so no Chrome process is left behind.
    """
    # Headless mode is intentionally left disabled here (visible browser).
    options = Options()
    #options.headless = True
    driver = webdriver.Chrome(options=options)
    try:
        driver.get('https://auth.nih.gov/CertAuthV2/forms/NIHPivOrFormLogin.aspx')
        driver.set_window_size(1440, 900)
        driver.find_element(By.ID, 'USER').send_keys(login)
        driver.find_element(By.ID, 'PASSWORD').send_keys(password)
        time.sleep(2)
        driver.find_element(By.ID, 'Image2').click()

        driver.get('https://www.ncbi.nlm.nih.gov/pmc/utils/pacm/')

        driver.find_element(By.XPATH, '//*[@id="content"]/div/ul/li/a').click()
        driver.switch_to.frame('loginframe')
        driver.find_element(By.XPATH, '//*[@id="era"]/img').click()
        # Empty the user box before typing. The original called an undefined
        # `clear_text` helper here; WebElement.clear() is the built-in way.
        driver.find_element(By.NAME, 'USER').clear()
        driver.find_element(By.NAME, 'USER').send_keys(login)
        driver.find_element(By.NAME, 'PASSWORD').send_keys(password)
        driver.find_element(By.XPATH, '//*[@id="Image2"]').click()
    except Exception:
        driver.quit()
        raise
    return driver

In [8]:
# log in function for era_commons linked sites

def ncbi_login(login, password):
    """Open headless Chrome and sign in to NCBI My Bibliography via the NIH login.

    Loads the My Bibliography page, switches into the 'loginframe' frame,
    clicks the element with id 'nih', waits five seconds, fills the 'USER'
    and 'PASSWORD' fields and clicks the confirmation button.

    Parameters:
        login: currently unused (see note).
        password: currently unused (see note).

    NOTE(review): the credentials typed into the form come from
    `config.era_login` / `config.era_pass`, not from the arguments, exactly as
    in the original. Callers pass `config.ncbi_login` / `config.ncbi_pass`;
    confirm which pair is intended before wiring the parameters through.

    Returns:
        The selenium Chrome driver after the login click. The caller is
        responsible for quitting it.

    Raises:
        Any selenium exception raised during the flow; the browser is quit
        first so that retrying callers do not accumulate Chrome processes.
    """
    options = Options()
    # '--headless' as a plain argument works on both Selenium 3 and 4
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.set_window_size(1440, 900)
        driver.get('https://www.ncbi.nlm.nih.gov/myncbi/collections/mybibliography/')
        driver.switch_to.frame(driver.find_element(By.ID, 'loginframe'))
        driver.find_element(By.ID, 'nih').click()
        # fixed pause before the credential fields are looked up
        time.sleep(5)
        driver.find_element(By.ID, 'USER').send_keys(config.era_login)
        driver.find_element(By.ID, 'PASSWORD').send_keys(config.era_pass)
        driver.find_element(By.XPATH, '//*[@id="CredSelectorNotice"]/div/button').click()
    except Exception:
        driver.quit()
        raise
    return driver

In [9]:
###################### PMC Section

#####  For updated PMC interface
# log into era commons: up to three attempts, two seconds apart
driver = None
for attempt in range(1, 4):
    try:
        driver = ncbi_login(config.ncbi_login, config.ncbi_pass)
        break
    except Exception as err:
        logger.warning('Unable to log into ERA Commons, attempt %i; error: %s' % (attempt, str(err)))
        time.sleep(2)

if driver is None:
    # Fail here with a clear message instead of a NameError further down.
    raise RuntimeError('Unable to log into ERA Commons after 3 attempts; PMC status cannot be scraped.')

pmc_rows = []
batch_size = 200

try:
    # get list of publications during the current grant cycle with no pmcid
    # to check on nihms status
    pubs_frame['pub_date'] = pd.to_datetime(pubs_frame['pub_date'], format='%Y-%m-%d')
    #config.start = datetime.strptime(config.start, '%m/%d/%Y')

    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    status_pmc = pubs_frame.pmid[(pubs_frame.pub_date > config.start) & (pubs_frame.pmc_id.isnull())]
    #status_pmc = pubs_frame.pmid[pubs_frame.pmc_id.isnull()]
    #status_pmc = pubs_frame.pmid

    ####################### scrape pmc information in batches
    count = len(status_pmc)

    for start in range(0, count, batch_size):
        end = min(count, start+batch_size)

        # reload my bib, clear all publications and load this batch of pmids
        driver.get('https://www.ncbi.nlm.nih.gov/myncbi/collections/mybibliography/')
        pub_comp_lib.clear_my_bib(driver, 3, logger)
        # .iloc keeps the batch slice positional regardless of index labels
        pub_comp_lib.add_to_my_bib(driver, status_pmc.iloc[start:end], 2, 7, logger)

        # reload my bib and begin scraping each page of citations
        time.sleep(2)
        driver.get('https://www.ncbi.nlm.nih.gov/myncbi/collections/mybibliography/')
        time.sleep(2)

        #### loop for each 'next page' click
        while True:
            soup = BeautifulSoup(driver.page_source, 'lxml')
            cites = soup.find_all('div', 'citation-wrap')
            for x, cite in enumerate(cites):
                pmc_rows.append(pub_comp_lib.scrape_citations(cite, x, variations, driver, 2, 7, logger))

            ## check if there's another page of citations to scrape
            time.sleep(2)

            try:
                next_button = driver.find_element(By.XPATH, '//*[@id="pager1"]/ul/li[4]/a').get_attribute('onclick')
            except Exception as err:
                # no 'next' link found: treat it like a disabled one
                next_button = 'return false;'

            # stop when the next link is disabled or the pager span reads '1'
            if next_button == 'return false;' or driver.find_element(By.XPATH, '//*[@id="pager2"]/ul/li/span').get_attribute('innerText') == '1':
                break
            driver.find_element(By.XPATH, '//*[@id="pager1"]/ul/li[4]/a').click()
finally:
    # always end the browser session, including when scraping raised
    driver.quit()

## package the pmc_rows into a data frame
pmc_frame = pd.DataFrame(pmc_rows, columns=['pmid', 'pmc_status', 'pmc_tags', 'all_awards'])
pmc_frame.to_csv('DEV_batch_pmc_status.csv', index=False)
# change blank values to nan- makes column merging easier
pmc_frame[pmc_frame == ''] = np.nan

# get list of publications with non-compliant pmc status to check on
# nihms status
status_nihms = pmc_frame.pmid[pmc_frame['pmc_status'].isin(['2', '3', '4'])]
###################### END PMC Section

In [10]:
len(pmc_frame)

66

In [11]:
def scrape_nihms_status(driver, nihms, pmid, delay, long_delay):
    """Load one NIHMS submission page and extract its status fields.

    Parameters:
        driver: selenium driver used to fetch the page.
        nihms: NIHMS id string appended to the submission URL.
        pmid: copied unchanged into the first slot of the returned row.
        delay: seconds to sleep after requesting the page.
        long_delay: accepted but not used by this function.

    Returns:
        A 9-item list: [pmid, nihms, pmc, reviewer, files_uploaded,
        initial_approval, nihms_conversion, final_approval, pmcid_assigned].
        Values that are not found on the page are empty strings.
    """
    driver.get('https://www.nihms.nih.gov/submission/' + nihms + '/')
    time.sleep(delay)
    soup = BeautifulSoup(driver.page_source, 'lxml')

    # The third 'usa-grid' div is searched for the PMCID and reviewer entries.
    info_html = str(soup.find_all('div', 'usa-grid')[2])

    pmc_match = re.search('PMCID:</dt>\n<dd>([0-9].*?)</dd', info_html)
    pmc = '' if pmc_match is None else pmc_match.group(1)

    reviewer_match = re.search('Reviewer:</dt>\n<dd>([A-Za-z].*?)</dd', info_html)
    reviewer = '' if reviewer_match is None else reviewer_match.group(1)

    # From the first 'progress' div, take the first token inside each
    # parenthesised <span> (presumably the date of each progress stage).
    progress_html = str(soup.find_all('div', 'progress')[0])
    stage_values = re.findall('<span>\\((.*?)\\ .*?\\)</span>', progress_html)

    # Exactly five stage slots: extra values are ignored, missing ones stay ''.
    stage_slots = (stage_values + [''] * 5)[:5]

    return [pmid, nihms, pmc, reviewer] + stage_slots

In [12]:
#### time to get into NIHMS and scrape the current stage in the NIHMS process
def get_nihms(pmids, login, password, delay, long_delay):
    """Look up each pmid in NIHMS and collect its submission status.

    Parameters:
        pmids: iterable of pmids to search for.
        login, password: forwarded to `ncbi_login`.
        delay, long_delay: forwarded to `scrape_nihms_status`.

    Returns:
        A DataFrame with one row per pmid and the columns pmid, nihms_id,
        pmc_id, reviewer, files_uploaded, initial_approval, nihms_conversion,
        final_approval, pmcid_assigned. When the search reports no
        manuscript, everything but pmid is blank; when no NIHMS id can be
        parsed from the result table, nihms_id is the string 'error'.

    Raises:
        Any exception from the login or the scrape; the browser is always
        quit before it propagates.
    """
    columns = ['pmid', 'nihms_id', 'pmc_id', 'reviewer', 'files_uploaded',
               'initial_approval', 'nihms_conversion', 'final_approval',
               'pmcid_assigned']
    # blank values for the seven columns after nihms_id
    blank_details = [''] * 7
    rows = []

    # log into ncbi
    driver = ncbi_login(login, password)
    try:
        # navigate to nihms since already logged in to ncbi
        driver.get('https://www.nihms.nih.gov/submission/')
        driver.find_element(By.XPATH, '//*[@id="react-app"]/div/div/div[2]/div[3]/a').click()

        for pmid in pmids:
            # str() so integer pmids do not break the URL concatenation
            search_url = 'https://www.nihms.nih.gov/submission/search/?q=' + str(pmid)
            driver.get(search_url)

            # text of the search result table
            html = driver.find_element(By.CLASS_NAME, 'usa-table-borderless').get_attribute('innerText')

            if re.search('No manuscripts found', html) is not None:
                row = [pmid, ''] + blank_details
            else:
                # first run of text starting with a digit and ending at a tab
                id_match = re.search('([0-9].*?)\t', html)
                if id_match is None:
                    row = [pmid, 'error'] + blank_details
                else:
                    row = scrape_nihms_status(driver, id_match.group(1), pmid, delay, long_delay)

            rows.append(row)
    finally:
        # end the browser session even when a lookup raised
        driver.quit()

    ## package the rows into a data frame
    nihms_frame = pd.DataFrame(rows, columns=columns)

    return nihms_frame

In [13]:
############# NEW NIHMS Section

# Scrape NIHMS status for the pmids flagged in the PMC section (delay=1, long_delay=5).
nihms_frame = get_nihms(status_nihms, config.ncbi_login, config.ncbi_pass, 1, 5)

nihms_frame.to_csv('DEV_batch_nihms_status.csv', index=False)
# change blank values to nan- makes column merging easier.
# The mask must come from nihms_frame itself; the original used pmc_frame here,
# which left the blanks in place so they later overrode real ids in the merge.
nihms_frame[nihms_frame == ''] = np.nan

################# END NEW NIHMS Section

In [14]:
########## join pmids, pmc, and nihms tables and upload into REDCap
# Outer-join the three tables on pmid; columns present on both sides of a join
# come back with pandas' default '_x' (left) / '_y' (right) suffixes.
pub_comp = pubs_frame.merge(pmc_frame, on='pmid', how='outer')
pub_comp = pub_comp.merge(nihms_frame, on='pmid', how='outer')

# Collapse each suffixed pair into one final column. The '_y' value wins when
# present; otherwise the '_x' value is used.
suffixed_pairs = []
for final_col in ('nihms_id', 'pmc_id', 'pmc_status'):
    left_col = final_col + '_x'
    right_col = final_col + '_y'
    pub_comp[final_col] = pub_comp[left_col].combine_first(pub_comp[right_col])
    pub_comp[final_col] = pub_comp[right_col].combine_first(pub_comp[final_col])
    suffixed_pairs.extend([left_col, right_col])

# the suffixed source columns are no longer needed once merged
pub_comp = pub_comp.drop(suffixed_pairs, axis=1)

# write a copy to a .csv file
pub_comp.to_csv('batch_comprehensive_status.csv', index=False)

In [15]:
pub_comp.head()

Unnamed: 0,pmid,nctid,pub_title,authors,authors_lnames,authors_initials,orcid,authors_affil,pub_date,journal_short,...,all_awards,reviewer,files_uploaded,initial_approval,nihms_conversion,final_approval,pmcid_assigned,nihms_id,pmc_id,pmc_status
0,20551832,,DNA vaccine encoding prostatic acid phosphatas...,"Jordan T Becker, Brian M Olson, Laura E Johnso...","Becker, Olson, Johnson, Davies, Dunphy, McNeel","JT, BM, LE, JG, EJ, DG",", , , , ,","Department of Medicine, University of Wisconsi...",2010-07-01,J Immunother,...,,,,,,,,269166.0,3045767,1
1,24615777,"NCT00710528, NCT01090414","Idelalisib, an inhibitor of phosphatidylinosit...","Jennifer R Brown, John C Byrd, Steven E Coutre...","Brown, Byrd, Coutre, Benson, Flinn, Wagner-Joh...","JR, JC, SE, DM, IW, ND, SE, BS, C, HK, DM, S, ...",", , , , , , , , , , , , , , , , , ,","Dana Farber Cancer Institute, Boston, MA;, The...",2014-05-29,Blood,...,,,,,,,,,4123414,1
2,19874516,,Feasibility of an evidence-based medicine educ...,"David A Feldstein, Scott Mead, Linda B Manwell","Feldstein, Mead, Manwell","DA, S, LB",", ,",University of Wisconsin School of Medicine and...,2009-11-01,Med Educ,...,,,,,,,,170078.0,2827765,1
3,31324799,,A novel rapamycin analog is highly selective f...,"Katherine H Schreiber, Sebastian I Arriola Ape...","Schreiber, Arriola Apelo, Yu, Brinkman, Velard...","KH, SI, D, JA, MC, FA, CY, EL, KA, DS, D, R, S...",", , , , , , http://orcid.org/0000-0002-6033-94...","Buck Institute for Research on Aging, Novato, ...",2019-07-19,Nat Commun,...,,,,,,,,,6642166,1
4,28711370,,Assessing the risk of hypercalcemic crisis in ...,"Andrew J Lowell, Norah M Bushman, Xing Wang, Y...","Lowell, Bushman, Wang, Ma, Pitt, Sippel, Schne...","AJ, NM, X, Y, SC, RS, DF, RW",", , , , , , ,","Department of Surgery, University of Wisconsin...",2017-09-01,J Surg Res,...,,,,,,,,886516.0,5603402,1


In [None]:


# Push the merged table to REDCap only when both the token and the URI are
# configured and fewer than 5000 pmids are being tracked.
redcap_configured = config.rc_token is not None and config.rc_uri is not None
if redcap_configured and len(pmids) < 5000:
    pub_comp = pub_comp_lib.RC_update_status(pub_comp)
    success = project.import_records(pub_comp)

# Report total wall-clock time since `start_time` was set in the first cell.
elapsed_minutes = (time.time()-start_time)/60
print('Publication compliance status update process complete in {0:0.1f} minutes' .format(elapsed_minutes))

In [None]:
# Debug probe: evaluates to True when the text of the '#pager2' span reads '1',
# the same test the PMC scrape loop uses as one of its stop conditions.
# Needs a live `driver` session on the My Bibliography page.
driver.find_element_by_xpath('//*[@id="pager2"]/ul/li/span').get_attribute('innerText') == '1'


In [None]:
###################### Dev Report Integration and Upload Section

# work through matching up PubMed, PMC, and NIHMS tables for upload into REDCap tracking system

###################### END Report Integration and Upload Section

In [None]:
########### !!!!!! DEV ONLY BLOCK
# Opens a fresh logged-in browser and trims the NIHMS work list to five
# entries so the scrape helpers can be tried by hand.
driver = ncbi_login(config.ncbi_login, config.ncbi_pass)
# .iloc keeps both the slice and the single lookup positional; status_nihms is
# a filtered Series, so the label 0 is not guaranteed to exist.
status_nihms = status_nihms.iloc[0:5]
print(status_nihms)
pmid = status_nihms.iloc[0]
delay = 3
long_delay = 7
############# !!!!! END DEV ONLY BLOCK



In [None]:
############# old NIHMS scrape method
# Fragment kept for reference. It expects `driver` to be on a manuscript page
# and `pmid` plus the lists pubs, pmc, nihms, nihms_status and reviewer to
# exist already. Each list receives exactly one entry per run so they stay
# the same length.
html = driver.find_element(By.CLASS_NAME, 'ms-attrs').get_attribute('innerText')
pubs.append(pmid)
if re.search('PMC', html) is not None:
    pmc.append(re.search('PMC.*?([0-9].*?) NIHMS', html).group(1))
else:
    pmc.append(None)
if re.search('NIHMSID', html) is not None:
    if re.search('NIHMSID.*?([0-9].*?) [A-Za-z].*$', html) is not None:
        nihms.append(re.search('NIHMSID.*?([0-9].*?) [A-Za-z].*$', html).group(1))
    else:
        nihms.append(re.search('NIHMSID.*?([0-9].*)$', html).group(1))
else:
    nihms.append(None)
details = driver.find_element(By.CLASS_NAME, 'box').get_attribute('innerText')
if re.search('Status', details) is not None:
    nihms_status.append(re.search('Status.*?\n(.*?)\n', details).group(1))
else:
    nihms_status.append('Unknown')
if re.search('Reviewer.*?\n', details) is not None:
    if re.search('Reviewer.*?\n(.*?)\n', details) is None:
        reviewer.append(re.search('Reviewer.*?\n(.*?)$', details).group(1))
    else:
        # the original call omitted `details`, which raises TypeError
        reviewer.append(re.search('Reviewer.*?\n(.*?)\n', details).group(1))
else:
    # keep `reviewer` aligned with the other lists when no reviewer is shown
    reviewer.append(None)
############ end old NIHMS scrape method

In [None]:
###### NIHMS selection from method in pub_comp_lib
# Legacy scrape loop copied out of a function body and dedented so the cell
# parses. It expects a logged-in `driver` and an iterable `pmids` of strings.
# Values for one pmid are gathered first and appended once, so a retry can
# never leave the five lists with different lengths.

# Create two different delay lengths for letting the webpage catch up
delay = 5
long_delay = 25

# Pause and let the NIHMS webpage load after logging in
time.sleep(long_delay)

# initialize lists for the nihms status details
pubs = []
pmc = []
nihms = []
nihms_status = []
reviewer = []

for pmid in pmids:
    search_url = 'https://www.nihms.nih.gov/db/sub.cgi?ms_search_query_type=pm&ms_search_query=' + pmid + '&page_b=&link_b=BtnSearchManuscript&choice_b=Search'
    # WAIT? how fast is the page loading once the search button is
    # clicked??
    # Scenario 1, not present in NIHMS, how do we check for this and
    # log it?
    driver.get(search_url)
    alert_banner = driver.find_element(By.TAG_NAME, 'html').get_attribute('innerHTML')

    # values recorded for this pmid unless the detail scrape succeeds
    row_pmc = None
    row_nihms = None
    row_reviewer = None

    if re.search('Manuscript with PubMed ID', alert_banner) is not None:
        row_status = 'Not submitted to NIHMS yet'
    elif re.search('cancel_submission', alert_banner) is not None:
        row_status = 'Awaiting Submission'
    else:
        # stays in place only if all five attempts below fail
        row_status = 'Status Update Failed'
        for attempt in range(5):
            try:
                html = driver.find_element(By.CLASS_NAME, 'ms-attrs').get_attribute('innerText')

                found_pmc = None
                if re.search('PMC', html) is not None:
                    found_pmc = re.search('PMC.*?([0-9].*?) NIHMS', html).group(1)

                found_nihms = None
                if re.search('NIHMSID', html) is not None:
                    trailing_text = re.search('NIHMSID.*?([0-9].*?) [A-Za-z].*$', html)
                    if trailing_text is not None:
                        found_nihms = trailing_text.group(1)
                    else:
                        found_nihms = re.search('NIHMSID.*?([0-9].*)$', html).group(1)

                details = driver.find_element(By.CLASS_NAME, 'box').get_attribute('innerText')

                found_status = 'Unknown'
                if re.search('Status', details) is not None:
                    found_status = re.search('Status.*?\n(.*?)\n', details).group(1)

                found_reviewer = None
                if re.search('Reviewer.*?\n', details) is not None:
                    # the original second lookup omitted `details` (TypeError)
                    followed_by_line = re.search('Reviewer.*?\n(.*?)\n', details)
                    if followed_by_line is None:
                        found_reviewer = re.search('Reviewer.*?\n(.*?)$', details).group(1)
                    else:
                        found_reviewer = followed_by_line.group(1)
            except Exception as err:
                print('NIHMS Search Error: ', str(pmid))
                if attempt == 4:
                    time.sleep(long_delay)
                    print('I tried: ', str(attempt))
                else:
                    time.sleep(delay)
            else:
                row_pmc = found_pmc
                row_nihms = found_nihms
                row_status = found_status
                row_reviewer = found_reviewer
                break

    pubs.append(pmid)
    pmc.append(row_pmc)
    nihms.append(row_nihms)
    nihms_status.append(row_status)
    reviewer.append(row_reviewer)

driver.quit()
nihms_frame = pd.DataFrame(
                            {'pmid': pubs, 'pmc_id': pmc,
                            'nihms_id': nihms, 'nihms_status': nihms_status,
                            'reviewer': reviewer
                            })
################ END NIHMS selected section

In [None]:
###  development code for new login process of era commons linked my bib collection
# Scratch cell: opens a visible Chrome window and leaves it open for manual
# inspection; quit `driver` by hand when finished.
# NOTE(review): the My Bibliography URL carries a hard-coded `?user=` value.

options = Options()
driver = webdriver.Chrome(options = options)
#Log into MyNCBI
driver.get('https://www.ncbi.nlm.nih.gov/myncbi/collections/mybibliography/?user=dsurber')
driver.switch_to.frame(driver.find_element(By.ID, 'loginframe'))
driver.find_element(By.ID, 'nih').click()
time.sleep(10)
#print(driver)
#driver.set_window_size(1440, 900)

# credentials come from config.era_login / config.era_pass
driver.find_element(By.ID, 'USER').send_keys(config.era_login)
driver.find_element(By.ID, 'PASSWORD').send_keys(config.era_pass)
driver.find_element(By.XPATH, '//*[@id="CredSelectorNotice"]/div/button').click()

driver.get('https://www.ncbi.nlm.nih.gov/myncbi/collections/mybibliography/?user=dsurber')

#### when a new tab gets opened up and you need to be in the right one...
#parent_h = driver.current_window_handle
# click on the link that opens a new window
#handles = driver.window_handles # before the pop-up window closes
#handles.remove(parent_h)
#driver.switch_to.window(handles.pop())
#soup = BeautifulSoup(driver.page_source, 'lxml')
#print(soup)

### after done with pmc scrape, navigate into nihms to get status there
driver.get('https://www.nihms.nih.gov/submission/')
driver.find_element(By.XPATH, '//*[@id="react-app"]/div/div/div[2]/div[3]/a').click()