In [1]:
# load start of pub_comp_src.py
import time
import logging
from Bio import Entrez
import numpy as np
import pandas as pd
from redcap import Project
from datetime import datetime

### Get access keys from the setup file - config.py
import config
import pub_comp_lib

# ## !!** For DEV
from importlib import reload
# usage: reload(name_of_module)  -- only `reload` is imported above, not `importlib`
# ## !!** For DEV

# Wall-clock reference (seconds since the epoch) captured when this cell runs.
start = time.time()

# Root logging configuration: records at DEBUG level and above are written to
# test.log, each formatted as "<timestamp>:<level name>:<message>".
logging.basicConfig(
    format="%(asctime)s:%(levelname)s:%(message)s",
    level=logging.DEBUG,
    filename="test.log",
)

In [None]:
# run pubmed query section of pub_comp_src.py
# Logger named after the current module; used by every step below.
logger = logging.getLogger(__name__)

# Pass every configured grant through pub_comp_lib.clean, writing the result
# back into the same slot so config.grant_list is updated in place.
# (Per the original notes, clean removes whitespace and leading/trailing
# hyphens.)
for idx, raw_grant in enumerate(config.grant_list):
    config.grant_list[idx] = pub_comp_lib.clean(raw_grant)

### Pool the variations that pub_comp_lib.variety produces for each grant
### (the original notes mention 34 variations per grant)
variations = [
    variant
    for grant in config.grant_list
    for variant in pub_comp_lib.variety(grant)
]

### Prepare to collect pmids from pubmed for every grant variation
# Credentials attached to the Entrez client for the queries below.
Entrez.api_key = config.ncbi_api
Entrez.email = "Your.Name.Here@example.org"

# pmids          -> set, so an id returned for several variations is kept once
# pubmed_results -> list that the query loop fills with search records
pmids, pubmed_results = set(), []
logger.info("Starting pubmed queries...")

# Query PubMed once per grant variation, allowing up to three attempts each.
#
# For every variation an ESearch is issued against the 'grant' field.  When the
# search reports at least one hit, the record is kept in pubmed_results and its
# ids are merged into pmids.  Any exception raised while querying or processing
# counts as a failed attempt: it is logged, the loop pauses two seconds and
# then tries again.
for grant in variations:
    for attempt in range(1, 4):
        try:
            handle = Entrez.esearch(db='pubmed', term=grant,
                                    field='grant', retmax=5000,
                                    usehistory='y', retmode='xml')
            try:
                record = Entrez.read(handle)
            finally:
                # Close the handle even when parsing fails, so a retry does
                # not leave the previous response open.
                handle.close()

            hit_count = int(record['Count'])
            if hit_count > 0:
                pubmed_results.append(record)
                pmids.update(record['IdList'])
                logger.info('Entrez ESearch returns %i Ids for %s',
                            hit_count, grant)
                # Count is the total number of matches, while IdList is capped
                # by retmax; make a truncated id list visible in the log.
                if hit_count > len(record['IdList']):
                    logger.warning(
                        'Only %i of %i Ids were retrieved for %s; '
                        'raise retmax or page through the results.',
                        len(record['IdList']), hit_count, grant)
            break  # success: no further attempts for this variation
        except Exception as err:
            logger.warning('Received error from server: %s', err)
            logger.warning('Attempt %i of 3 for grant %s.', attempt, grant)
            time.sleep(2)
    else:
        # Reached only when all three attempts raised (no break executed).
        logger.error('Giving up on grant %s after 3 failed attempts.', grant)
    logger.debug('Grant %s queried.', grant)

logger.info('All grant queries complete.')

### Reconcile the pmid set with a REDCap project, when one is configured
# Runs only when both the REDCap token and URI are set in config.
#   old_pmids : pmids already stored in the REDCap project
#   new_pmids : pmids found on pubmed that REDCap does not have yet
# After this block, pmids holds the union of both sources.
if config.rc_token is not None and config.rc_uri is not None:
    project = Project(config.rc_uri, config.rc_token)
    # pull the 'pmid' field of every record in the project
    rc_pmids = project.export_records(fields=['pmid'], format='json')
    old_pmids = [rc_pmid['pmid'] for rc_pmid in rc_pmids]
    new_pmids = list(pmids.difference(old_pmids))   # newly discovered pmids
    pmids.update(old_pmids)
    if new_pmids:
        # Date of first discovery, formatted YYYY-MM-DD in local time.
        # time.strftime is used instead of datetime.today(): a later cell runs
        # `import datetime`, which rebinds the name `datetime` to the module
        # and would make `datetime.today()` fail if this cell is re-run.
        first_disc = [time.strftime("%Y-%m-%d")] * len(new_pmids)
        # two-column frame (pmid, first_discovered), imported into REDCap
        data = pd.DataFrame(np.column_stack([new_pmids, first_disc]),
                            columns=['pmid', 'first_discovered'])
        response = project.import_records(data)

### Build the table of publication details from pubmed for the collected pmids
pubs_frame = pub_comp_lib.summary(pmids, config.ncbi_api, variations)

# pmc_status is '1' for rows whose pmcid is not blank, and '' otherwise
pubs_frame['pmc_status'] = np.where(pubs_frame['pmcid'].ne(''), '1', '')

# save the table (written before the blank -> NaN conversion below)
pubs_frame.to_csv('batch_pubmed_frame.csv', index=False)

# turn blank cells into NaN - makes column merging easier
pubs_frame[pubs_frame.eq('')] = np.nan

In [2]:
# load pub_comp_lib.py libraries
from Bio import Entrez
from Bio.Entrez import efetch
from Bio.Entrez import read
import regex as re
import datetime
import time
import logging
import pandas as pd
import time
import bs4

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [3]:
# Log in through the project helper using the eRA credentials from config.
# The returned object is used below as a browser driver (.get, .page_source).
driver = pub_comp_lib.pacm_login(config.era_login, config.era_pass)
# Fixed five-second pause; presumably gives the post-login page time to
# finish loading -- TODO confirm, an explicit wait may be more reliable.
time.sleep(5)

In [4]:
# Root of the PACM publication URL; a pmid is appended to it to address one
# publication's page.
pacm_root = 'https://www.ncbi.nlm.nih.gov/pmc/utils/pacm/l/'

# Sample pmid (kept as a string so it can be concatenated onto pacm_root),
# used while developing the module.
pmid = '29802980'

In [5]:
# Point the logged-in browser at the PACM page for the sample pmid.
driver.get(pacm_root+pmid)

In [6]:
# Read the markup of the loaded page: the inner HTML of the root <html> element.
# find_element(By.TAG_NAME, ...) is used because the find_element_by_tag_name
# helper was deprecated in Selenium 4 and removed in later 4.x releases; the
# By-based call works on both old and new Selenium versions.
pacm_html = driver.find_element(By.TAG_NAME, 'html').get_attribute('innerHTML')

In [16]:
from bs4 import BeautifulSoup

In [17]:
# Parse the page currently loaded in the browser, using the lxml parser.
soup = BeautifulSoup(driver.page_source, 'lxml')

In [18]:
# Dump the whole parsed document for inspection (the output is long).
print(soup)

<html><head>
<title>PA Compliance Monitor</title>
<link href="https://www.ncbi.nlm.nih.gov/pmc/utils/pacm/static/jquery-ui-1.11.3/jquery-ui.min.css" rel="stylesheet"/>
<link href="https://www.ncbi.nlm.nih.gov/pmc/utils/pacm/static/css/archive.css" rel="STYLESHEET" type="text/css"/>
<link href="https://www.ncbi.nlm.nih.gov/pmc/utils/pacm/static/css/ncbi.css" rel="STYLESHEET" type="text/css"/>
<script src="https://www.ncbi.nlm.nih.gov/pmc/utils/pacm/static/js/jquery-2.0.3.min.js" type="text/javascript"></script>
<script src="https://www.ncbi.nlm.nih.gov/pmc/utils/pacm/static/jquery-ui-1.11.3/jquery-ui.min.js" type="text/javascript"></script>
<script src="https://www.ncbi.nlm.nih.gov/pmc/utils/pacm/static/js/archive.js" type="text/javascript"></script>
<style type="text/css">
#title {
    margin: 8px 0px 8px 0px;
    font-size: 14px;
    font-weight: bold;
}

#content {
/*    font-size: 12px; */
    min-height: 80%;
}

#bottommenu {
    text-align: center;
    border-top: 1px solid #a0c0e

In [23]:
# Take the third <table> on the page (index 2).  In the recorded output this
# is the 'qa-table' holding the PM ID, PMC ID, NIHMS ID and Status cells.
# NOTE(review): the lookup is positional, so it depends on the page layout
# staying the same -- confirm before relying on it for other pmids.
table = soup.find_all('table')[2]

In [24]:
# Show the raw markup of the selected table.
print(table)

<table class="qa-table" width="100%"><tbody><tr>
<td style="text-align: center; width: 25%;">PM ID: <a href="http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=retrieve&amp;db=pubmed&amp;list_uids=29802980&amp;dopt=Citation" target="mainwindow">29802980</a>|<a href="http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=retrieve&amp;db=pubmed&amp;list_uids=29802980&amp;dopt=XML" target="_blank">XML</a></td>
<td style="text-align: center; width: 25%;">PMC ID: <a href="http://www.ncbi.nlm.nih.gov/pmc/articlerender.fcgi?artid=" target="mainwindow"></a></td>
<td style="text-align: center; width: 25%;">NIHMS ID: 1064214</td>
<td style="text-align: center; width: 25%;">Status: Not Compliant</td></tr>
<tr><td colspan="4"> </td></tr>
<tr><td colspan="4" style="text-align: left;">The Amish have decreased asthma and allergic diseases compared with old order Mennonites.</td></tr>
<tr><td colspan="4" style="text-align: left;">Jamee C Tantoco, Jordan Elliott Bontrager, Qianqian Zhao, James Deline, Christine M Sero