In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, "../../scrapemed")

import pandas as pd
import numpy as np
import re
import lxml.etree as ET
import scrapemed.scrape as scrape
import scrapemed.trees as trees
import scrapemed._clean as _clean
import scrapemed._validate as _validate
from Bio import Entrez 

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [20]:
def view_record_index(records, i, verbose = False):
    record = records[i]
    print(f"----------Record #{i} Returned:-------------------\n{record}")
    try:
        p = record['body']['sec']['p']
        print(f"\nSample paragraph: {p}")
        print(f"\nParagraph items: {p.items()}")
        print(f"\nParagraph values: {p.values()}")
        if verbose:
            print(f"\n...Iterating Paragraph Attrs...\n")
            for attr in dir(p):
                if not attr.startswith("__"):
                    attr_value = getattr(p, attr)
                    if not callable(attr_value):
                        print("\nNon-Callable Attr:\n")
                        print(f"{attr}: {attr_value}")

            for attr in dir(p):
                if not attr.startswith("__"):
                    if callable(attr_value):
                        print(f"\nCallable Attr {attr}:\n")
                        try:
                            attr_value()
                        except TypeError as e:
                            print(f"{attr_value} gave Type Error: {e}")
                        except KeyError as e:
                            print(f"{attr_value} gave Key Error: {e}")
    except TypeError as e:
        pass
    except KeyError as e:
        pass
    print(f"---------------------------------------------\n")


def get_records_and_view(pmcid_list):
    #Specify creds and PMCID
    EMAIL = "danielfrees247@gmail.com"
    DB = 'pmc'
    RETTYPE = 'full'
    RETMODE = 'xml'
    Entrez.email = EMAIL

    #Actually fetch from PMC
    handle = Entrez.efetch(db = DB, id = pmcid_list, rettype = RETTYPE, retmode = RETMODE)
    records = Entrez.read(handle)

    for i in range(len(records)):
        view_record_index(records, i)

In [21]:
get_records_and_view(pmcid_list = [7067710])

----------Record #0 Returned:-------------------
DictElement({'front': {'journal-meta': {'journal-id': DictElement({}, attributes={'journal-id-type': 'iso-abbrev'}), 'journal-title-group': {'journal-title': {}}, 'issn': DictElement({}, attributes={'pub-type': 'epub'}), 'publisher': {'publisher-name': {}, 'publisher-loc': {}}}, 'article-meta': {'article-id': DictElement({}, attributes={'pub-id-type': 'doi'}), 'article-categories': {'subj-group': DictElement({'subject': {}}, attributes={'subj-group-type': 'heading'})}, 'title-group': {'article-title': {}}, 'contrib-group': {'contrib': DictElement({'name': {'surname': {}, 'given-names': {}}, 'xref': DictElement({}, attributes={'ref-type': 'aff', 'rid': 'Aff5'})}, attributes={'contrib-type': 'author'}), 'aff': DictElement({'label': {}}, attributes={'id': 'Aff6'})}, 'pub-date': DictElement({'month': {}, 'year': {}}, attributes={'pub-type': 'ppub'}), 'volume': {}, 'issue': {}, 'fpage': {}, 'lpage': {}, 'permissions': {'copyright-statement': 

In [34]:
handle = Entrez.esearch(db="pmc", retmax=100, term="drug")
record = Entrez.read(handle)
handle.close()
drug_article_sample = record['IdList']
drug_article_sample

['10156464', '10156460', '10156452', '10156445', '10156439', '10156411', '10156174', '10156167', '10156161', '10156151', '10156126', '10156123', '10156121', '10156119', '10156104', '10156058', '10156006', '10155988', '10155987', '10155984', '10155980', '10155961', '10155953', '10155949', '10155941', '10155913', '10155901', '10155898', '10155890', '10155872', '10155836', '10155833', '10155817', '10155808', '10155778', '10155777', '10155759', '10155755', '10155746', '10155742', '10155741', '10155735', '10155713', '10155709', '10155705', '10155702', '10155701', '10155692', '10155686', '10155676', '10155675', '10155631', '10155618', '10155609', '10155607', '10155601', '10155597', '10155591', '10155587', '10155584', '10155580', '10155578', '10155577', '10155572', '10155559', '10155557', '10155531', '10155514', '10155506', '10155495', '10155491', '10155484', '10155480', '10155476', '10155474', '10155470', '10155461', '10155453', '10155448', '10155445', '10155432', '10155429', '10155424', '10

In [64]:
dtd_list = []

#Specify creds and PMCID
EMAIL = "danielfrees247@gmail.com"
DB = 'pmc'
RETTYPE = 'full'
RETMODE = 'xml'
Entrez.email = EMAIL

#Actually fetch from PMC
pmcid_list = drug_article_sample
handle.close()
for pmcid in pmcid_list:
    try:
        record = scrape.get_xml(pmcid = pmcid, email = EMAIL, validate = False)
    except ET.XMLSyntaxError as e:
        print("Found a bad XML.")
    dtd_url_pattern = re.compile(r'"(https?://\S+)"')
    match = dtd_url_pattern.search(record.docinfo.doctype)

    if match:
        url = match.group(1)
        dtd_list.append(url)
    else:
        print("Failed to find DTD!")










Found a bad XML.






Found a bad XML.











Found a bad XML.








Found a bad XML.





















Found a bad XML.

















In [33]:

#get_records_and_view(drug_article_sample)


In [61]:
def get_pmc_dtd_list(search_term, retmax=10):
    dtd_list = []

    #Specify creds and PMCID
    EMAIL = "danielfrees247@gmail.com"
    DB = 'pmc'
    RETTYPE = 'full'
    RETMODE = 'xml'
    Entrez.email = EMAIL

    #Actually fetch from PMC
    handle = Entrez.esearch(db = DB, retmax = retmax, term = search_term)
    pmcid_list = Entrez.read(handle)
    handle.close()
    for pmcid in pmcid_list:
        record = scrape.get_xml(pmcid = pmcid, email = EMAIL, validate = False)
        dtd_url_pattern = re.compile(r'"(https?://\S+)"')
        match = dtd_url_pattern.search(record.docinfo.doctype)

        if match:
            url = match.group(1)
            dtd_list.append(url)
        else:
            print("Failed to find DTD!")
    
    return dtd_list
    

In [62]:

get_pmc_dtd_list('drug')


HTTPError: HTTP Error 400: Bad Request

In [51]:
dtd_list = []
#Specify creds and PMCID
EMAIL = "danielfrees247@gmail.com"
DB = 'pmc'
RETTYPE = 'full'
RETMODE = 'xml'
Entrez.email = EMAIL
pmcid = 10156464
record = scrape.get_xml(pmcid = pmcid, email = EMAIL, validate = False)

print(type(record))
dtd_list.append(record.docinfo)

dtd_list

<class 'lxml.etree._ElementTree'>


[<lxml.etree.DocInfo at 0x7ff589683730>]

'<!DOCTYPE pmc-articleset PUBLIC "-//NLM//DTD ARTICLE SET 2.0//EN" "https://dtd.nlm.nih.gov/ncbi/pmc/articleset/nlm-articleset-2.0.dtd">'