In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, "../../scrapemed")

import pandas as pd
import numpy as np
import re
import lxml.etree as ET
import scrapemed.scrape as scrape
import scrapemed.trees as trees
import scrapemed._clean as _clean
import scrapemed._validate as _validate
from Bio import Entrez 
from urllib.error import HTTPError

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
def view_record_index(records, i, verbose = False):
    """Print record ``i`` of ``records`` together with its front and body.

    After the three dumps, the function tries to read
    ``record['body']['sec']['p']`` as a sample paragraph and prints it along
    with its ``items()`` and ``values()``.

    Args:
        records: Indexable collection of parsed records; each record must
            support ``record['front']`` and ``record['body']``.
        i: Index of the record to display.
        verbose: When True, additionally list every non-dunder attribute of
            the sample paragraph: first the non-callable ones with their
            values, then each callable one together with the result of
            calling it with no arguments.

    Notes:
        * Each callable is looked up and invoked on a fresh shallow copy of
          the paragraph, so mutating methods (e.g. ``clear``/``popitem`` on a
          dict) do not alter the record being viewed. The copy is shallow, so
          nested values are still shared with the original.
        * If the sample paragraph cannot be reached or inspected
          (``TypeError``/``KeyError``/``AttributeError``), a short note is
          printed and the function still finishes normally.
    """
    import copy  # local import: only needed for the verbose probing below

    record = records[i]
    print(f"----------Record #{i} Returned:-------------------\n{record}\n")
    print(f"----------Record #{i} Front:---------------------- \n{record['front']}\n")
    print(f"----------Record #{i} Body:---------------------- \n{record['body']}\n")
    try:
        p = record['body']['sec']['p']
        print(f"\nSample paragraph: {p}")
        print(f"\nParagraph items: {p.items()}")
        print(f"\nParagraph values: {p.values()}")
        if verbose:
            print(f"\n...Iterating Paragraph Attrs...\n")
            public_attrs = [attr for attr in dir(p) if not attr.startswith("__")]

            # Pass 1: plain data attributes.
            for attr in public_attrs:
                attr_value = getattr(p, attr)
                if not callable(attr_value):
                    print("\nNon-Callable Attr:\n")
                    print(f"{attr}: {attr_value}")

            # Pass 2: callables. The attribute is fetched per iteration (the
            # value left over from pass 1 must not be reused) and bound to a
            # throwaway copy so the call cannot mutate ``p`` itself.
            for attr in public_attrs:
                if not callable(getattr(p, attr)):
                    continue
                attr_value = getattr(copy.copy(p), attr)
                print(f"\nCallable Attr {attr}:\n")
                try:
                    print(attr_value())
                except TypeError as e:
                    # Typically: the callable requires positional arguments.
                    print(f"{attr_value} gave Type Error: {e}")
                except KeyError as e:
                    print(f"{attr_value} gave Key Error: {e}")
    except (TypeError, KeyError, AttributeError) as e:
        # Best effort only: not every record exposes a single dict-like
        # paragraph at body -> sec -> p. Report it instead of hiding it.
        print(f"\nCould not inspect record['body']['sec']['p']: {e!r}")
    print(f"---------------------------------------------\n")


def get_records_and_view(pmcid_list):
    """Fetch the given PMC articles through Entrez and print each parsed record.

    Args:
        pmcid_list: PMC id(s) passed straight to ``Entrez.efetch`` as ``id``.

    Side effects:
        Sets ``Entrez.email``, performs a network request against the ``pmc``
        database, and prints every returned record via ``view_record_index``.
    """
    #Specify creds and PMCID
    EMAIL = "danielfrees247@gmail.com"
    DB = 'pmc'
    RETTYPE = 'full'
    RETMODE = 'xml'
    Entrez.email = EMAIL

    #Actually fetch from PMC
    handle = Entrez.efetch(db = DB, id = pmcid_list, rettype = RETTYPE, retmode = RETMODE)
    try:
        records = Entrez.read(handle)
    finally:
        # Release the connection even when parsing fails.
        handle.close()

    for i in range(len(records)):
        view_record_index(records, i)

In [3]:
# Smoke test: fetch one PMC article (id 7067710) and print its parsed record.
get_records_and_view(pmcid_list = [7067710])

----------Record #0 Returned:-------------------
DictElement({'front': {'journal-meta': {'journal-id': DictElement({}, attributes={'journal-id-type': 'iso-abbrev'}), 'journal-title-group': {'journal-title': {}}, 'issn': DictElement({}, attributes={'pub-type': 'epub'}), 'publisher': {'publisher-name': {}, 'publisher-loc': {}}}, 'article-meta': {'article-id': DictElement({}, attributes={'pub-id-type': 'doi'}), 'article-categories': {'subj-group': DictElement({'subject': {}}, attributes={'subj-group-type': 'heading'})}, 'title-group': {'article-title': {}}, 'contrib-group': {'contrib': DictElement({'name': {'surname': {}, 'given-names': {}}, 'xref': DictElement({}, attributes={'ref-type': 'aff', 'rid': 'Aff5'})}, attributes={'contrib-type': 'author'}), 'aff': DictElement({'label': {}}, attributes={'id': 'Aff6'})}, 'pub-date': DictElement({'month': {}, 'year': {}}, attributes={'pub-type': 'ppub'}), 'volume': {}, 'issue': {}, 'fpage': {}, 'lpage': {}, 'permissions': {'copyright-statement': 

In [4]:
# Search the PMC database for the term "drug", asking for at most 10 hits.
handle = Entrez.esearch(db="pmc", retmax=10, term="drug")
record = Entrez.read(handle)
handle.close()
# The parsed search result is dict-like; the matching ids live under 'IdList'.
drug_article_sample = record['IdList']
# Bare last expression so the notebook displays the sampled ids.
drug_article_sample

['10191793', '10191786', '10191785', '10191755', '10191748', '10191746', '10191738', '10191736', '10191730', '10191713']

In [5]:
# Collect the DTD URL declared in the DOCTYPE of every sampled article.
dtd_list = []

#Specify creds and PMCID
EMAIL = "danielfrees247@gmail.com"
DB = 'pmc'
RETTYPE = 'full'
RETMODE = 'xml'
Entrez.email = EMAIL

# Matches a double-quoted http(s) URL inside the DOCTYPE declaration.
# Compiled once here rather than on every loop iteration.
dtd_url_pattern = re.compile(r'"(https?://\S+)"')

#Actually fetch from PMC
pmcid_list = drug_article_sample
for pmcid in pmcid_list:
    try:
        record = scrape.get_xml(pmcid = pmcid, email = EMAIL, validate = False)
    except ET.XMLSyntaxError:
        print("Found a bad XML.")
        # Skip this article; otherwise the previous iteration's record would
        # be inspected again and its DTD counted twice.
        continue
    except HTTPError:
        print("HTTP Error with Entrez servers.")
        continue

    match = dtd_url_pattern.search(record.docinfo.doctype)

    if match:
        url = match.group(1)
        dtd_list.append(url)
    else:
        print("Failed to find DTD!")













In [6]:
#WRITE OUT SCAN RESULTS
# Report the scan on screen, then persist the raw list and the de-duplicated
# set as their Python reprs in two text files in the working directory.
dtd_set = set(dtd_list)
count_line = f"# of DTDs found: {len(dtd_list)}\n"

print(count_line)
print(f"Unique DTDs: {dtd_set}\n")

with open('dtd_list.txt', 'w') as f:
    f.writelines([count_line, str(dtd_list)])
with open('dtd_set.txt', 'w') as f:
    f.write(str(dtd_set))

# of DTDs found: 10

Unique DTDs: {'https://dtd.nlm.nih.gov/ncbi/pmc/articleset/nlm-articleset-2.0.dtd'}



In [7]:

#get_records_and_view(drug_article_sample)


In [8]:
def get_pmc_dtd_list(search_term, retmax=10):
    """Search PMC and return the DTD URL declared by each article found.

    Args:
        search_term: Query string passed to ``Entrez.esearch`` as ``term``.
        retmax: Maximum number of search hits to scan (default 10).

    Returns:
        list[str]: One DTD URL per article whose DOCTYPE contains a quoted
        http(s) URL, in search-result order. Articles that cannot be fetched
        or parsed, or whose DOCTYPE has no such URL, are reported on stdout
        and skipped.
    """
    dtd_list = []

    #Specify creds and PMCID
    EMAIL = "danielfrees247@gmail.com"
    DB = 'pmc'
    Entrez.email = EMAIL

    # Matches a double-quoted http(s) URL inside the DOCTYPE declaration.
    dtd_url_pattern = re.compile(r'"(https?://\S+)"')

    #Actually fetch from PMC
    handle = Entrez.esearch(db = DB, retmax = retmax, term = search_term)
    try:
        # The parsed search result is dict-like; iterating it directly would
        # walk its keys, so pull the actual ids out of 'IdList'.
        pmcid_list = Entrez.read(handle)['IdList']
    finally:
        handle.close()

    for pmcid in pmcid_list:
        try:
            record = scrape.get_xml(pmcid = pmcid, email = EMAIL, validate = False)
        except ET.XMLSyntaxError:
            # One malformed article should not abort the whole scan.
            print("Found a bad XML.")
            continue
        except HTTPError:
            print("HTTP Error with Entrez servers.")
            continue

        match = dtd_url_pattern.search(record.docinfo.doctype)

        if match:
            url = match.group(1)
            dtd_list.append(url)
        else:
            print("Failed to find DTD!")

    return dtd_list
    

In [9]:

#get_pmc_dtd_list('drug')


In [10]:
# Single-article probe: record the DOCTYPE public id of one PMC article.
# Note: this rebinds the notebook-level name dtd_list to a fresh list.
dtd_list = []
#Specify creds and PMCID
EMAIL = "danielfrees247@gmail.com"
DB = 'pmc'
RETTYPE = 'full'
RETMODE = 'xml'
Entrez.email = EMAIL
pmcid = 10156464
record = scrape.get_xml(pmcid = pmcid, email = EMAIL, validate = False)
# docinfo.public_id is the DOCTYPE's public identifier (not the DTD URL).
dtd_list.append(record.docinfo.public_id)

# Show which object type scrape.get_xml handed back.
print(type(record))




# Bare last expression so the notebook displays the collected id(s).
dtd_list

<class 'lxml.etree._ElementTree'>





['-//NLM//DTD ARTICLE SET 2.0//EN']