In [1]:
#!conda list
#just checking that vscode was correctly syncing with conda env

#sample article to work with
#"Ibuprofen for acute treatment of episodic tension‐type headache in adults"
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6457940/

#IMPORTS
import pandas as pd
import numpy as np
import sklearn
#from requests_html import HTMLSession
import requests
from requests.exceptions import ConnectionError

# Paper Scraping Exploration 

*Author: Daniel Frees, Email: daniuelfrees247@gmail.com, Last Updated: 02/22/23*

*(Goal: Parse PMC Articles into Standard Format, Evaluate Parse Quality, Store in DB)*

1. Search for relevant papers via NCBI
    >todo: focusing on a small sample of papers first  
    >notes: will use Entrez  

1. Access XML via NCBI √
    >notes: using Biopython's Entrez to access the NCBI's XML data for papers. Yields useful metadata

1. Access full paper data via NCBI √
    >Tried So Far:
    >>•Entrez √ 
      
    >>•BioC RESTful API (Pubmed says this yields full data for Open Access articles, but it does not in any of my experimentation here, regardless of json vs. xml).  
      
    >>•FTP should supposedly work (https://www.biostars.org/p/159761/), but seems to be a convoluted mess. Sticking with Entrez.

1. Convert downloaded full paper data into standard format [IN PROGRESS]
    >notes: use ElementTree library and XPath to parse XMLs  
    >[XML NCBI Guidelines](https://www.ncbi.nlm.nih.gov/pmc/pmcdoc/tagging-guidelines/article/style.html)  
    >can run with retmode = 'json' but does not return most of the needed information. Unfortunate as this could have been useful for identifying problematic html styling tags 
    >Use json request from Entrez to identify html tags that need removal, remove them and use Xpath on the resulting XML to create standardized format.

1. Evaluate success of conversion (0 = perfect, 1 = success, but not all sections found, 2 = failure)

1. Use SQL Alchemy to store perfect papers in a table, successful imperfect papers in another table

In [2]:
#SET UP desired format for papers retrieved
#Using an object so this can be SQL Alchemy friendly when I scale to connection with the backend
from sqlalchemy import create_engine, ForeignKey, Column, String, Integer, CHAR, Boolean
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()

class Paper(Base):
    __tablename__ = "Papers"

    #paper identifiers, DOI should be a completely unique string and is PK
    doi = Column("doi", String, primary_key=True)
    title = Column("title", String)
    pmid = Column("pmid", String)  #for articles in PMC
    authors = Column("authors", String)  #list of authors of paper, could potentially blow out into multiple columns later
    journal = Column("journal", String)

    #included in case some papers don't properly parse into the paper sections following
    #papers are very very frequently formatted different ways and may contain extra sections/
    #be missing sections from the following. 
    #so full_text will likely need to be a fall back for most papers
    full_text = Column("full_text", String) 

    #paper sections
    abstract = Column("abstract", String)
    background = Column("background", String)
    methods = Column("methods", String)
    results = Column("results", String)
    discussion = Column("discussion", String)
    conclusion = Column("conclusion", String)
    acknowledgements = Column("acknowledgements", String)

# BioC API Requests

In [3]:
import requests
from requests.exceptions import ConnectionError
import xml.etree.ElementTree as ET


#Basic REST API endpoint provided by NCBI. Doesn't seem to retrieve the entire paper
#set up paper request
def get_bioc_data(pmid, file_format = 'json', encoding = 'unicode', verbose = False):
    PMID =  pmid    #for ibuprofen: "26230487", for more standard format "
    req = f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pubmed.cgi/BioC_{file_format}/{PMID}/{encoding}"

    response = requests.get(req)
   
    if verbose:
        print(f"Response type received: {type(response)}")

    if file_format == 'json':
        data = response.json()
    elif file_format == 'xml':
        data = ET.ElementTree(ET.fromstring(response.text))
    else:
        print("No data to return. Please specify a file_format of json or xml.")
    return data

#Ibuprofen article
#get_bioc_data(26230487)

In [4]:
#Cetirizine in the dog
#get_bioc_data(30477556)

In [5]:
#Open Access children ibuprofen use lit review
#get_bioc_data(28597358)

In [6]:
#definitely open access
#get_bioc_data(36629266)

In [7]:
#definitely open access
oa_tree = get_bioc_data(36629266, file_format = 'xml', verbose = True)
element = oa_tree.getroot()
ET.indent(element)
print(ET.tostring(element, encoding='unicode'))

Response type received: <class 'requests.models.Response'>
<collection>
  <source>PubMed</source>
  <date>20230225</date>
  <key>collection.key</key>
  <document>
    <id>36629266</id>
    <passage>
      <infon key="type">title</infon>
      <offset>0</offset>
      <text>Hypothesis-driven probabilistic modelling enables a principled perspective of genomic compartments.</text>
    </passage>
    <passage>
      <infon key="type">abstract</infon>
      <offset>100</offset>
      <text>The Hi-C method has revolutionized the study of genome organization, yet interpretation of Hi-C interaction frequency maps remains a major challenge. Genomic compartments are a checkered Hi-C interaction pattern suggested to represent the partitioning of the genome into two self-interacting states associated with active and inactive chromatin. Based on a few elementary mechanistic assumptions, we derive a generative probabilistic model of genomic compartments, called deGeco. Testing our model, we find it 

# Entrez Data Collection, HTML tag removal, XML Parsing

In [8]:
import re

#Cleanup record bytes HTML styling
def remove_html_styling(text, removals = ["<italic>", "<i>", "<bold", "<b>", "<underline>", "<u>"], replaces = {"<sub>": "_"}, verbose = False):
    """
    Removes <italic>,<i>,<bold>,<b>,<underline>,<u> opening and closing tags.
    Replaces <sub> opening tags with _, and removes <sub> closing tags.

    html tags should not contain regex special characters, hence no escaping necessary. 
    """
    to_remove = removals
    more_to_remove = []
    to_replace = replaces 
    for tag in to_remove:
        more_to_remove.append(tag[0] + "/" + tag[1:]) #remove closing tags as well
    to_remove.extend(more_to_remove)
    for tag in to_replace.keys():
        to_remove.append(tag[0] + "/" + tag[1:]) #remove closing tags for those opening tags being replaced
    if verbose:
        print("Removing the following tags:\n")
        print(to_remove)
        print()
        print("Making the following replacements:\n")
        for find, replace in to_replace.items():
            print(f"{find} replaced with {replace}\n")
    removal_pattern = '|'.join(to_remove)
    text = re.sub(removal_pattern, '', text)
    for find, replace in to_replace.items():
        text = re.sub(find, replace, text)
    return text

test_text = ("Hello my name is Daniel, my <italic>favorite</italic> chemical is <i>C</i><sub>4</sub>. "
"<b>I</b> also wanted to say that <underline>you</underline> should <u>use this code as a test to make sure "
"html tagging removal is going as expected.")
remove_html_styling(test_text, verbose = True)

Removing the following tags:

['<italic>', '<i>', '<bold', '<b>', '<underline>', '<u>', '</italic>', '</i>', '</bold', '</b>', '</underline>', '</u>', '</sub>']

Making the following replacements:

<sub> replaced with _



'Hello my name is Daniel, my favorite chemical is C_4. I also wanted to say that you should use this code as a test to make sure html tagging removal is going as expected.'

In [9]:
#Use Entrez to request data. This library was built specifically for querying NCBI databases.
from Bio import Entrez

#Credential and PMCID
Entrez.email = 'danielfrees247@gmail.com'
PMCID = 7067710 #Acetaminophen and Ibuprofen Study

#GET XML
handle = Entrez.efetch(db = 'pmc', id = PMCID, rettype = 'full', retmode = 'xml')
xml_record = handle.read()
print(f"XML Record First 100 bytes: {xml_record[0:100]}")

#GET JSON (runs but does not yield most of the information unfortunately)
#handle = Entrez.efetch(db = 'pmc', id = PMCID, rettype = 'full', retmode = 'json')
#json_record = handle.read()
#print(f"JSON Record First 100 bytes: {json_record[0:100]}")

#Save text of the XML for investigation
with open(f"data/entrez_download_PMCID={PMCID}_xml.txt", "w") as f:
    f.write(xml_record.decode(encoding = "utf-8"))
#Save text of the JSON for investigation
#with open(f"data/entrez_download_PMCID={PMCID}_json.txt", "w") as f:
    #f.write(json_record.decode(encoding = "utf-8"))



XML Record First 100 bytes: b'<?xml version="1.0" ?>\n<!DOCTYPE pmc-articleset PUBLIC "-//NLM//DTD ARTICLE SET 2.0//EN" "https://dt'


In [10]:

#Visualize XML tree with graph viz
from graphviz import Digraph

def visualize_element_tree(element, title = 'data/element_tree.gv'):
    """Visualize an XML element tree using Graphviz."""
    dot = Digraph()
    _add_elements(dot, element)
    dot.render(title, view=True)

def _add_elements(dot, element, parent=None):
    """Recursively add elements to a Graphviz dot graph."""
    if parent is not None:
        dot.edge(parent, element.tag)
    dot.node(element.tag, element.tag)
    for child in element:
        _add_elements(dot, child, element.tag)

## Visualize XML Tree

In [11]:
import xml.etree.ElementTree as ET


#Convert bytestream to XML tree
record_tree = ET.ElementTree(ET.fromstring(xml_record))

#Parse XML Tree
root = record_tree.getroot()
print(f"\nXML Tree Info (PMCID:{PMCID})")
print("------------------------------")
print(f"Root Tag: {root.tag}")
print(f"Root Attributes: {root.attrib}\n")
print("------------------------------")
for child in root:
    print(f"\tChild Tag: {child.tag}")
    print(f"\tChild Attributes: {child.attrib}\n")

all_element_types = set([elem.tag for elem in root.iter()])
print(f"Unique Element Types: {all_element_types}")

#Visualize uncleaned tree
visualize_element_tree(root, title = f"data/{PMCID}_element_tree.gv")
#These are a whole mess for PMID articles it seems as the tree is quite large
#Furthermore, there are some problematic html stylings with italics etc. which can cause data loss

#Visualize tree after removing problematic html styling
xml_record_text = xml_record.decode(encoding = "utf-8")
xml_record_clean = remove_html_styling(xml_record_text)
clean_tree = ET.ElementTree(ET.fromstring(xml_record_clean))
clean_root = clean_tree.getroot()
visualize_element_tree(clean_root, title = f"data/{PMCID}_element_tree_clean.gv")




XML Tree Info (PMCID:7067710)
------------------------------
Root Tag: pmc-articleset
Root Attributes: {}

------------------------------
	Child Tag: article
	Child Attributes: {'article-type': 'research-article'}

Unique Element Types: {'td', 'person-group', 'lpage', 'copyright-statement', 'article', 'title-group', 'given-names', 'source', 'fn-group', 'award-group', 'license', 'article-meta', 'tbody', 'abstract', 'p', 'sub', 'issn', 'institution-id', 'journal-id', 'suffix', 'table-wrap-foot', 'ref-list', 'subject', 'tr', 'notes', 'funding-source', 'label', 'month', 'year', 'ext-link', 'day', 'element-citation', 'journal-title-group', 'sec', 'address', 'meta-value', 'fig', 'break', 'volume', 'mixed-citation', 'permissions', 'body', 'table', 'pub-date', 'institution', 'issue', 'thead', 'etal', 'email', 'th', 'article-id', 'journal-title', 'custom-meta', 'ref', 'italic', 'publisher-loc', 'bold', 'contrib-group', 'front', 'xref', 'ack', 'custom-meta-group', 'pmc-articleset', 'meta-name',

In [12]:
#XPath Parsing functions

#Work with the cleaned data
root = clean_root

#Parsing abstract
root_path = './/*/abstract/'
num_abstract_sections = len(root.findall(root_path)) #grab number of abstract sections
for i in range (1, num_abstract_sections+1):
    section_path = root_path + f"/sec[{i}]/"
    section_title_path = section_path + "title"
    section_text_path = section_path + "p"
    print(f"Section: {root.findall(section_title_path)[0].text}--------------")
    print(root.findall(section_text_path)[0].text)
    print()
    print("-----------------------------------------\n")

#Parsing body
num_body_sections = len(root.findall('.//*body/')) #grab number of sections

Section: Introduction--------------
A fixed-dose combination (FDC) of ibuprofen and acetaminophen has been developed that provides greater analgesic efficacy than either agent alone at the same doses without increasing the risk for adverse events.

-----------------------------------------

Section: Methods--------------
We report three clinical phase I studies designed to assess the pharmacokinetics (PK) of the FDC of ibuprofen/acetaminophen 250/500 mg (administered as two tablets of ibuprofen 125 mg/acetaminophen 250 mg) in comparison with its individual components administered alone or together, and to determine the effect of food on the PK of the FDC. Two studies in healthy adults aged 18–55 years used a crossover design in which subjects received a single dose of each treatment with a 2-day washout period between each. In the third study, the bioavailability of ibuprofen and acetaminophen from a single oral dose of the FDC was assessed in healthy adolescents aged 12–17 years, incl

In [13]:
#root.findall(".//*/abstract/sec[3]/p/italic[1]")[0].text
#no longer a problem since we've cleaned the tree of styling

In [14]:
root.findall(".//*/table/")

[<Element 'tbody' at 0x7fb0869da250>,
 <Element 'thead' at 0x7fb0869e5080>,
 <Element 'tbody' at 0x7fb0869e5440>,
 <Element 'thead' at 0x7fb0869e78d0>,
 <Element 'tbody' at 0x7fb0869e7e70>,
 <Element 'thead' at 0x7fb0869f87c0>,
 <Element 'tbody' at 0x7fb0869f8d60>,
 <Element 'thead' at 0x7fb0869fa750>,
 <Element 'tbody' at 0x7fb0869faca0>,
 <Element 'thead' at 0x7fb086a088b0>,
 <Element 'tbody' at 0x7fb086a08db0>,
 <Element 'thead' at 0x7fb086a0aa70>,
 <Element 'tbody' at 0x7fb086a0b060>,
 <Element 'thead' at 0x7fb086a20bd0>,
 <Element 'tbody' at 0x7fb086a21030>,
 <Element 'thead' at 0x7fb086a23e70>,
 <Element 'tbody' at 0x7fb086a30540>,
 <Element 'thead' at 0x7fb086a32890>,
 <Element 'tbody' at 0x7fb086a33510>]

In [15]:
#Current roadblock is that I think I may need to fork and PR Biopython to fix their math-formatted text parsing. Text breaks when reading in any italics, the italics are saved in a separate path but the rest of the non-italicized text disappears.