In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, "../../scrapemed")

import pandas as pd
import numpy as np
import re
import lxml.etree as ET
import scrapemed.scrape as scrape
import scrapemed.trees as trees
import scrapemed._clean as _clean
import scrapemed._validate as _validate

# Lift pandas' display truncation so wide/long tables are rendered in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Efficient example of using the `scrape` module

In [3]:
# Fetch one PMC article twice through scrape.get_xml -- once with
# strip_text_styling=False and once with strip_text_styling=True -- and build a
# data dictionary from each tree. `full_xml` and `clean_data_dict` are read again
# by later cells.

#Specify creds and PMCID
PMCID = 7067710
# NOTE(review): a personal e-mail address is hard-coded here; consider reading it
# from an environment variable before sharing or committing the notebook.
email = "danielfrees247@gmail.com"

#get messy tree for data definitions
# NOTE(review): download=True presumably also stores a local copy of the XML --
# confirm against scrape.get_xml.
full_xml = scrape.get_xml(pmcid = PMCID, email = email, strip_text_styling=False, download=True)
full_data_dict = trees._generate_data_dictionary(full_xml)
full_root = full_xml.getroot()


#see validate xml separately
#scrape._validate_xml(full_xml)

#clean it up
# Second fetch of the same article, this time with text styling stripped.
clean_xml = scrape.get_xml(pmcid= PMCID, email = email, strip_text_styling=True, verbose = False)
clean_root = clean_xml.getroot()

#uncomment if you want to generate new pdfs of the element tree visualizations
#scrape.visualize_element_tree(clean_root, title = f"data/{PMCID}_element_tree_clean.gv")
#scrape.visualize_element_tree(full_root, title = f"data/{PMCID}_element_tree_full.gv")

#basic description
# The summary shown under this cell (element count, unique element types) appears
# to be produced by this call.
trees.investigate_xml_tree(clean_xml)
clean_data_dict = trees._generate_data_dictionary(clean_xml)


Num Elements: 1683
Unique Element Types: {'journal-title', 'thead', 'title', 'ref-list', 'notes', 'publisher', 'suffix', <cyfunction ProcessingInstruction at 0x7ff78fe26400>, 'etal', 'journal-id', 'month', 'pub-date', 'abstract', 'subject', 'funding-group', 'funding-source', 'contrib-group', 'back', 'person-group', 'journal-title-group', 'article-meta', 'license', 'meta-name', 'p', 'mixed-citation', 'lpage', 'article-id', 'sc', 'fpage', 'table-wrap-foot', 'table', 'td', 'email', 'issn', 'break', 'ext-link', 'xref', 'subj-group', 'given-names', 'volume', 'issue', 'th', 'custom-meta', 'contrib', 'pmc-articleset', 'sec', 'permissions', 'publisher-loc', 'name', 'fn-group', 'pub-id', 'address', 'publisher-name', 'source', 'element-citation', 'surname', 'tr', 'institution-wrap', 'day', 'article-title', 'front', 'journal-meta', 'meta-value', 'article-categories', 'ref', 'title-group', 'license-p', 'article', 'award-group', 'fn', 'institution', 'caption', 'fig', 'copyright-statement', 'custom-

In [4]:
# Build a lookup table of every known (tag, attr, val) combination found in the
# cleaned article, indexed by a 3-level MultiIndex and flagged with supported == 1.
# `data_dict` is expected to map tag -> {attr -> list of attribute values}, which
# is the shape the data dictionaries printed elsewhere in this notebook have.
data_dict = clean_data_dict

# Level labels; only needed if the table is exported with the to_csv line below.
header_names = ["tag", "attr", "val"]

# Lower-case every component so lookups are case-insensitive.
lowered_combos = (
    (tag.lower(), attr.lower(), val.lower())
    for tag, attrs in data_dict.items()
    for attr, vals in attrs.items()
    for val in vals
)
# Lower-casing can collapse distinct spellings (e.g. 'Blue' and 'blue') into the
# same combo. dict.fromkeys drops such repeats while keeping first-seen order, so
# the index below stays unique and .loc lookups return a single value.
multi_indexed_tuples = list(dict.fromkeys(lowered_combos))

m_index = pd.MultiIndex.from_tuples(multi_indexed_tuples)
# A scalar broadcast over the index gives the same int64 "supported" Series that
# was previously obtained by building a throwaway DataFrame and selecting a column.
data_dict_df = pd.Series(1, index=m_index, name="supported")
data_dict_df.head(300)
#data_dict_df.to_csv("test.csv", index_label = header_names)

article           article-type                        research-article                                                         1
journal-id        journal-id-type                     nlm-ta                                                                   1
                                                      iso-abbrev                                                               1
issn              pub-type                            ppub                                                                     1
                                                      epub                                                                     1
article-id        pub-id-type                         pmid                                                                     1
                                                      pmc                                                                      1
                                                      publisher-id                               

In [5]:
# Re-derive the root of the unstripped tree; `full_xml` comes from an earlier cell,
# so this repeats an assignment that was already made there.
full_root = full_xml.getroot()
# Show the text of the first <ref> match. In the ElementTree API `.text` holds only
# the text that precedes an element's first child, which is presumably why the
# result is whitespace-only here. Indexing with [0] fails if nothing matches.
full_root.findall(".//*/ref")[0].text

'\n        '

### Testing `scrape` on a small text sample

In [6]:
# Small hand-written XML snippet containing styling tags (italic, i, b, underline,
# sub, sup) with and without attributes, used to exercise the cleaning helpers.
test_text = ("<paper><article-title>Daniel</article-title>Hello my name is Daniel, my <italic attr='Whatever' color = 'Blue'>favorite</italic> chemical is <i>C</i><sub>4</sub>. "
"<b hello = 'dan' haha = 'whatever'>I</b> also <italic attr ='something'>wanted</italic> to say that <underline>you</underline> should use this code as a<sup>1</sup> test to make sure "
"html tagging removal is going as expected.</paper>")
# Parse with styling left in place so the data dictionary below still sees the
# styling tags and their attributes.
test_tree = scrape.xml_tree_from_string(test_text, strip_text_styling=False)

#test the styling removal
print("Testing the html styling removal function: --------------------------\n")
print(_clean._remove_text_styling(test_text, verbose=True))
print("\n-------------End removal func------------\n")

#test data dictionary generation
print("Generating data dictionary for test text: ------------------\n")
data_dict = trees._generate_data_dictionary(test_tree)
print(data_dict)
# Print a lower-cased copy of the dictionary (tags, attribute names and attribute
# values). This is done structurally rather than via eval(repr(...).lower()):
# evaluating a lower-cased repr executes text as code and breaks as soon as the
# repr contains anything case-sensitive (None/True, escape sequences, ...).
lowered_data_dict = {
    tag.lower(): {
        attr.lower(): [val.lower() for val in vals]
        for attr, vals in attrs.items()
    }
    for tag, attrs in data_dict.items()
}
print(lowered_data_dict)
print("\n-------------End data dictionary generation------------\n")

Testing the html styling removal function: --------------------------

Removing the following tags:
['<italic\\b[^>]*>', '<i\\b[^>]*>', '<bold\\b[^>]*>', '<b\\b[^>]*>', '<underline\\b[^>]*>', '<u\\b[^>]*>', '</italic\\b[^>]*>', '</i\\b[^>]*>', '</bold\\b[^>]*>', '</b\\b[^>]*>', '</underline\\b[^>]*>', '</u\\b[^>]*>', '</sub\\b[^>]*>', '</sup\\b[^>]*>']

Making the following replacements:

<sub> replaced with _

<sup> replaced with ^

<paper><article-title>Daniel</article-title>Hello my name is Daniel, my favorite chemical is C_4. I also wanted to say that you should use this code as a^1 test to make sure html tagging removal is going as expected.</paper>

-------------End removal func------------

Generating data dictionary for test text: ------------------

{'paper': {}, 'article-title': {}, 'italic': {'attr': ['Whatever', 'something'], 'color': ['Blue']}, 'i': {}, 'sub': {}, 'b': {'hello': ['dan'], 'haha': ['whatever']}, 'underline': {}, 'sup': {}}
{'paper': {}, 'article-title': {}, 

In [7]:
# Rebuild the data dictionary for test_tree (the hand-written snippet parsed in an
# earlier cell) and print it with its original casing. Note that this rebinds the
# notebook-wide name `data_dict`, which an earlier cell also assigns.
data_dict = trees._generate_data_dictionary(test_tree)
print(data_dict)

{'paper': {}, 'article-title': {}, 'italic': {'attr': ['Whatever', 'something'], 'color': ['Blue']}, 'i': {}, 'sub': {}, 'b': {'hello': ['dan'], 'haha': ['whatever']}, 'underline': {}, 'sup': {}}


In [8]:
# Variant of the earlier test snippet in which the first <italic> tag carries an
# empty attribute value (attr=''), to see how an empty value is recorded in the
# generated data dictionary.
test_text2 = ("<paper><article-title>Daniel</article-title>Hello my name is Daniel, my <italic attr='' color = 'Blue'>favorite</italic> chemical is <i>C</i><sub>4</sub>. "
"<b hello = 'dan' haha = 'whatever'>I</b> also <italic attr ='something'>wanted</italic> to say that <underline>you</underline> should use this code as a<sup>1</sup> test to make sure "
"html tagging removal is going as expected.</paper>")
# Parse without stripping styling so the styling tags remain in the tree.
test_tree2 = scrape.xml_tree_from_string(test_text2, strip_text_styling=False)

# Last expression of the cell: the resulting dictionary is shown as the cell output.
trees._generate_data_dictionary(test_tree2)

{'paper': {},
 'article-title': {},
 'italic': {'attr': ['', 'something'], 'color': ['Blue']},
 'i': {},
 'sub': {},
 'b': {'hello': ['dan'], 'haha': ['whatever']},
 'underline': {},
 'sup': {}}