In [2]:
import os
import json
import xmltodict
from lxml import etree
from tqdm import tqdm

In [3]:
def process_element(elem, file_index, output_dir='saved_json'):
    """Convert one XML element to JSON and write it to its own file.

    Args:
        elem: lxml element to convert (the driver loop passes elements
            yielded by ``etree.iterparse``).
        file_index: Number used to build the output name
            ``article_<file_index>.json``.
        output_dir: Directory the JSON file is written to. It must already
            exist; this function does not create it.

    Raises:
        OSError: If the output file cannot be opened for writing.
        Exception: Whatever ``xmltodict.parse`` raises if the serialized
            element is not accepted as XML.
    """
    # Serialize only the element itself. The tail (text that follows the
    # closing tag) belongs to the parent's content, and any non-whitespace
    # tail would sit after the root element of the string given to xmltodict.
    xml_text = etree.tostring(elem, encoding='unicode', with_tail=False)

    # Convert the XML text to a dictionary
    dict_element = xmltodict.parse(xml_text)

    # Stream the JSON straight into the file; the encoding is pinned so the
    # result does not depend on the platform default.
    out_path = os.path.join(output_dir, f'article_{file_index}.json')
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(dict_element, f, indent=4)


def _release_element(elem):
    """Free a handled element and drop the siblings that precede it."""
    # Clear the element to free up memory
    elem.clear()
    # Also eliminate now-empty references from the parent node to the
    # siblings that came before this element
    while elem.getprevious() is not None:
        del elem.getparent()[0]


def _count_articles(xml_path, tag):
    """Count ``tag`` elements in ``xml_path`` in one streaming pass.

    Elements are released as soon as they are counted so the pass does not
    keep the whole document tree in memory. The parser options match the
    processing pass (``recover=True``) so both passes see the same elements.
    """
    count = 0
    for _event, elem in etree.iterparse(xml_path, events=('end',), tag=tag, recover=True):
        count += 1
        _release_element(elem)
    return count


# Ensure the saved_json directory exists
os.makedirs('saved_json', exist_ok=True)

# Define the path for the XML file
path = 'homeostasis.xml'

# Get total number of PubmedArticle elements to initialize tqdm
# (optional extra pass over the file; remove if the performance hit matters)
try:
    total_articles = _count_articles(path, 'PubmedArticle')
except Exception as exc:
    # Best effort only: without a count the progress bar simply shows no
    # total. Report the reason instead of discarding it silently.
    print(f"Could not count articles ({exc!r}); progress bar will not show a total")
    total_articles = None

# Create the iterator used for the actual processing
context = etree.iterparse(path, events=('end',), tag='PubmedArticle', recover=True)
# Using tqdm to add a progress bar
progress_bar = tqdm(context, total=total_articles, desc="Processing articles")

file_index = 1
for event, elem in progress_bar:
    process_element(elem, file_index)
    file_index += 1
    # Release the element only after it has been written out
    _release_element(elem)

Processing articles: 385694it [12:16, 523.76it/s] 
