In [3]:
import lxml.etree as ET
import os
import requests
import urllib.parse
import requests
from bs4 import BeautifulSoup
""" import spacy
from scispacy.umls_linking import UmlsEntityLinker """

# Load the disease/chemical model
#nlp = spacy.load("en_ner_bc5cdr_md")

# Create the linker
#linker = UmlsEntityLinker(resolve_abbreviations=True, max_entities_per_mention=5)

# Add the linker to the pipeline
#nlp.add_pipe(linker)

' import spacy\nfrom scispacy.umls_linking import UmlsEntityLinker '

In [None]:
# Paths to local XML and DTD files
DTD_FILE = "input/mplus_topics_2012-06-01.dtd"
XML_FILE = "input/mplus_topics_2025-03-04.xml"  
def filter_sites_by_title(site_title):
    """
    Parses the given MedlinePlus health topics XML file,
    and returns all <site> elements (as dicts) that match:
      1) <site title=site_title> (exact attribute match)
      2) Contain <information-category>Patient Handouts</information-category>
         AND <information-category>Encyclopedia</information-category>
      3) <organization>Medical Encyclopedia</organization>

    The results are sorted alphabetically by <site> title.
    """
    tree = ET.parse(XML_FILE)
    root = tree.getroot()  # <health-topics> is the root

    all_matching_sites = []

    # Iterate over all <health-topic> elements
    for health_topic in root.findall("health-topic"):
        # Each <health-topic> may have multiple <site> children
        for site_elem in health_topic.findall("site"):
            # Check if this <site> has the exact title attribute
            current_site_title = site_elem.get("title", "").strip()
            if current_site_title != site_title:
                continue  # Skip if the title doesn't match

            # Gather <information-category> texts
            info_categories = [
                ic.text.strip() for ic in site_elem.findall("information-category") if ic.text
            ]

            # Check for <organization>
            org_elem = site_elem.find("organization")
            org_text = org_elem.text.strip() if org_elem is not None and org_elem.text else ""

            # Condition:
            #   a) Must have 'Patient Handouts' and 'Encyclopedia' among info_categories
            #   b) Must have <organization> == 'Medical Encyclopedia'
            #   c) Already matched the site title above
            if ("Patient Handouts" in info_categories and
                "Encyclopedia" in info_categories and
                org_text == "Medical Encyclopedia"):

                # This site matches the desired criteria
                site_url = site_elem.get("url", "").strip()
                lmapped_url = site_elem.get("language-mapped-url", "").strip()

                all_matching_sites.append({
                    "title": current_site_title,
                    "url": site_url,
                    "language_mapped_url": lmapped_url,
                    "information_categories": info_categories,
                    "organization": org_text
                })

    # Sort results by title (case-insensitive)
    all_matching_sites.sort(key=lambda s: s["title"].lower())

    return all_matching_sites

: 

In [None]:
def download_html(url, title_query, folder="html"):
    """
    Downloads the HTML from a given URL and saves it as a .html file
    in the specified 'folder' (default: 'html').

    Args:
        url (str): The URL to fetch.
        folder (str): The folder where the .html file will be saved.

    Returns:
        str: The full file path of the saved HTML file.
    """
    # Create the folder if it doesn't exist
    os.makedirs(folder, exist_ok=True)

    # URL-encode the entire URL to make a valid filename, then append .html
    #safe_filename = urllib.parse.quote_plus(url) + ".html"
    file_path = os.path.join(folder, title_query + ".html")

    # Send the GET request
    response = requests.get(url)
    response.raise_for_status()  # Raise an error if request failed

    # Write the response text (HTML content) to a file
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(response.text)

    return file_path

: 

: 

In [None]:

def extract_symptoms_bullets(html_text):
    """
    Given an HTML string of a MedlinePlus page (or similar),
    find the section whose <h2> text == 'Symptoms' and
    return all the bullet-point text <li> from that section.

    Returns:
        A list of strings, where each string is the text of a bullet point.
    """

    soup = BeautifulSoup(html_text, "html.parser")
    
    # 1. Find the section that has <h2>Symptoms</h2>
    #    Each <section> typically has structure:
    #    <section><div class="section"><div class="section-header">...
    #    We'll locate the <h2> with text "Symptoms" (case-insensitive match).
    
    # Approach: Look for an <h2> whose .text == "Symptoms" (stripping/case-insensitive).
    target_section = None
    all_h2 = soup.find_all("h2")
    for h2_tag in all_h2:
        if h2_tag.get_text(strip=True).lower() == "symptoms":
            # The <h2> is inside <div class="section-title">, which is inside <div class="section">
            # We'll go up to the nearest <section> or parent that encloses this heading
            # In the snippet, it's two levels up <section><div class="section"><div class="section-title"><h2>Symptoms</h2></div>
            # but let's be flexible in case of slight structure changes.
            # We'll climb up to the <section> ancestor:
            parent_section = h2_tag.find_parent("section")
            if parent_section:
                target_section = parent_section
                break

    if not target_section:
        return []  # No "Symptoms" section found
    
    # 2. Within that section, find all <li> elements
    #    We'll gather text from each bullet.
    bullet_points = []
    for li_tag in target_section.find_all("li"):
        text = li_tag.get_text(" ", strip=True)
        bullet_points.append(text)
    
    return bullet_points

: 

: 

In [None]:
title_query = "Diabetes"
matching_sites = filter_sites_by_title(title_query)

# Print the results
if matching_sites:
    print(f"Found {len(matching_sites)} matching site(s) with title='{title_query}':\n")
    for site in matching_sites:
        download_html(site["url"],title_query)
        print(f"- Title: {site['title']}")
        print(f"  URL: {site['url']}")
        print(f"  Language-Mapped URL: {site['language_mapped_url']}")
        print(f"  Categories: {site['information_categories']}")
        print(f"  Organization: {site['organization']}")
        print("--------------------------------------------------")
else:
    print(f"No <site> elements found with title='{title_query}' matching the criteria.")

Found 1 matching site(s) with title='Diabetes':

- Title: Diabetes
  URL: https://medlineplus.gov/ency/article/001214.htm
  Language-Mapped URL: https://medlineplus.gov/spanish/ency/article/001214.htm
  Categories: ['Patient Handouts', 'Encyclopedia']
  Organization: Medical Encyclopedia
--------------------------------------------------


: 

: 

: 

: 

In [None]:
import glob

# Iterate through all HTML files in the 'html' folder
for html_file in glob.glob("html/*.html"):
    with open(html_file, "r", encoding="utf-8") as file:
        html_content = file.read()
    # Call our function
    symptoms_list = extract_symptoms_bullets(html_content)

    print(f"Extracted bullet points from the 'Symptoms' section in {html_file}:")
    for bp in symptoms_list:
        print("-", bp)
    print("--------------------------------------------------")


Extracted bullet points from the 'Symptoms' section in html/Diabetes.html:
- Blurry vision
- Excess thirst
- Fatigue
- Frequent urination
- Hunger
- Weight loss
- Eye problems , including trouble seeing (especially at night), light sensitivity, cataracts, and blindness
- Sores and infections of the leg or foot, which if untreated, can lead to amputation of the leg or foot
- Damage to nerves in the body , causing pain, tingling, a loss of feeling, problems digesting food, and erectile dysfunction
- Kidney problems , which can lead to kidney failure
- Weakened immune system, which can lead to more frequent infections
- Increased chance of having a heart attack or stroke
--------------------------------------------------


: 

: 

: 

: 