In [0]:
%run ./Helpers/ingestion_utils

In [0]:
import requests
import io
import zipfile
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession

In [0]:
# Make catalog `workspace` and schema `med` current for the SQL that follows.
# NOTE(review): `spark` is used here but is only assigned in a later cell of this
# notebook, so this cell relies on the runtime having already defined it — confirm.
spark.sql("USE CATALOG workspace")
spark.sql("USE SCHEMA med")

In [0]:
# Bind `spark` to the active SparkSession, creating one if none exists yet.
spark = SparkSession.builder.getOrCreate() # create a spark session
# `utcnow()` yields a naive datetime (no tzinfo) holding the current UTC time.
# NOTE(review): `datetime` is not imported in this notebook's import cell; presumably it
# is brought in by the %run of ./Helpers/ingestion_utils — confirm.
# `current_time` is not read by any visible cell; presumably the ingestion helpers
# use it — confirm before removing.
current_time = datetime.utcnow() # get the current time
# NOTE(review): `ensure_raw_table` is not defined in the visible cells (presumably it comes
# from ./Helpers/ingestion_utils); judging by its name it likely creates the raw table
# when it is missing — confirm.
ensure_raw_table()

In [0]:
# scrape medlineplus and get the url of most recent 'MedlinePlus Compressed Health Topic XML' file
def get_latest_medlineplus_zip_url(index_url: str = "https://medlineplus.gov/xml.html") -> str:
    """Return the absolute URL of the compressed health-topic archive linked from the index page.

    The index page is downloaded and scanned in document order; the first anchor whose
    visible text contains "MedlinePlus Compressed Health Topic XML" wins. This assumes
    the page lists the newest file set first — verify if the page layout changes.

    Args:
        index_url: Page that lists the downloadable MedlinePlus XML files. It is also
            the base against which relative links are resolved.

    Returns:
        Absolute URL of the matching archive.

    Raises:
        requests.HTTPError: If the index page responds with an error status.
        RuntimeError: If no matching link is present on the page.
    """
    # Local import keeps this cell self-contained; the notebook's import cell does not
    # bring in urllib.parse.
    from urllib.parse import urljoin

    resp = requests.get(index_url, timeout=30)  # fetch the page that lists the latest files
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    for a in soup.find_all("a", href=True):
        if "MedlinePlus Compressed Health Topic XML" in a.get_text(strip=True):
            # urljoin leaves absolute hrefs untouched and correctly resolves relative,
            # root-relative, dot-segment ("./", "../") and protocol-relative ("//host/…")
            # hrefs, which plain string concatenation onto the host does not.
            return urljoin(index_url, a["href"].strip())
    raise RuntimeError("Could not find MedlinePlus Compressed Health Topic XML link on xml.html")

In [0]:
# Resolve the archive URL from the index page, then download and unpack it in memory.
MEDLINEPLUS_ZIP_URL = get_latest_medlineplus_zip_url()
print(f"Using MedlinePlus compressed XML from: {MEDLINEPLUS_ZIP_URL}")

# Without a timeout, requests waits indefinitely on a stalled connection. The value
# bounds connect/read inactivity, not the total download time.
response = requests.get(MEDLINEPLUS_ZIP_URL, timeout=60) # get the zip file
response.raise_for_status()

zip_bytes = io.BytesIO(response.content) # convert the response to byte object
zip_file = zipfile.ZipFile(zip_bytes) # create a zip file object

# Pick the first XML member; fail with a descriptive error instead of a bare IndexError
# when the archive unexpectedly contains none.
xml_names = [name for name in zip_file.namelist() if name.lower().endswith(".xml")]
if not xml_names:
    raise RuntimeError(
        f"No .xml member found in MedlinePlus archive {MEDLINEPLUS_ZIP_URL}: {zip_file.namelist()}"
    )
xml_name = xml_names[0] # get the xml file name
xml_data = zip_file.read(xml_name) # read the xml file

In [0]:
# Parse the downloaded XML and turn every English <health-topic> element into one
# record dict for the raw table.
root = ET.fromstring(xml_data) # parse xml to element tree
records = []
skipped_missing_id = 0  # English topics dropped because they carry no usable id

# loop through each health topic node
for topic in root.findall(".//health-topic"):
    # A missing language attribute is treated as English.
    language = topic.get("language", "English")
    if language.lower() != "english":
        continue

    # Without an id every such topic would share the key "medline_None", so skip it.
    topic_id = topic.get("id")
    if not topic_id:
        skipped_missing_id += 1
        continue

    # extract synonyms of the title; empty or whitespace-only <also-called> elements are
    # dropped so they do not produce blank entries in the joined string
    synonyms_list = [
        text
        for text in ((element.text or "").strip() for element in topic.findall("./also-called"))
        if text
    ]
    synonyms = "; ".join(synonyms_list) if synonyms_list else None

    # extract full summary section
    # NOTE(review): ET.tostring yields the serialised element, including the
    # <full-summary> wrapper tag, rather than its decoded text — confirm that this is
    # the form downstream consumers of raw_text expect.
    full_text_tag = topic.find("./full-summary")
    raw_html = ET.tostring(full_text_tag, encoding="unicode") if full_text_tag is not None else None

    category = "condition" # placeholder

    # build the record dictionary
    records.append({
        "doc_id": f"medline_{topic_id}",
        "category": category,
        "title": topic.get("title"),
        "synonyms": synonyms,
        "url": topic.get("url"),
        "raw_text": raw_html,
        "meta_json": None,
    })

if skipped_missing_id:
    print(f"Skipped {skipped_missing_id} English health-topic element(s) without an id attribute")

len(records)

In [0]:
# Hand the parsed records to the ingestion helper, tagged with source "medlineplus".
# NOTE(review): `load_records_to_raw_data` is not defined in the visible cells (presumably it
# comes from ./Helpers/ingestion_utils); what `preview=True` does is not visible here — confirm.
load_records_to_raw_data(records, source_value="medlineplus", preview=True)