In [0]:
#print("hello world")

In [0]:
import io
import xml.etree.ElementTree as ET
import zipfile
from datetime import datetime, timezone

import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

In [0]:
# Session and run-level configuration.
spark = SparkSession.builder.getOrCreate()  # reuse the active session or start one

# Timezone-aware UTC timestamp stamped on every record of this run.
# datetime.utcnow() is deprecated and returns a *naive* value, which PySpark
# converts using the driver's local time zone -- on a non-UTC driver the stored
# ingested_at would be shifted by the UTC offset.
current_time = datetime.now(timezone.utc)

MEDLINEPLUS_ZIP_URL = "https://medlineplus.gov/xml/mplus_topics_compressed_2025-12-06.zip"
# (connect, read) timeouts in seconds. requests applies no timeout by default,
# so without this a stalled server would hang the cell indefinitely.
REQUEST_TIMEOUT_SECONDS = (10, 120)

# Download the compressed topics archive; raise on any HTTP error status.
response = requests.get(MEDLINEPLUS_ZIP_URL, timeout=REQUEST_TIMEOUT_SECONDS)
response.raise_for_status()

zip_bytes = io.BytesIO(response.content)  # in-memory file object over the downloaded bytes

# Read the first .xml member of the archive; the context manager closes the
# ZipFile once the payload has been extracted.
with zipfile.ZipFile(zip_bytes) as zip_file:
    xml_names = [name for name in zip_file.namelist() if name.endswith(".xml")]
    if not xml_names:
        raise ValueError(
            f"No .xml member found in archive downloaded from {MEDLINEPLUS_ZIP_URL}"
        )
    xml_name = xml_names[0]
    xml_data = zip_file.read(xml_name)  # raw XML bytes

In [0]:
root = ET.fromstring(xml_data)  # parse the XML payload into an element tree
records = []
skipped_without_id = 0  # topics dropped because they carry no usable id

# Build one record per English-language <health-topic> element.
for topic in root.findall(".//health-topic"):
    # Filter first: topics without a language attribute are treated as English.
    language = topic.get("language", "English")
    if language.lower() != "english":
        continue

    # doc_id is the non-nullable key column; a missing id would otherwise
    # produce the meaningless (and potentially duplicated) "medline_None".
    topic_id = topic.get("id")
    if not topic_id:
        skipped_without_id += 1
        continue

    title = topic.get("title")
    url = topic.get("url")

    # Alternative names for the topic. Strip before filtering so that
    # whitespace-only <also-called> elements do not yield empty entries.
    also_called_elements = topic.findall("./also-called")
    synonyms_list = [
        text
        for text in ((element.text or "").strip() for element in also_called_elements)
        if text
    ]
    synonyms = "; ".join(synonyms_list) if synonyms_list else None

    # Serialize the <full-summary> element as a string (None when absent).
    # NOTE(review): ET.tostring emits the element itself -- its own tags, with
    # text XML-escaped and any tail text appended -- not just the inner
    # content. Confirm downstream consumers expect that form.
    full_text_tag = topic.find("./full-summary")
    raw_html = ET.tostring(full_text_tag, encoding="unicode") if full_text_tag is not None else None

    category = "condition"  # placeholder

    # Keys mirror the column names of the target table schema.
    record = {
        "doc_id": f"medline_{topic_id}",
        "category": category,
        "source": "medlineplus",
        "title": title,
        "synonyms": synonyms,
        "url": url,
        "raw_text": raw_html,
        "meta_json": None,
        "ingested_at": current_time,
    }

    records.append(record)

if skipped_without_id:
    print(f"Skipped {skipped_without_id} health-topic element(s) without an id attribute")

len(records)

In [0]:
# Explicit column layout for the raw documents table. The third argument of
# each add() call is the nullable flag: doc_id, category, source and
# ingested_at are required, the remaining columns may be null.
schema = (
    StructType()
    .add("doc_id", StringType(), False)
    .add("category", StringType(), False)
    .add("source", StringType(), False)
    .add("title", StringType(), True)
    .add("synonyms", StringType(), True)
    .add("url", StringType(), True)
    .add("raw_text", StringType(), True)
    .add("meta_json", StringType(), True)
    .add("ingested_at", TimestampType(), False)
)

# Materialize the parsed record dicts as a DataFrame using that schema.
df = spark.createDataFrame(records, schema=schema)

In [0]:
# Preview the first five rows to sanity-check the parsed columns.
preview_df = df.limit(5)
display(preview_df)

In [0]:
# Persist the DataFrame as a Delta table, replacing any existing contents
# of med.raw_docs_medlineplus.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("med.raw_docs_medlineplus")
)