✅ Transformation complete. Result saved to 'pharr2025_transformed.xml'.


# Write out selections of Pharr

In [20]:
from lxml import etree
from IPython.display import display, HTML

# Define the TEI namespace to make XPath queries work correctly.
TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}

In [21]:
def render_element_to_html(element):
    """
    Recursively render a TEI element and its descendants as an HTML string.

    The element's text, each child's rendered HTML and each child's tail text
    are concatenated in document order, an ``n`` attribute (if any) is
    prepended as a visible number, and the result is wrapped in the HTML tag
    that corresponds to the TEI tag name.

    Args:
        element: An lxml element, or None.

    Returns:
        The HTML fragment as a string. None, XML comments and processing
        instructions render as the empty string. Tags with no explicit
        mapping (including ``div``, ``note`` and ``pb``) render as their bare
        content without a wrapper.
    """
    # Local import keeps this cell self-contained; the name must not clash
    # with IPython's HTML class imported at notebook level.
    from html import escape

    if element is None:
        return ""

    # Comments, processing instructions and entities have a callable (non-str)
    # tag; etree.QName() would raise on them, so render them as nothing.
    if not isinstance(element.tag, str):
        return ""

    tag_name = etree.QName(element).localname

    # Glossary lists build their own <dt>/<dd> pairs from the items and never
    # use the generic content, so handle them before rendering the children.
    if tag_name == 'list' and element.get('type', 'simple') == 'gloss':
        entries = []
        for item in element.xpath('tei:item', namespaces=TEI_NS):
            term = item.find('tei:term', TEI_NS)
            if term is None:
                continue
            gloss = item.find('tei:gloss', TEI_NS)
            definition = render_element_to_html(gloss) if gloss is not None else ""
            # Text that directly follows <term> belongs to the definition.
            if term.tail:
                definition = escape(term.tail.strip(), quote=False) + " " + definition
            entries.append(f"<dt>{render_element_to_html(term)}</dt><dd>{definition}</dd>")
        dl_items = "".join(entries)
        return f'<dl class="glossary">{dl_items}</dl>'

    # --- Process Content (Element's Text + Children's Rendered HTML) ---
    # lxml hands back decoded text, so '&' and '<' must be re-escaped before
    # they are placed into HTML.
    parts = []
    if element.text:
        parts.append(escape(element.text, quote=False))
    for child in element:
        parts.append(render_element_to_html(child))  # Recursive call
        if child.tail:
            parts.append(escape(child.tail, quote=False))
    content = "".join(parts)

    # --- Prepend the 'n' attribute if it exists ---
    n_attr = element.get('n')
    if n_attr:
        number = escape(n_attr, quote=False)
        # For poetry lines, format it like a line number. For others, like an exercise number.
        if tag_name == 'l':
            content = f'<span class="line-number">{number}</span>{content}'
        else:
            content = f'<b class="item-number">{number}.</b> {content}'

    # --- Wrap the processed content in the appropriate HTML tag ---

    # Block Elements
    if tag_name == 'p':
        return f'<p>{content}</p>'
    if tag_name == 'head':
        return f'<h4>{content}</h4>'

    # Table Elements
    if tag_name == 'table':
        return f'<table class="data-table">{content}</table>'
    if tag_name == 'tbody':
        return f'<tbody>{content}</tbody>'
    if tag_name in ['row', 'tr']:
        return f'<tr>{content}</tr>'
    if tag_name in ['cell', 'td']:
        colspan = element.get('colspan')
        attr_text = f' colspan="{escape(colspan)}"' if colspan else ''
        # A cell nested inside a TEI <head> is rendered as a header cell.
        is_in_head = element.xpath('ancestor::tei:head', namespaces=TEI_NS)
        cell_tag = 'th' if is_in_head else 'td'
        return f'<{cell_tag}{attr_text}>{content}</{cell_tag}>'

    # List Elements (gloss lists were handled above)
    if tag_name == 'list':
        return f'<ul class="simple-list">{content}</ul>'
    if tag_name == 'item':
        return f'<li>{content}</li>'

    # Poetry Elements
    if tag_name == 'lg':
        return f'<div class="poetry-group">{content}</div>'
    if tag_name == 'l':
        return f'<span class="poetry-line">{content}</span>'

    # Inline Elements
    if tag_name == 'foreign':
        css_class = "greek" if element.get('{http://www.w3.org/1999/xml}lang') == 'grc' else ""
        return f'<i class="{css_class}">{content}</i>'
    if tag_name == 'ref':
        return f'<a href="#" class="reference">{content}</a>'
    if tag_name == 'term':
        return f'<b>{content}</b>'
    if tag_name == 'gloss':
        return f'<i>{content}</i>'
    if tag_name == 'lb':
        return '<br/>'

    # Structural tags (div, note, pb) and any unmapped tag: emit the content
    # without a wrapper.
    return content

def get_paragraphs_html(root, paragraph_range_str):
    """
    Render the requested sections of the document as a list of HTML fragments.

    The request is either a numeric range ("8-10"), a single number ("501"),
    or a literal xml:id ("p.i"). Numbers are mapped to ids of the form
    ``p<number>``; anything that does not parse as numbers is used as a
    literal id. Surrounding whitespace in the request is ignored.

    Args:
        root: Root element of the parsed TEI document.
        paragraph_range_str: The range, number or literal id to look up.

    Returns:
        A list of HTML strings. For every section found it contains an
        ``<h2>`` heading (with the enclosing lesson number when the section
        sits inside a ``div[@type="lesson"]``), the rendered children of the
        section, and an ``<hr>``. If nothing is found, a single-element list
        holding an error paragraph.
    """
    # Local import: avoids any clash with IPython's HTML class.
    from html import escape

    requested = paragraph_range_str.strip()

    try:
        if '-' in requested:
            # Ids that merely contain a hyphen fail the unpack or the int()
            # conversion with ValueError and fall through to the literal case.
            start_str, end_str = requested.split('-')
            target_ids = [f'p{i}' for i in range(int(start_str), int(end_str) + 1)]
        else:
            target_ids = [f'p{int(requested)}']
    except ValueError:
        target_ids = [requested]

    html_outputs = []
    for target_id in target_ids:
        # Pass the id as an XPath variable so quotes in it cannot break the
        # expression.
        matches = root.xpath(
            '//tei:div[@xml:id=$target]', namespaces=TEI_NS, target=target_id
        )
        if not matches:
            continue
        div = matches[0]

        # Find the enclosing lesson, if any, to build the heading.
        lessons = div.xpath('ancestor::tei:div[@type="lesson"]', namespaces=TEI_NS)
        lesson_prefix = f"Lesson {lessons[0].get('n', '')}, " if lessons else ""

        heading_text = f"{lesson_prefix}Section {div.get('n', target_id)}"
        html_outputs.append(f'<h2>{escape(heading_text, quote=False)}</h2>')
        html_outputs.extend(render_element_to_html(child) for child in div)
        html_outputs.append('<hr>')

    # Every section found contributes at least its heading, so an empty list
    # means nothing matched.
    if not html_outputs:
        shown = escape(paragraph_range_str, quote=False)
        return [f"<p><strong>Error:</strong> No sections found for: {shown}.</p>"]

    return html_outputs

In [48]:
def generate_and_display_html(xml_file_path, paragraph_range):
    """
    Parse a TEI file, render the requested sections and display them inline.

    Args:
        xml_file_path: Path of the XML file to parse.
        paragraph_range: Range, number or literal id, passed unchanged to
            get_paragraphs_html.

    Returns:
        None. The result is shown through IPython's display(); if the file
        cannot be parsed, an error paragraph is displayed instead.
    """
    # Local import: the bare name "html" is kept away from IPython's HTML class.
    from html import escape

    try:
        tree = etree.parse(xml_file_path)
        root = tree.getroot()
    except Exception as e:
        # Parser messages may contain '<' or '&', so escape them (and the
        # path) before embedding them in markup.
        shown_path = escape(str(xml_file_path), quote=False)
        shown_error = escape(str(e), quote=False)
        display(HTML(f"<p><strong>Error:</strong> Could not process file '{shown_path}'.<br>{shown_error}</p>"))
        return

    paragraph_html_list = get_paragraphs_html(root, paragraph_range)

    # Doubled braces are literal braces for str.format(); only {range_str}
    # and {content} are substituted.
    html_template = """
    <div style="font-family: Georgia, serif; line-height: 1.6;">
        <style>
            .greek {{ font-style: italic; color: #333; }}
            .reference {{ font-weight: bold; color: #0056b3; text-decoration: none; }}
            h2, h4 {{ border-bottom: 1px solid #ccc; padding-bottom: 5px; margin-top: 1.5em;}}
            hr {{ border: none; border-top: 1px solid #eee; margin: 2rem 0; }}
            .glossary dt {{ font-weight: bold; font-family: monospace, serif; float: left; width: 150px; clear: left; text-align: right; padding-right: 1em; }}
            .glossary dd {{ margin-left: 160px; padding-bottom: 0.7em; }}
            .data-table {{ border-collapse: collapse; width: 100%; margin: 1em 0; }}
            .data-table th, .data-table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; vertical-align: top; }}
            .data-table th {{ background-color: #f2f2f2; font-weight: bold; }}
            .simple-list {{ list-style-type: none; padding-left: 1em; }}
            .poetry-group {{ margin-bottom: 1em; }}
            .poetry-line {{ display: block; padding-left: 2em; text-indent: -2em; }}
            .item-number {{ margin-right: 0.5em; }}
            .line-number {{ font-size: 0.8em; color: #666; margin-right: 1em; }}
        </style>
        <h1>Homeric Greek Excerpt: Section(s) {range_str}</h1>
        <hr>
        {content}
    </div>
    """

    final_html = html_template.format(
        range_str=escape(str(paragraph_range), quote=False),
        content="\n".join(paragraph_html_list)
    )

    display(HTML(final_html))

In [52]:
# Render sections p179 through p200. The trailing space in the range string is
# harmless: the bounds are converted with int(), which ignores surrounding whitespace.
generate_and_display_html(xml_file, "179-200 ")

In [9]:
# TEI source read by every example in this cell
xml_file = 'pharr2025.xml'

# Each entry pairs the banner printed before the output with the range to
# render: two multi-section ranges, then a single section.
_examples = (
    ("--- Displaying paragraphs 8-10 ---", "8-10"),
    ("\n--- Displaying paragraphs 659-662 ---", "659-662"),
    ("\n--- Displaying paragraph 501 ---", "501"),
)

for _banner, _section_range in _examples:
    print(_banner)
    generate_and_display_html(xml_file, _section_range)

--- Displaying paragraphs 8-10 ---



--- Displaying paragraphs 659-662 ---



--- Displaying paragraph 501 ---


0,1,2,3
Α α,a as in father (when short as in aha),ἄλφα,alpha
Β β,b as in bite,βῆτα,beta
Γ γ,g as in get (never soft as in oblige),γάμμα,gamma
Δ δ,d as in deal,δέλτα,delta
Ε ε,e as in red,"εἶ, ἔ (ἐ ψιλόν)",epsilon
F ϝ1,w as in wine,ϝαῦ1 (δίγαμμα),vau (digamma)
Ζ ζ,zd as in Ahura Mazda,ζῆτα,zeta
Η η,ê as in fête (open e),ἦτα,eta
Θ θ,th as in thick (originally t+h),θῆτα,theta
Ι ι,i as in machine (when short as i in hit),ἰῶτα,iota


In [161]:
# Render a single section: the number 969 is looked up as xml:id "p969".
generate_and_display_html(xml_file, "969")

0,1,2,3,4
,PRESENT,IMPERF.,IMPER.,INFIN.
S. 1,ἧμαι,ἥμην,,ἧσθαι
2,ἧσαι,ἧσο,ἧσο,
3,ἧσται,ἧστο,ἥσθω,
Du. 2,ἧσθον,ἧσθον,ἧσθον,PART.
3,ἧσθον,ἥσθην,ἥσθων,"ἥμενος, η, ον"
PL. 1,ἥμε(σ)θα,ἥμε(σ)θα,,
2,ἧσθε,ἧσθε,ἧσθε,
3,"ἧνται (ἥαται, ἔαται)","ἧντο (ἥατο, ἕατο)",ἥσθων,


In [None]:
from lxml import etree
import re
from copy import deepcopy

def transform_xml_structure(file_path: str) -> str:
    """
    Parse a TEI XML file and turn numbered list items into nested paragraphs.

    Every <div type="paragraph"> that contains a <p> immediately followed by
    a <list> is processed. In the first such list of each div, every <item>
    whose text starts with "N)" becomes a nested
    <div type="paragraph" n="N"> holding one <p> with the item's content;
    the nested div's xml:id is "<parent id>-N" when the parent has an id.
    The nested divs take the list's place, and every other child of the
    original div, as well as the div's own attributes and tail, is left as it
    was.

    A list is left unchanged (and a note is printed) if it is empty or if any
    of its children is not an <item> starting with "N)", so that no content
    is silently discarded.

    Args:
        file_path: The path to the input XML file.

    Returns:
        The transformed, pretty-printed XML with an XML declaration, or an
        empty string (after printing a message) when no div matches.
    """
    # Define namespaces used in the XML document
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    XML_NS = "http://www.w3.org/XML/1998/namespace"
    XML_ID = f"{{{XML_NS}}}id"
    div_tag = f"{{{ns['tei']}}}div"
    p_tag = f"{{{ns['tei']}}}p"
    item_tag = f"{{{ns['tei']}}}item"

    # DOTALL lets the content group span line breaks inside an item.
    numbered_item = re.compile(r'(\d+)\)\s*(.*)', re.DOTALL)

    # Use a parser that preserves the document structure
    parser = etree.XMLParser(remove_blank_text=False)
    tree = etree.parse(file_path, parser)
    root = tree.getroot()

    # XPath expression to find the target <div> elements
    xpath_expr = ".//tei:div[@type='paragraph' and tei:p[following-sibling::*[1][self::tei:list]]]"
    divs_to_transform = root.xpath(xpath_expr, namespaces=ns)

    if not divs_to_transform:
        print("No matching elements found for transformation.")
        return ""

    for target_div in divs_to_transform:
        # Pick the list that actually follows a <p>, i.e. the pair the XPath
        # above matched, rather than whichever <list> happens to come first.
        adjacent_lists = target_div.xpath(
            "tei:p/following-sibling::*[1][self::tei:list]", namespaces=ns
        )
        if not adjacent_lists:
            continue
        list_element = adjacent_lists[0]
        parent_id = target_div.get(XML_ID)

        # Split every item into (item, number, content) before touching the tree.
        parsed_items = []
        for item in list_element:
            match = None
            if item.tag == item_tag:
                match = numbered_item.match((item.text or "").lstrip())
            if match is None:
                parsed_items = None
                break
            parsed_items.append((item, match.group(1), match.group(2)))

        if not parsed_items:
            print(
                f"Left list in div {parent_id!r} unchanged: "
                "it is empty or not every entry is an item starting with 'N)'."
            )
            continue

        # Build one nested <div type="paragraph"> per item.
        nested_divs = []
        for item, num, text_content in parsed_items:
            nested_div = etree.Element(
                div_tag, nsmap=target_div.nsmap, type="paragraph", n=num
            )
            if parent_id:
                nested_div.set(XML_ID, f"{parent_id}-{num}")

            p_in_nested = etree.SubElement(nested_div, p_tag)
            # Trailing whitespace separates the text from a following inline
            # child (like <ref>), so only trim it when the item has no children.
            p_in_nested.text = text_content if len(item) else text_content.rstrip()

            # If the item had child elements (like <ref>), copy them over;
            # deepcopy keeps each child's tail text.
            for child in item:
                p_in_nested.append(deepcopy(child))

            # Reuse the list's trailing whitespace so the layout around the
            # new divs stays as it was.
            nested_div.tail = list_element.tail
            nested_divs.append(nested_div)

        # Put the nested divs where the list was, then drop the list.
        insert_at = target_div.index(list_element)
        for offset, nested_div in enumerate(nested_divs):
            target_div.insert(insert_at + offset, nested_div)
        target_div.remove(list_element)

    # Return the transformed XML as a formatted string
    return etree.tostring(root, pretty_print=True, encoding='UTF-8', xml_declaration=True).decode('utf-8')

# --- Main execution block for the notebook ---
# Reads input_file, runs transform_xml_structure on it and, when the function
# returns non-empty XML, writes the result to output_file.
import os

input_file = 'pharr2025.xml'
output_file = 'pharr2025_transformed.xml'

try:
    # lxml reports an unreadable input as a plain OSError, not as
    # FileNotFoundError, so check for the file explicitly to get the
    # dedicated message below.
    if not os.path.isfile(input_file):
        raise FileNotFoundError(input_file)

    transformed_xml_content = transform_xml_structure(input_file)

    # An empty string means nothing matched; leave any existing output alone.
    if transformed_xml_content:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(transformed_xml_content)
        print(f"✅ Transformation complete. Result saved to '{output_file}'.")

except FileNotFoundError:
    print(f"❌ Error: The file '{input_file}' was not found in the same directory as the notebook.")
except Exception as e:
    # Notebook-level boundary: report the problem instead of a traceback.
    print(f"An error occurred: {e}")