In [27]:
# It's best practice to define necessary imports and constants at the beginning.
import html

from IPython.display import display, HTML
from lxml import etree

# Prefix-to-URI map for the TEI namespace. It is defined at module level so
# every function can pass it as `namespaces=` to its XPath queries.
TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}

def roman_to_int(s):
    """Convert a Roman numeral string to its integer value.

    The input is upper-cased and stripped of surrounding whitespace first.
    The numeral is read left to right; a symbol that is smaller than the
    symbol right after it is consumed together with that symbol as one
    subtractive pair (IV, IX, XL, ...).

    Returns -1 if the string contains any character that is not a Roman
    numeral symbol, and 0 for an empty string.
    """
    symbol_values = {'M': 1000, 'D': 500, 'C': 100, 'L': 50, 'X': 10, 'V': 5, 'I': 1}
    numeral = s.upper().strip()

    # Every character gets inspected sooner or later, so a single invalid
    # symbol anywhere makes the whole numeral invalid.
    if any(symbol not in symbol_values for symbol in numeral):
        return -1

    total = 0
    position = 0
    last = len(numeral) - 1
    while position <= last:
        value = symbol_values[numeral[position]]
        # Past the end there is nothing to subtract from, hence 0.
        following = symbol_values[numeral[position + 1]] if position < last else 0
        if value < following:
            total += following - value
            position += 2
        else:
            total += value
            position += 1
    return total

# Clark-notation name of the xml:lang attribute as lxml exposes it.
_XML_LANG_ATTR = '{http://www.w3.org/XML/1998/namespace}lang'

# Tags rendered by wrapping their inner HTML in fixed opening/closing markup.
_SIMPLE_WRAPPERS = {
    # Block elements
    'p': ('<p>', '</p>'),
    'head': ('<h4>', '</h4>'),
    # Table elements
    'table': ('<table class="data-table">', '</table>'),
    'tbody': ('<tbody>', '</tbody>'),
    'row': ('<tr>', '</tr>'),
    'tr': ('<tr>', '</tr>'),
    # List items, citations and notes
    'item': ('<li>', '</li>'),
    'bibl': ('<i>', '</i>'),
    'note': (' [note ', ' ]'),
    # Poetry elements
    'lg': ('<div class="poetry-group">', '</div>'),
    'l': ('<span class="poetry-line">', '</span>'),
    # Inline elements
    'term': ('<b>', '</b>'),
    'gloss': ('<i>', '</i>'),
}


def _render_inner_html(element, tag_name):
    """Return the escaped text and rendered children of *element*, prefixed by its 'n' label."""
    parts = []
    if element.text:
        parts.append(html.escape(element.text, quote=False))
    for child in element:
        parts.append(render_element_to_html(child))  # Recursive call
        # Tail text belongs to this element's flow, so it is kept even when
        # the child itself renders to nothing (e.g. an XML comment).
        if child.tail:
            parts.append(html.escape(child.tail, quote=False))
    content = "".join(parts)

    n_attr = element.get('n')
    if n_attr:
        label = html.escape(n_attr, quote=False)
        if tag_name == 'l':
            content = f'<span class="line-number">{label}</span>{content}'
        else:
            content = f'<b class="item-number">{label}.</b> {content}'
    return content


def _render_gloss_list(element):
    """Render a gloss list as a <dl>, pairing each <label> with the <item> that follows it."""
    # Only real elements take part in the pairing; comments and processing
    # instructions between a label and its item are ignored.
    children = [child for child in element if isinstance(child.tag, str)]
    pieces = []
    index = 0
    while index < len(children):
        current = children[index]
        following = children[index + 1] if index + 1 < len(children) else None
        is_pair = (
            following is not None
            and etree.QName(current).localname == 'label'
            and etree.QName(following).localname == 'item'
        )
        if is_pair:
            term_html = render_element_to_html(current)
            # Use the item's inner HTML here: its usual <li> wrapper would end
            # up directly inside <dd>, which is not valid HTML.
            definition_html = _render_inner_html(following, 'item')
            pieces.append(f"<dt>{term_html}</dt><dd>{definition_html}</dd>")
            index += 2
        else:
            pieces.append(render_element_to_html(current))
            index += 1
    return '<dl class="glossary">' + "".join(pieces) + '</dl>'


def render_element_to_html(element):
    """
    Recursively render a TEI element and its descendants into an HTML string.

    Text and attribute values taken from the XML are HTML-escaped. An 'n'
    attribute is shown in front of the element's content (as a line number
    for <l>, as a bold item number otherwise); <pb> shows it as a page marker
    instead, and gloss lists do not show it.

    Args:
        element: An lxml element, or None.

    Returns:
        The HTML for the element. None, XML comments and processing
        instructions render as an empty string. Elements without a dedicated
        handler contribute only their rendered content.
    """
    if element is None:
        return ""
    # lxml yields comments and processing instructions as children too; their
    # .tag is not a string, so etree.QName() would raise ValueError on them.
    if not isinstance(element.tag, str):
        return ""

    tag_name = etree.QName(element).localname

    # Milestone elements: their output never includes their own content.
    if tag_name == 'lb':
        return '<br/>'
    if tag_name == 'pb':
        page = element.get('n')
        if page:
            return f'<span class="page-break">[pg. {html.escape(page, quote=False)}]</span>'
        return ''

    # Gloss lists pair up their children themselves, so they are handled
    # before the generic child rendering (which would render everything twice).
    if tag_name == 'list' and element.get('type', 'simple') == 'gloss':
        return _render_gloss_list(element)

    content = _render_inner_html(element, tag_name)

    wrapper = _SIMPLE_WRAPPERS.get(tag_name)
    if wrapper is not None:
        opening, closing = wrapper
        return f'{opening}{content}{closing}'

    if tag_name in ('cell', 'td'):
        colspan = element.get('colspan')
        attrs = f' colspan="{html.escape(colspan)}"' if colspan else ''
        # Cells nested inside a <head> are rendered as header cells.
        is_in_head = element.xpath('ancestor::tei:head', namespaces=TEI_NS)
        cell_tag = 'th' if is_in_head else 'td'
        return f'<{cell_tag}{attrs}>{content}</{cell_tag}>'

    if tag_name == 'list':  # Simple and ordered lists
        list_tag = 'ol' if element.get('type') == 'ordered' else 'ul'
        return f'<{list_tag} class="simple-list">{content}</{list_tag}>'

    if tag_name == 'foreign':
        css_class = "greek" if element.get(_XML_LANG_ATTR) == 'grc' else ""
        return f'<i class="{css_class}">{content}</i>'

    if tag_name == 'ref':
        target = html.escape(element.get("target", "#"))
        return f'<a href="{target}" class="reference">{content}</a>'

    # <label>, structural containers (div, quote, figure, ...) and any unknown
    # tag just pass their content through.
    return content

def get_sections_html(root, range_str):
    """
    Find the requested sections in the TEI tree and render each one to HTML.

    Args:
        root: Root element of the parsed document; it is queried with XPath.
        range_str: A single number or a "start-end" range. Plain numbers
            ("906", "30-35") select divs of type "paragraph" by their numeric
            'n' attribute. A leading "c" ("c26", "c26-30", "c30-c35") selects
            divs of type "lesson", whose 'n' attribute is read as a Roman
            numeral. Surrounding whitespace is ignored.

    Returns:
        A list of HTML fragments: for every matching section a heading, the
        rendered section and an <hr>, ordered by section number. If the range
        cannot be parsed or nothing matches, the list holds a single error
        paragraph instead.
    """
    request = range_str.strip()
    is_chapter_request = request.lower().startswith('c')

    def strip_chapter_prefix(bound):
        bound = bound.strip()
        return bound[1:] if bound[:1] in ('c', 'C') else bound

    try:
        bounds = request.split('-')
        if len(bounds) > 2:
            raise ValueError(f"too many '-' separators in {range_str!r}")
        if is_chapter_request:
            # The prefix may be given on either bound: "c30-35" or "c30-c35".
            bounds = [strip_chapter_prefix(bound) for bound in bounds]
        numbers = [int(bound) for bound in bounds]
    except ValueError:
        return [f"<p><strong>Error:</strong> Invalid range format: '{range_str}'. Use formats like '30-35' or 'c30-c35'.</p>"]
    start_id, end_id = numbers[0], numbers[-1]

    if is_chapter_request:
        def section_number(div):
            return roman_to_int(div.get('n'))

        lesson_divs = root.xpath('//tei:div[@type="lesson"]', namespaces=TEI_NS)
        # roman_to_int() signals an invalid numeral with -1, which can never
        # fall inside a parsed range, so such divs drop out here.
        target_divs = [
            div for div in lesson_divs
            if div.get('n') and start_id <= section_number(div) <= end_id
        ]
    else:  # Paragraph request
        def section_number(div):
            # XPath number() also matches values such as "30.5", which int()
            # would reject; float() accepts everything the query lets through.
            return float(div.get('n'))

        xpath_query = f'//tei:div[@type="paragraph" and number(@n) >= {start_id} and number(@n) <= {end_id}]'
        target_divs = root.xpath(xpath_query, namespaces=TEI_NS)

    if not target_divs:
        return [f"<p><strong>Error:</strong> No sections found for range: {range_str}.</p>"]

    html_outputs = []
    for div in sorted(target_divs, key=section_number):
        div_n = div.get('n')

        if div.get('type') == "lesson":
            heading_text = f"Chapter {div_n}"
        else:  # paragraph
            # Prefix the heading with the enclosing lesson, when there is one.
            lesson_div_list = div.xpath('ancestor::tei:div[@type="lesson"]', namespaces=TEI_NS)
            lesson_n_str = ""
            if lesson_div_list:
                lesson_n_str = f"Chapter {lesson_div_list[0].get('n', '')}, "
            heading_text = f"{lesson_n_str}Paragraph {div_n}"

        html_outputs.append(f'<h2>{heading_text}</h2>')
        html_outputs.append(render_element_to_html(div))
        html_outputs.append('<hr>')

    return html_outputs

In [37]:
def generate_and_display_html(xml_file_path, section_range):
    """
    Notebook entry point: load the XML file, render the requested sections,
    and show the resulting HTML page in the cell output.

    If the file cannot be parsed, an error message is displayed instead and
    nothing else is rendered.
    """
    try:
        root = etree.parse(xml_file_path).getroot()
    except Exception as parse_error:
        display(HTML(f"<p><strong>Error:</strong> Could not process file '{xml_file_path}'.<br>{parse_error}</p>"))
        return

    # One HTML fragment per heading / section / separator, joined by newlines.
    rendered_sections = "\n".join(get_sections_html(root, section_range))

    # The page is filled in with str.format(), so literal CSS braces are doubled.
    page_template = """
    <div style="font-family: Georgia, serif; line-height: 1.6;">
        <style>
            .greek {{ font-style: italic; color: #333; }}
            .reference {{ font-weight: bold; color: #0056b3; text-decoration: none; }}
            h2, h4 {{ border-bottom: 1px solid #ccc; padding-bottom: 5px; margin-top: 1.5em;}}
            hr {{ border: none; border-top: 1px solid #eee; margin: 2rem 0; }}
            .glossary dt {{ font-weight: bold; font-family: monospace, serif; float: left; width: 150px; clear: left; text-align: right; padding-right: 1em; }}
            .glossary dd {{ margin-left: 160px; padding-bottom: 0.7em; }}
            .data-table {{ border-collapse: collapse; width: 100%; margin: 1em 0; }}
            .data-table th, .data-table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; vertical-align: top; }}
            .data-table th {{ background-color: #f2f2f2; font-weight: bold; }}
            .simple-list {{ list-style-type: none; padding-left: 1em; }}
            .poetry-group {{ margin-bottom: 1em; }}
            .poetry-line {{ display: block; padding-left: 2em; text-indent: -2em; }}
            .item-number {{ margin-right: 0.5em; }}
            .line-number {{ font-size: 0.8em; color: #666; margin-right: 1em; }}
        </style>
        <h1>Homeric Greek Excerpt: {range_str}</h1>
        <hr>
        {content}
    </div>
    """

    page = page_template.format(range_str=section_range, content=rendered_sections)
    display(HTML(page))

# Path of the XML file rendered by the example calls in this notebook.
xml_file = 'pharr2025.xml'

# --- EXAMPLE USAGE ---
# Call generate_and_display_html with a file path and a range string:

# 1. To get paragraphs 30 through 35:
# generate_and_display_html('pharr2025.xml', '30-35')

# 2. To get chapters (lessons) 26 through 30 (the leading "c" selects lessons):
generate_and_display_html('pharr2025.xml', 'c26-30')

In [29]:
# Render the single paragraph 906; whitespace around the number is ignored.
generate_and_display_html(xml_file, " 906  ")

0,1,2,3
,PRESENT,AORIST,"PERFECT [note 3. See note 3 on Subjunctive, §905. ]"
S. 1,"λύοιμι may I loose [note 4. Also, I may, might, could, would, should loose, with various other meanings which must be learned from the syntax. Read the notes to 905. ]",λύσαιμι may I loose,λελύκοιμι may I loose
2,λύοις(θα) may you loose,λύσαις(θα) (λύσειας) may you loose,λελύκοις(θα) may you loose
3,λύοι may he loose,λύσαι (λύσειε) may he loose,λελύκοι may he loose
Du. 1,,,
2,λύοιτον may you two loose,λύσαιτον may you two loose,λελύκοιτον may you two loose
3,λυοίτην may they two loose,λῦσαίτην may they two loose,λελυκοίτην may they two loose
PL. 1,λύοιμεν may we loose,λύσαιμεν may we loose,λελύκοιμεν may we loose
2,λύοιτε may you loose,λύσαιτε may you loose,λελύκοιτε may you loose
3,λύοιεν may they loose,λύσαιεν (λύσειαν) may they loose,λελύκοιεν may they loose


In [31]:
from lxml import etree
from IPython.display import display, HTML

# Namespace maps used by the validation tests below.
# TEI_NS is passed as `namespaces=` to the XPath queries; XML_NS supplies the
# URI used to build the namespaced attribute key for xml:id.
TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}
XML_NS = {'xml': 'http://www.w3.org/XML/1998/namespace'}


def run_tests(xml_file_path):
    """
    Parse the XML file and run every validation test on its content.

    The results are displayed as HTML in the notebook output. The file is
    parsed in recovery mode so the tests can still run on a malformed
    document; any problems the parser reported while doing so are shown as a
    warning, because they would otherwise go unnoticed.

    Args:
        xml_file_path: Path of the XML file to validate.
    """
    display(HTML("<h1>XML Validation Tests</h1>"))
    try:
        # recover=True lets lxml build a tree from XML that is not well-formed.
        parser = etree.XMLParser(recover=True)
        tree = etree.parse(xml_file_path, parser)
        root = tree.getroot()
    except Exception as e:
        display(HTML(f"<p><strong>Error:</strong> Could not process file '{xml_file_path}'.<br>{e}</p>"))
        return

    # In recovery mode a badly broken document can yield no root element at
    # all; the tests below cannot run on that.
    if root is None:
        display(HTML(f"<p><strong>Error:</strong> Could not process file '{xml_file_path}'.<br>No root element could be recovered.</p>"))
        return

    # Recovery hides well-formedness problems, so surface them explicitly.
    problem_count = len(parser.error_log)
    if problem_count:
        display(HTML(f"<p><strong>Test Warning:</strong> The parser reported {problem_count} problem(s) while reading '{xml_file_path}'; results are based on the recovered document.</p>"))

    # --- Run all test functions here ---
    test_paragraph_ids(root)
    test_xml_id_format(root)


# NEW FUNCTION: Checks for correct xml:id format
def test_xml_id_format(root):
    """
    Check that every div carrying both 'n' and 'xml:id' uses xml:id="p[n]".

    A div with n="16", for instance, is expected to have xml:id="p16". The
    outcome (pass, or a list of mismatches with their source lines) is
    displayed as HTML.
    """
    # lxml addresses namespaced attributes as "{namespace-uri}localname".
    xml_id_attr = '{' + XML_NS["xml"] + '}id'

    candidates = root.xpath('//tei:div[@n and @xml:id]', namespaces=TEI_NS)

    # One (line, n, found, expected) tuple per div whose id is off.
    problems = []
    for div in candidates:
        number = div.get('n')
        actual_id = div.get(xml_id_attr)
        wanted_id = f"p{number}"
        if actual_id == wanted_id:
            continue
        problems.append((div.sourceline, number, actual_id, wanted_id))

    if problems:
        list_items = "".join(
            f'<li>Line {line}: For <code>n="{number}"</code>, found <code>xml:id="{found}"</code> but expected <code>xml:id="{expected}"</code>.</li>'
            for line, number, found, expected in problems
        )
        error_list_html = f"<ul>{list_items}</ul>"
        result_html = f"""
        <div style="border-left: 4px solid red; padding-left: 10px; margin-bottom: 1em;">
            <p><strong>Test Failed: Mismatched Div xml:id Attributes</strong></p>
            <p>The following <code>&lt;div&gt;</code> elements have an <code>xml:id</code> that does not match their <code>n</code> attribute:</p>
            {error_list_html}
        </div>
        """
    else:
        result_html = """
        <div style="border-left: 4px solid green; padding-left: 10px; margin-bottom: 1em;">
            <p><strong>Test Passed: Div xml:id Format</strong></p>
            <p>All <code>&lt;div&gt;</code> elements with <code>n</code> and <code>xml:id</code> have the correct format (<code>xml:id="p[n]"</code>).</p>
        </div>
        """

    display(HTML(result_html))


def test_paragraph_ids(root):
    """
    Check that the numeric 'n' values of the numbered divs run from 1 upward
    without gaps, and display the result as HTML.

    Divs of type vocabulary, paragraph, exercises, commentary and reading are
    considered; 'n' values that are not purely digits are ignored.
    """
    numbered_divs = root.xpath(
        '//tei:div[(@type="vocabulary" or @type="paragraph" or @type="exercises" or @type="commentary" or @type="reading") and @n]',
        namespaces=TEI_NS,
    )

    n_values = (div.get('n') for div in numbered_divs)
    found_numbers = sorted(int(n) for n in n_values if n and n.isdigit())

    if not found_numbers:
        display(HTML("<p><strong>Test Warning:</strong> No paragraphs with numeric 'n' attributes found to test.</p>"))
        return

    # Walk the sorted numbers and collect everything skipped between
    # neighbours; starting from 0 also catches numbers missing before the
    # first one found.
    missing_numbers = []
    previous = 0
    for number in found_numbers:
        missing_numbers.extend(range(previous + 1, number))
        previous = number

    if not missing_numbers:
        result_html = """
        <div style="border-left: 4px solid green; padding-left: 10px; margin-bottom: 1em;">
            <p><strong>Test Passed: Paragraph Numbers Sequential</strong></p>
            <p>All paragraph numbers are in sequential order without any gaps.</p>
        </div>
        """
    else:
        missing_text = ', '.join(str(number) for number in missing_numbers)
        result_html = f"""
        <div style="border-left: 4px solid red; padding-left: 10px; margin-bottom: 1em;">
            <p><strong>Test Failed: Missing Paragraph Numbers</strong></p>
            <p>The following paragraph numbers are missing from the XML file:</p>
            <p>{missing_text}</p>
        </div>
        """
    display(HTML(result_html))

# --- EXAMPLE USAGE FOR TESTING ---
# Run all validation tests against pharr2025.xml and display the results:
run_tests('pharr2025.xml')
