In [22]:
import os
from lxml import etree
from IPython.display import display, HTML
import base64
import html
import re

# --- XML namespace prefixes and input/output file names ---
# Prefix -> namespace URI map handed to lxml's find()/xpath() calls.
NS = dict(
    tei='http://www.tei-c.org/ns/1.0',
    xml='http://www.w3.org/XML/1998/namespace',
)
# TEI source that is read, and the HTML file that is written.
XML_FILE, OUTPUT_FILE = 'pharr2025.xml', "homeric_greek_three_pane.html"

# --- Embedded CSS for Three-Pane Layout ---
# Complete <style> element that TeiToHtmlConverter.convert() places in the
# <head> of the generated page. Layout, as encoded in the rules below:
#   * #sidebar is position:fixed at the left edge with its own scrollbar;
#   * every .content-pane is a full-height flex child that scrolls on its own
#     (body has overflow:hidden), and #main-content-center is pushed right by
#     the sidebar width;
#   * colours and sizes are CSS custom properties declared on :root.
# The fonts are pulled in with @import from fonts.googleapis.com, so the page
# needs network access when it is viewed for them to load.
# The string is emitted verbatim: whitespace inside it is part of the output.
CSS_STYLE = """
<style>
    @import url('https://fonts.googleapis.com/css2?family=Cardo:ital,wght@0,400;0,700;1,400&family=Noto+Serif+Greek:wght@400;700&display=swap');

    :root {
        --sidebar-width: 280px;
        --primary-color: #0056b3;
        --background-color: #f4f4f0;
        --pane-background: #fdfdfa;
        --text-color: #333;
        --border-color: #ddd;
        --pane-padding: 30px;
        --highlight-bg: #FFF9C4;
        --highlight-text: #5D4037;
    }
    
    html {
        scroll-behavior: smooth;
    }

    body {
        font-family: 'Cardo', serif;
        line-height: 1.7;
        margin: 0; padding: 0;
        display: flex;
        overflow: hidden;
        background-color: var(--background-color);
        color: var(--text-color);
    }

    #sidebar {
        width: var(--sidebar-width);
        min-width: var(--sidebar-width);
        height: 100vh;
        position: fixed; left: 0; top: 0;
        background-color: #f0f0e8;
        border-right: 1px solid var(--border-color);
        overflow-y: auto;
        padding: 20px;
        box-sizing: border-box;
    }
    
    .sidebar-header {
        padding-bottom: 10px;
        margin-bottom: 15px;
        border-bottom: 1px solid var(--border-color);
        font-size: 0.85em;
        line-height: 1.5;
    }
    .sidebar-header h1 {
        font-size: 1.4em;
        margin: 0 0 5px 0;
        line-height: 1.2;
        text-align: left;
    }
    .sidebar-header p {
        margin: 2px 0;
        font-size: 1em;
        text-align: left;
    }
    .sidebar-header a {
        font-weight: bold;
    }

    #sidebar h2 { font-size: 1.3em; margin-top: 0; color: #000; }
    #sidebar ul { list-style-type: none; padding: 0; margin: 0; }
    #sidebar ul.toc-level-1 > li { margin-bottom: 8px; font-weight: bold; font-size: 1em;}
    #sidebar ul.toc-level-2 { padding-left: 15px; font-weight: normal; }
    #sidebar ul.toc-level-2 > li { margin-bottom: 4px; font-size: 0.9em; }
    #sidebar a { text-decoration: none; color: var(--primary-color); display: block; padding: 2px 0; }
    #sidebar a:hover { text-decoration: underline; color: #003d7c; }
    
    .content-pane {
        flex: 1;
        height: 100vh;
        overflow-y: auto;
        padding: 20px var(--pane-padding);
        box-sizing: border-box;
        background-color: var(--pane-background);
    }
    #main-content-center {
        margin-left: var(--sidebar-width);
        border-right: 1px solid var(--border-color);
    }
    
    .page-anchor { display: block; position: relative; top: -70px; visibility: hidden; }
    
    .page-milestone {
        font-weight: bold;
        color: #888;
        padding: 0 0.25em;
    }

    h1, h2, h3, h4 { color: #222; line-height: 1.2; margin-top: 1.8em; font-family: 'Cardo', serif; }
    h2 { font-size: 2em; border-bottom: 1px solid var(--border-color); padding-bottom: 5px; }
    h3 { font-size: 1.6em; }
    h4 { font-size: 1.3em; font-style: italic; }
    p { margin-bottom: 1em; text-align: justify; }
    p b:first-child {
        padding-right: 0.5em;
    }
    .greek { font-family: 'Noto Serif Greek', serif; }
    .center { text-align: center; }
    
    .lem {
        background-color: var(--highlight-bg);
        color: var(--highlight-text);
        padding: 1px 4px;
        border-radius: 3px;
        font-weight: bold;
    }

    figure { margin: 25px auto; padding: 15px; border: 1px solid #e0e0e0; background-color: #f9f9f9; text-align: center; max-width: 90%; }
    figcaption { font-style: italic; font-size: 0.9em; color: #555; margin-top: 8px; }
    table { width: 100%; border-collapse: collapse; margin: 25px 0; font-size: 0.95em; }
    caption { font-weight: bold; font-size: 1.1em; padding: 8px; margin-bottom: 10px; text-align: left; }
    th, td { border: 1px solid var(--border-color); padding: 8px; text-align: left; vertical-align: top; }
    th { background-color: #f2f2f2; font-weight: bold; }
    .footnote-section { font-size: 0.9em; border-top: 1px solid var(--border-color); margin-top: 2.5em; padding-top: 1em; }
    .footnote-ref { vertical-align: super; font-size: 0.75em; }
    blockquote { border-left: 3px solid #ccc; padding-left: 15px; margin-left: 0; font-style: italic;}
</style>
"""

# --- Embedded JavaScript for Interactive Scrolling ---
# Complete <script> element that TeiToHtmlConverter.convert() appends just
# before </body>. It installs one delegated click listener on <body>:
#   * only links whose href starts with '#' are handled, and their default
#     navigation is suppressed;
#   * a leading 'center-' or 'right-' is stripped from the fragment to get the
#     base id shared by both copies of the text;
#   * a click in #sidebar scrolls both panes to 'center-<id>' / 'right-<id>',
#     a click in #main-content-center scrolls only the right pane, and a click
#     in #main-content-right scrolls only the center pane.
# If the matching element does not exist the click does nothing.
JS_SCRIPT = """
<script>
    document.addEventListener('DOMContentLoaded', function() {
        document.body.addEventListener('click', function(event) {
            let target = event.target.closest('a');

            if (target && target.getAttribute('href') && target.getAttribute('href').startsWith('#')) {
                event.preventDefault();
                
                const href = target.getAttribute('href');
                const baseId = href.substring(href.indexOf('#') + 1).replace(/^(center-|right-)/, '');

                const sourcePane = target.closest('#sidebar, #main-content-center, #main-content-right');
                
                if (!sourcePane || !baseId) return;

                const sourceId = sourcePane.id;

                if (sourceId === 'sidebar') {
                    // TOC click scrolls both panes
                    const centerTarget = document.getElementById('center-' + baseId);
                    const rightTarget = document.getElementById('right-' + baseId);
                    if (centerTarget) centerTarget.scrollIntoView({ behavior: 'smooth' });
                    if (rightTarget) rightTarget.scrollIntoView({ behavior: 'smooth' });
                } else if (sourceId === 'main-content-center') {
                    // Center click scrolls right pane
                    const rightTarget = document.getElementById('right-' + baseId);
                    if (rightTarget) rightTarget.scrollIntoView({ behavior: 'smooth' });
                } else if (sourceId === 'main-content-right') {
                    // Right click scrolls center pane
                    const centerTarget = document.getElementById('center-' + baseId);
                    if (centerTarget) centerTarget.scrollIntoView({ behavior: 'smooth' });
                }
            }
        });
    });
</script>
"""

class TeiToHtmlConverter:
    """Converts a TEI XML file into a standalone HTML file.

    The converter walks the ``tei:text`` subtree and dispatches every element
    to a ``handle_<localname>`` method (falling back to ``handle_default``).
    ``convert`` wraps the rendered text in a page made of a sidebar (metadata
    and table of contents) and two content panes that hold the same text with
    ``center-`` / ``right-`` prefixed ids.

    Attributes:
        tree: the parsed TEI document passed to the constructor.
        footnotes: rendered bodies of the notes collected so far for the
            ``div`` that is currently being processed.
        footnote_counter: number of notes collected so far for that ``div``.
    """

    def __init__(self, xml_tree):
        """Store the parsed tree and initialise the footnote bookkeeping."""
        self.tree = xml_tree
        self.footnotes = []
        self.footnote_counter = 0
        # Id fragment shared by a footnote reference and its footnote paragraph;
        # handle_div switches it while it processes its own children.
        self._footnote_scope = 'section'
        # Serial number used to give divs without xml:id a unique footnote scope.
        self._anonymous_div_count = 0

    def process_children(self, element):
        """Render the text and child elements of ``element`` as HTML.

        Text and tails are HTML-escaped, children go through
        ``process_element``. Every run of whitespace in the combined result is
        collapsed to one space and the result is stripped.
        """
        parts = []
        if element.text:
            parts.append(html.escape(element.text))

        for child in element:
            # Processing instructions, comments and entities are element
            # objects in lxml too, but their .tag is not a string; they carry
            # no renderable content and would break the tag-based dispatch.
            if isinstance(child.tag, str):
                parts.append(self.process_element(child))
            # The tail belongs to the parent's text flow whatever the child is.
            if child.tail:
                parts.append(html.escape(child.tail))

        content = "".join(parts)
        # Convert all whitespace sequences (including newlines) to a single space, then strip.
        content = re.sub(r'\s+', ' ', content)
        return content.strip()

    def process_element(self, element):
        """Dispatch ``element`` to ``handle_<localname>`` (or ``handle_default``)."""
        tag = etree.QName(element.tag).localname
        handler = getattr(self, f'handle_{tag}', self.handle_default)
        return handler(element)

    def handle_default(self, element):
        """Render an element without a dedicated handler as just its content."""
        return self.process_children(element)

    # --- Element Handlers ---
    def handle_p(self, element):
        """Render a paragraph; ``rend="center"`` adds the ``center`` class."""
        extra_class = "center" if element.get('rend') == 'center' else ""
        return f'<p class="{extra_class}">{self.process_children(element)}</p>'

    def handle_head(self, element):
        """Render a heading according to where it sits.

        A head of a table becomes ``<caption>``, a head of a figure ``<h4>``.
        Otherwise the level is 2, 3 when the head has more than one ``div``
        ancestor, and 4 when ``type="sub"``.
        """
        parent = element.getparent()
        parent_tag = etree.QName(parent.tag).localname if parent is not None else ''
        if parent_tag == 'table':
            return f"<caption>{self.process_children(element)}</caption>"
        if parent_tag == 'figure':
            return f"<h4>{self.process_children(element)}</h4>"

        level = 2
        parent_div_depth = len(element.xpath('ancestor::tei:div', namespaces=NS))
        if parent_div_depth > 1:
            level = 3
        if element.get('type') == 'sub':
            level = 4
        return f"<h{level}>{self.process_children(element)}</h{level}>"

    def handle_pb(self, element):
        """Handles page breaks by creating an anchor and a visible milestone.

        The anchor id is the ``xml:id`` when present, else ``page-<n>``. A
        ``pb`` with neither produces no anchor (rather than ``page-None``),
        and one without ``n`` produces no milestone.
        """
        pb_id = element.get(f'{{{NS["xml"]}}}id')
        n = element.get('n')
        if pb_id:
            anchor_id = pb_id
        elif n:
            anchor_id = f"page-{n}"
        else:
            anchor_id = ''

        anchor_html = ''
        if anchor_id:
            anchor_html = f'<a class="page-anchor" id="{html.escape(anchor_id, quote=True)}"></a>'
        milestone_html = f'<span class="page-milestone">[pg. {html.escape(n)}]</span>' if n else ''

        return f'{anchor_html}{milestone_html}'

    def handle_div(self, element):
        """Handles div elements, including special formatting and numbering.

        The content is wrapped in ``<section>`` (with the div's ``xml:id`` as
        id when it has one), optionally preceded by a bold number prefix, and
        followed by the footnotes of the notes that belong to this div.
        Footnote state of an enclosing div is saved before and restored after
        the children are processed, so nested divs do not clobber it.
        """
        div_id = element.get(f'{{{NS["xml"]}}}id', '')
        n_attr = element.get('n')
        div_type = element.get('type')
        subtype = element.get('subtype')
        prefix = ''

        # Determine prefix based on div attributes
        id_match = re.match(r'^p(\d+)$', div_id)
        if id_match:
            # Primary rule: xml:id="pNUM" always gets section symbol
            prefix = f'<b>§{id_match.group(1)}.</b> '
        # Fallback rules for other types of numbered divs
        elif subtype == 'commline' and n_attr:
            prefix = f'<b>{html.escape(n_attr)}:</b> '
        elif n_attr and (div_type == 'sentence' or element.xpath('ancestor::tei:div[@type="exercise"]', namespaces=NS)):
            prefix = f'<b>{html.escape(n_attr)}.</b> '

        # The scope is the id fragment used by both the footnote references
        # (handle_note) and the footnote paragraphs below, so they always match.
        if div_id:
            scope = div_id
        else:
            self._anonymous_div_count += 1
            scope = f'section{self._anonymous_div_count}'

        enclosing_state = (self.footnotes, self.footnote_counter, self._footnote_scope)
        self.footnotes, self.footnote_counter, self._footnote_scope = [], 0, scope
        try:
            content = self.process_children(element)
            notes = self.footnotes
        finally:
            self.footnotes, self.footnote_counter, self._footnote_scope = enclosing_state

        pieces = [prefix, content]
        if notes:
            pieces.append('<div class="footnote-section">')
            pieces.extend(
                f'<p id="fn-{scope}-{i}"><sup>{i}</sup> {note_text}</p>'
                for i, note_text in enumerate(notes, 1)
            )
            pieces.append('</div>')

        # An empty id attribute is invalid HTML, so it is left out entirely.
        id_attr = f' id="{div_id}"' if div_id else ''
        return f'<section{id_attr}>{"".join(pieces)}</section>'

    def handle_foreign(self, element):
        """Render foreign-language text in italics; ``xml:lang="grc"`` gets the ``greek`` class."""
        lang = element.get(f'{{{NS["xml"]}}}lang')
        cls = "greek" if lang == 'grc' else ""
        return f'<span class="{cls}"><i>{self.process_children(element)}</i></span>'

    def handle_title(self, element):
        """Render a title in italics."""
        return f'<i>{self.process_children(element)}</i>'

    def handle_lb(self, element):
        """Render a line break."""
        return "<br/>"

    def handle_list(self, element):
        """Render a list as ``<ul>``."""
        return f"<ul>{self.process_children(element)}</ul>"

    def handle_item(self, element):
        """Render a list item as ``<li>``."""
        return f"<li>{self.process_children(element)}</li>"

    # Table Handling
    def handle_table(self, element):
        """Render a table."""
        return f"<table>{self.process_children(element)}</table>"

    def handle_tbody(self, element):
        """Render a table body."""
        return f"<tbody>{self.process_children(element)}</tbody>"

    def handle_thead(self, element):
        """Render a table head section."""
        return f"<thead>{self.process_children(element)}</thead>"

    def handle_row(self, element):
        """Render a TEI ``row`` as ``<tr>``."""
        return f"<tr>{self.process_children(element)}</tr>"

    def handle_tr(self, element):
        """Render a ``tr`` element as ``<tr>``."""
        return f"<tr>{self.process_children(element)}</tr>"

    def handle_cell(self, element):
        """Render a TEI ``cell``: ``<th>`` when the parent has ``role="label"``, else ``<td>``; ``cols`` becomes ``colspan``."""
        tag = 'th' if element.getparent().get('role') == 'label' else 'td'
        cols = element.get('cols')
        colspan = f" colspan='{html.escape(cols, quote=True)}'" if cols else ''
        content = self.process_children(element)
        return f"<{tag}{colspan}>{content}</{tag}>"

    def handle_td(self, element):
        """Render a ``td`` element, carrying over its ``colspan`` attribute."""
        span = element.get('colspan')
        colspan = f" colspan='{html.escape(span, quote=True)}'" if span else ""
        content = self.process_children(element)
        return f"<td{colspan}>{content}</td>"

    def handle_lem(self, element):
        """Render a lemma as a highlighted span."""
        return f'<span class="lem">{self.process_children(element)}</span>'

    def handle_figure(self, element):
        """Render a figure."""
        return f"<figure>{self.process_children(element)}</figure>"

    def handle_figDesc(self, element):
        """Render a figure description as ``<figcaption>``."""
        return f"<figcaption>{self.process_children(element)}</figcaption>"

    def handle_note(self, element):
        """Record a note as a footnote and return the in-text reference link.

        The note body is stored in ``self.footnotes`` (rendered later by
        ``handle_div``); the returned link shows the note's ``n`` attribute,
        or its running number within the div when ``n`` is absent.
        """
        # Reserve number and list slot before rendering the body, so a note
        # nested inside this one cannot take over this note's number.
        self.footnote_counter += 1
        number = self.footnote_counter
        slot = len(self.footnotes)
        self.footnotes.append('')
        self.footnotes[slot] = self.process_children(element)
        num_display = html.escape(str(element.get('n') or number))
        return f'<a href="#fn-{self._footnote_scope}-{number}" class="footnote-ref">{num_display}</a>'

    def handle_lg(self, element):
        """Render a line group as an indented block quote."""
        return f"<blockquote style='margin-left: 2em;'>{self.process_children(element)}</blockquote>"

    def handle_l(self, element):
        """Render a verse line followed by a line break."""
        return f"{self.process_children(element)}<br/>"

    def handle_quote(self, element):
        """Render a quotation as a block quote."""
        return f"<blockquote>{self.process_children(element)}</blockquote>"

    def handle_hi(self, element):
        """Render highlighted text; ``rend`` is checked for bold, italic, superscript in that order."""
        rend = element.get('rend', '')
        if 'bold' in rend:
            return f"<b>{self.process_children(element)}</b>"
        if 'italic' in rend:
            return f"<i>{self.process_children(element)}</i>"
        if 'superscript' in rend:
            return f"<sup>{self.process_children(element)}</sup>"
        return self.process_children(element)

    def handle_ref(self, element):
        """Render a reference as a link.

        With a ``target`` attribute the link points there (targets starting
        with ``http`` open in a new tab). Without one, the text is read as
        section numbers: ``"12, 3"`` links as a whole to ``#p12``; any other
        comma-separated list links each part via ``handle_ref_part``.
        """
        target = element.get("target", "")
        text = (self.process_children(element) or "").strip()

        if target:
            # rel="noopener noreferrer" keeps the opened page from reaching
            # back into this one through window.opener.
            new_tab = ' target="_blank" rel="noopener noreferrer"' if target.startswith('http') else ''
            return f'<a href="{html.escape(target, quote=True)}"{new_tab}>{text}</a>'

        if not text:
            return text

        sub_para_pattern = r'^\d+,\s*\d+$'
        if re.match(sub_para_pattern, text):
            first_num = text.split(',')[0].strip()
            return f'<a href="#p{first_num}">{text}</a>'
        if ',' in text:
            return ', '.join(self.handle_ref_part(part.strip()) for part in text.split(','))
        return self.handle_ref_part(text)

    def handle_ref_part(self, text_part):
        """Helper to process a single number or a hyphenated range.

        ``"12"`` and ``"12-14"`` become links to ``#p12``; anything else is
        returned unchanged.
        """
        if re.match(r'^\d+(-\d+)?$', text_part):
            first_num = text_part.split('-')[0]
            target = f"#p{first_num}"
            return f'<a href="{target}">{text_part}</a>'
        return text_part

    def handle_cit(self, element):
        """Render a cited quotation like ``quote``."""
        return self.handle_quote(element)

    def handle_term(self, element):
        """Render a term in bold."""
        return f"<b>{self.process_children(element)}</b>"

    def handle_gloss(self, element):
        """Render a gloss as plain content."""
        return self.process_children(element)

    def handle_app(self, element):
        """Render an apparatus entry as plain content."""
        return self.process_children(element)

    def build_sidebar_header(self):
        """Extracts metadata from the TEI header to build an HTML header for the sidebar.

        Uses the title, author, publication statement and source link when
        they are present; returns an empty string when none is.
        """
        header_parts = []

        title_el = self.tree.find('.//tei:titleStmt/tei:title', NS)
        if title_el is not None and title_el.text:
            header_parts.append(f"<h1>{html.escape(title_el.text)}</h1>")

        author_el = self.tree.find('.//tei:titleStmt/tei:author', NS)
        if author_el is not None and author_el.text:
            header_parts.append(f"<p><strong>Author:</strong> {html.escape(author_el.text)}</p>")

        pub_stmt = self.tree.find('.//tei:publicationStmt', NS)
        if pub_stmt is not None:
            pub_info = []
            for path in ('tei:publisher', 'tei:pubPlace', 'tei:date'):
                found = pub_stmt.find(path, NS)
                if found is not None and found.text:
                    pub_info.append(found.text)
            if pub_info:
                header_parts.append(f"<p><strong>Published:</strong> {html.escape(', '.join(pub_info))}</p>")

        ref_el = self.tree.find('.//tei:sourceDesc/tei:bibl/tei:ref', NS)
        if ref_el is not None:
            target = ref_el.get('target', '')
            text = ref_el.text or "View Source Scan"
            if target:
                header_parts.append(
                    f'<p><a href="{html.escape(target, quote=True)}" target="_blank" '
                    f'rel="noopener noreferrer">{html.escape(text)}</a></p>'
                )

        if not header_parts:
            return ""

        return f'<header class="sidebar-header">{"".join(header_parts)}</header>'

    def _page_anchor_index(self):
        """Map each page number (``pb/@n``) to the anchor id of its first ``pb`` in document order."""
        xml_id_attr = f'{{{NS["xml"]}}}id'
        index = {}
        for pb in self.tree.iter(f'{{{NS["tei"]}}}pb'):
            n = pb.get('n')
            if n:
                # Same id rule as handle_pb: xml:id when present, else page-<n>.
                index.setdefault(n, pb.get(xml_id_attr) or f"page-{n}")
        return index

    def build_toc(self):
        """Builds the HTML for the Table of Contents sidebar.

        Reads the items of the div with ``xml:id="toc"``. Each item that has a
        ``ref`` and a resolvable target becomes one link; the trailing ref
        text (the page number) is cut from the link text. From the first item
        whose label contains ``LESSON`` on, entries go into a second list.
        """
        toc_div = self.tree.find('.//tei:div[@xml:id="toc"]', NS)
        if toc_div is None:
            return "<h2>Table of Contents Not Found</h2>"

        chunks = ["<h2>Contents</h2>", '<ul class="toc-level-1">']
        in_lessons_section = False
        page_anchors = None  # built on first use, one document scan in total

        for item in toc_div.findall('.//tei:item', NS):
            label_text = "".join(item.xpath('tei:label//text()', namespaces=NS))
            if 'LESSON' in label_text and not in_lessons_section:
                chunks.append('</ul><h3>Lessons</h3><ul class="toc-level-2">')
                in_lessons_section = True

            ref = item.find('tei:ref', NS)
            if ref is None:
                # Without a ref there is nothing to link to.
                continue

            full_text = ' '.join("".join(item.itertext()).split())
            ref_text = (ref.text or "").strip()
            if ref_text and full_text.endswith(ref_text):
                link_text = full_text[:-len(ref_text)].strip()
            else:
                link_text = full_text

            target = ref.get('target', '')
            if not target and ref_text:
                # The ref text is a page number: resolve it through a lookup
                # table instead of interpolating it into an XPath expression.
                if page_anchors is None:
                    page_anchors = self._page_anchor_index()
                anchor_id = page_anchors.get(ref_text)
                if anchor_id:
                    target = f"#{anchor_id}"

            if "LESSON" in link_text.upper():
                num_match = re.match(r'\s*(\d+)', link_text)
                if num_match:
                    target = f"#lesson{num_match.group(1)}"

            if not target:
                continue

            if target.startswith('#'):
                # In-page targets point at the copy of the anchor in the center pane.
                href = f"#center-{html.escape(target[1:], quote=True)}"
            else:
                # Anything else is a real URL and must be left intact.
                href = html.escape(target, quote=True)
            chunks.append(f'<li><a href="{href}">{html.escape(link_text)}</a></li>')

        chunks.append('</ul>')
        return "".join(chunks)

    def convert(self):
        """Main conversion method to generate the full HTML document.

        Returns the complete page as a string: head with the embedded CSS,
        the sidebar, the rendered text twice (ids and in-page hrefs prefixed
        with ``center-`` and ``right-`` respectively), and the script.
        """
        title_element = self.tree.find('.//tei:titleStmt/tei:title', NS)
        title = ""
        if title_element is not None:
            # .text is None when the title is empty or starts with a child element.
            title = (title_element.text or "").strip()
        if not title:
            title = "Homeric Greek"

        # Restart the numbering of id-less divs so repeated calls agree.
        self._anonymous_div_count = 0

        sidebar_header = self.build_sidebar_header()
        table_of_contents = self.build_toc()

        text_element = self.tree.find('.//tei:text', NS)
        main_content_html = self.process_children(text_element) if text_element is not None else ""

        center_content = re.sub(r'id="([^"]+)"', r'id="center-\1"', main_content_html)
        center_content = re.sub(r'href="#([^"]+)"', r'href="#center-\1"', center_content)

        right_content = re.sub(r'id="([^"]+)"', r'id="right-\1"', main_content_html)
        right_content = re.sub(r'href="#([^"]+)"', r'href="#right-\1"', right_content)

        return "".join([
            "<!DOCTYPE html>\n<html lang='en'>\n<head>\n",
            "    <meta charset='UTF-8'>\n",
            "    <meta name='viewport' content='width=device-width, initial-scale=1.0'>\n",
            f"    <title>{html.escape(title)}</title>\n{CSS_STYLE}\n</head>\n<body>\n",
            f"<nav id='sidebar'>{sidebar_header}{table_of_contents}</nav>\n",
            f"<main id='main-content-center' class='content-pane'>{center_content}</main>\n",
            f"<aside id='main-content-right' class='content-pane'>{right_content}</aside>\n",
            f"{JS_SCRIPT}\n</body>\n</html>",
        ])

# --- Main execution block for the Jupyter cell ---
# Parses XML_FILE, converts it, and writes the pretty-printed page to
# OUTPUT_FILE. Any failure is reported with a traceback instead of being
# raised, so the cell itself always finishes.
if __name__ == "__main__":
    try:
        if not os.path.exists(XML_FILE):
             raise FileNotFoundError(f"The file '{XML_FILE}' was not found.")

        # remove_blank_text stays False so whitespace between inline elements
        # survives; comments are dropped because they carry no content.
        parser = etree.XMLParser(remove_blank_text=False, remove_comments=True)
        with open(XML_FILE, 'rb') as f:
            xml_tree = etree.parse(f, parser)

        converter = TeiToHtmlConverter(xml_tree)
        html_output = converter.convert()

        # Pretty-print the HTML for readability.
        # Serialising the root *element* leaves out the document type that
        # convert() emitted, so it is passed back in explicitly; without it
        # browsers render the page in quirks mode.
        root = etree.HTML(html_output)
        pretty_html = etree.tostring(
            root,
            pretty_print=True,
            method="html",
            encoding="unicode",
            doctype="<!DOCTYPE html>",
        )

        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            f.write(pretty_html)

        print(f"✅ Successfully converted '{XML_FILE}' and saved it as '{OUTPUT_FILE}'.")

    except Exception as e:
        import traceback
        traceback.print_exc()
        print(f"❌ An error occurred: {e}")

✅ Successfully converted 'pharr2025.xml' and saved it as 'homeric_greek_three_pane.html'.


In [19]:
you need:

SyntaxError: invalid syntax (555477872.py, line 1)