In [2]:
import xml.etree.ElementTree as ET
import os

# Local (namespace-free, lower-case) tag names that are reported as headings.
_HEADING_TAGS = frozenset(
    ['heading', 'title', 'claim', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
)


def _local_tag_name(tag):
    """Return *tag* lower-cased, without a leading ``{namespace-uri}`` prefix."""
    # ElementTree spells namespaced tags as "{uri}local"; rpartition leaves
    # tags that carry no namespace untouched.
    return tag.rpartition('}')[2].lower()


def _has_heading_attribute(elem):
    """Return True if any attribute value of *elem* contains 'title' or 'heading'."""
    for value in elem.attrib.values():
        lowered = str(value).lower()
        if 'title' in lowered or 'heading' in lowered:
            return True
    return False


def _print_potential_headings(root):
    """Print up to 20 short, single-line text elements found under *root*."""
    potential_headings = []
    for elem in root.iter():
        text_content = (elem.text or '').strip()
        # Heuristic: heading candidates are short (< 100 chars) and one line.
        if text_content and len(text_content) < 100 and '\n' not in text_content:
            potential_headings.append((elem.tag, text_content))

    if not potential_headings:
        return

    print("\nPotential headings (short text elements):")
    for i, (tag, text) in enumerate(potential_headings[:20], 1):  # Show first 20
        print(f"{i:2d}. [{tag}] {text}")
    if len(potential_headings) > 20:
        print(f"... and {len(potential_headings) - 20} more")


def extract_headings_from_xml(xml_file_path):
    """
    Extract and print all headings from an XML file.

    An element is reported as a heading when its tag name (compared
    case-insensitively and ignoring any XML namespace) is one of the common
    heading tags, or when one of its attribute values contains 'title' or
    'heading'. Only the element's own leading text (``elem.text``, i.e. the
    text before its first child element) is printed; elements whose leading
    text is empty are skipped.

    If nothing matches, a fallback listing of up to 20 short single-line
    text elements is printed instead.

    Args:
        xml_file_path: Path of the XML file to parse.

    Returns:
        None. All results and error messages are written to standard output;
        parse errors, a missing file and any other exception are reported
        with a message rather than raised.
    """
    try:
        # Parse the XML file
        tree = ET.parse(xml_file_path)
        root = tree.getroot()

        print(f"\n=== Headings found in {os.path.basename(xml_file_path)} ===")
        print("-" * 60)

        heading_count = 0

        for elem in root.iter():
            # Match on the namespace-free tag name so that documents declaring
            # an XML namespace are handled the same as plain ones.
            is_heading = (
                _local_tag_name(elem.tag) in _HEADING_TAGS
                or _has_heading_attribute(elem)
            )
            if not is_heading:
                continue

            text_content = (elem.text or '').strip()
            if text_content:
                heading_count += 1
                print(f"{heading_count:2d}. [{elem.tag}] {text_content}")

        if heading_count == 0:
            print("No headings found using common heading tags.")
            print("\nAnalyzing XML structure...")
            _print_potential_headings(root)
        else:
            print(f"\nTotal headings found: {heading_count}")

    except ET.ParseError as e:
        print(f"Error parsing XML file: {e}")
    except FileNotFoundError:
        print(f"File not found: {xml_file_path}")
    except Exception as e:
        # Last-resort boundary: this helper reports problems instead of raising.
        print(f"An error occurred: {e}")

# Example usage: set this to the location of the XML document to inspect
xml_file_path = "EP18823397W1B9.xml"  # replace with your own file path

# Print every heading detected in that document
extract_headings_from_xml(xml_file_path)


=== Headings found in EP18823397W1B9.xml ===
------------------------------------------------------------
 1. [heading] (2s)-2-(2,6-dichloro-4-(2-(hydroxy(phenyl)phosphoryl)ethyl)benzamido)-3-(3-(methylsulfon yl)phenyl)propionic acid
 2. [heading] Step A: methoxyphenylphosphoryl chloride (Compound 1.1)
 3. [heading] Step B: Methyl 2,6-dichloro-4-((phenyl(methoxy)phosphoryl)ethynyl)benzoate (Compound 1.2)
 4. [heading] Step C: 2,6-dichloro-4-((hydroxy(phenyl)phosphoryl)ethynyl)benzoic acid (Compound 1.3)
 5. [heading] Step D:
 6. [heading] (2s)-2-(2,6-dichloro-4-((hydroxy(phenyl)phosphoryl)ethynyl)benzamido)-3-(3-(methylsulfon yl)phenyl)benzyl propionate (Compound 1.4)
 7. [heading] Step E:
 8. [heading] (2s)-2-(2,6-dichloro-4-(2-(hydroxy(phenyl)phosphoryl)ethyl)benzamido)-3-(3-(methylsulfon yl)phenyl)propionic acid (Compound 1)
 9. [heading] (2s)-2-(2,6-dichloro-4-((hydroxy(3-hydroxyphenyl)phosphoryl)ethynyl)benzylamino)-3-(3-( methylsulfonyl)phenyl)propionic acid
10. [heading] Step A