In [1]:
import xml.etree.ElementTree as ET
import json
import os

def analyze_xml_structure(xml_path):
    """Print a tag-frequency report for an XML file and return its root.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        A ``(root, tag_counts)`` tuple, where ``root`` is the parsed root
        element and ``tag_counts`` maps each tag name found in the document
        (root included) to its number of occurrences. If anything fails,
        the error is printed and ``(None, {})`` is returned.
    """
    try:
        root = ET.parse(xml_path).getroot()

        # Tally every element in the tree, starting with the root itself.
        tag_counts = {}
        for node in root.iter():
            tag_counts[node.tag] = tag_counts.get(node.tag, 0) + 1

        print(f"\nXML Structure Analysis for {xml_path}:")
        print(f"Root tag: {root.tag}")
        print("Tag frequency:")
        # Report in alphabetical tag order.
        for name in sorted(tag_counts):
            print(f"  {name}: {tag_counts[name]}")

        return root, tag_counts
    except Exception as e:
        print(f"Error analyzing {xml_path}: {e}")
        return None, {}

def extract_patent_content(xml_path):
    """Group paragraph text from a patent XML file under heading-like keys.

    Elements are visited in document order:

    * ``claim`` / ``table`` elements whose own leading text is non-empty
      start a new section; the key is that text, truncated to 100
      characters with a trailing ``"..."`` when longer.
    * ``p`` elements are appended to the current section. Paragraphs seen
      before any section has started go under ``"Introduction"``.
    * ``description`` / ``abstract`` / ``title`` elements whose own leading
      text is non-empty register an (initially empty) section under that
      text, without changing the current section.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        A dict mapping section keys to lists of whitespace-normalized
        paragraph strings. If anything fails, the error is printed and an
        empty dict is returned.
    """
    def clean_text(text):
        """Collapse whitespace runs to single spaces; '' for None/empty."""
        return ' '.join(text.split()) if text else ""

    try:
        root = ET.parse(xml_path).getroot()

        grouped = {}
        current_heading = None

        for elem in root.iter():
            tag = elem.tag

            if tag == 'p':
                # Use itertext() rather than elem.text: elem.text stops at
                # the first child element, so paragraphs with inline markup
                # (<b>, <br>, ...) would otherwise be truncated or dropped.
                paragraph = clean_text(''.join(elem.itertext()))
                if not paragraph:
                    continue
                if current_heading is None:
                    current_heading = "Introduction"
                grouped.setdefault(current_heading, []).append(paragraph)
                continue

            # Headings deliberately use only the element's own leading
            # text; the full text of a container would be far too long
            # to serve as a key.
            own_text = clean_text(elem.text)
            if not own_text:
                continue

            if tag in ('claim', 'table'):
                current_heading = own_text[:100] + ("..." if len(own_text) > 100 else "")
                grouped.setdefault(current_heading, [])
            elif tag in ('description', 'abstract', 'title'):
                grouped.setdefault(own_text, [])

        return grouped

    except Exception as e:
        # Deliberate best-effort boundary: callers expect {} on any failure.
        print(f"Error extracting content from {xml_path}: {e}")
        return {}

def main(xml_files=None):
    """Analyze each XML file, extract its content and save it as JSON.

    For every existing file this prints a tag-frequency report, extracts
    the grouped content, writes it to ``<name>_extracted.json`` next to the
    input, and prints a short preview (first 3 sections, first 2 paragraphs
    of each). Missing files and files yielding no content are reported and
    skipped.

    Args:
        xml_files: Optional iterable of XML file paths to process. When
            omitted, the two built-in patent file names are used.
    """
    if xml_files is None:
        xml_files = [
            "EP18823397W1B9.xml",
            "EP22914805W1A9.xml",
        ]

    for xml_file in xml_files:
        if not os.path.exists(xml_file):
            print(f"\nFile not found: {xml_file}")
            continue

        print(f"\n{'='*50}")
        print(f"Processing: {xml_file}")
        print('='*50)

        # Analyze structure first; a None root means parsing failed and
        # the error has already been reported.
        root, _tag_counts = analyze_xml_structure(xml_file)
        if root is None:
            continue

        content = extract_patent_content(xml_file)
        if not content:
            print("No content extracted")
            continue

        # Derive the output name from the extension only. A plain
        # str.replace('.xml', ...) returns the input path unchanged when
        # the name has no lowercase '.xml' (so the JSON would overwrite
        # the source file) and also rewrites '.xml' occurring mid-path.
        output_file = os.path.splitext(xml_file)[0] + '_extracted.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(content, f, indent=2, ensure_ascii=False)
        print(f"\nSaved extracted content to: {output_file}")

        # Preview: first 3 sections, first 2 paragraphs of each.
        print("\nFirst few sections extracted:")
        for heading, paragraphs in list(content.items())[:3]:
            print(f"\n{heading}:")
            for para in paragraphs[:2]:
                print(f"  - {para[:100]}{'...' if len(para) > 100 else ''}")
            if len(paragraphs) > 2:
                print(f"  ... and {len(paragraphs) - 2} more paragraphs")

# Run the pipeline when this cell/script executes.
main()


Processing: EP18823397W1B9.xml

XML Structure Analysis for EP18823397W1B9.xml:
Root tag: ep-patent-document
Tag frequency:
  B000: 1
  B001EP: 1
  B005EP: 1
  B007EP: 1
  B100: 1
  B110: 1
  B120: 1
  B121: 1
  B130: 1
  B132EP: 1
  B140: 1
  B150: 1
  B151: 1
  B155: 1
  B1551: 9
  B1552: 9
  B190: 1
  B200: 1
  B210: 1
  B220: 1
  B240: 1
  B241: 1
  B242: 1
  B250: 1
  B251EP: 1
  B260: 1
  B300: 1
  B310: 2
  B320: 2
  B330: 2
  B400: 1
  B405: 1
  B430: 1
  B450: 1
  B452EP: 1
  B472: 1
  B475: 1
  B480: 1
  B500: 1
  B510EP: 1
  B520EP: 1
  B540: 1
  B541: 3
  B542: 3
  B560: 1
  B561: 3
  B562: 1
  B565EP: 1
  B700: 1
  B7000: 1
  B7001: 1
  B7002: 1
  B7020: 1
  B7021: 1
  B7022: 1
  B7023: 1
  B7050: 1
  B7052: 1
  B720: 1
  B721: 12
  B730: 1
  B731: 1
  B740: 1
  B741: 1
  B7720: 1
  B7721: 12
  B7730: 1
  B7731: 1
  B7740: 1
  B7741: 1
  B800: 1
  B840: 1
  B860: 1
  B861: 1
  B862: 1
  B870: 1
  B871: 1
  SDOBI: 1
  adr: 28
  anum: 1
  b: 80
  bnum: 5
  br: 13
  chemistry