In [3]:
import xml.etree.ElementTree as ET
import json
import os
from collections import defaultdict

def map_paragraphs_to_headings(xml_file_path, output_file=None):
    """
    Process patent XML file and map each paragraph to its most recent heading
    
    Args:
        xml_file_path (str): Path to the XML file
        output_file (str, optional): Path to save the output JSON file
    
    Returns:
        dict: Mapping of headings to their paragraphs
    """
    try:
        # Parse the XML file
        tree = ET.parse(xml_file_path)
        print(tree)
        root = tree.getroot()
        
        # Find the description section
        description = None
        for elem in root.iter():
            if elem.tag == 'description':
                description = elem
                break
        
        if not description:
            print(f"No description section found in {xml_file_path}")
            return {}
        
        # Extract all headings and paragraphs in order
        elements = []
        for elem in description:
            if elem.tag == 'heading' or elem.tag == 'p':
                elements.append(elem)
        
        # Map paragraphs to headings
        heading_to_paragraphs = defaultdict(list)
        current_heading = {"id": None, "text": "No Heading"}
        paragraph_count = 0
        
        for elem in elements:
            if elem.tag == 'heading':
                # Extract heading ID and text
                heading_id = elem.get('id')
                
                # Handle different heading formats (with u, b, etc.)
                if len(elem) > 0 and elem[0].tag in ['u', 'b', 'i']:
                    heading_text = elem[0].text
                else:
                    heading_text = elem.text
                
                if heading_text:
                    heading_text = heading_text.strip()
                else:
                    heading_text = f"Untitled Heading ({heading_id})"
                
                current_heading = {"id": heading_id, "text": heading_text}
            
            elif elem.tag == 'p':
                paragraph_text = elem.text.strip() if elem.text else ""
                if paragraph_text:
                    heading_key = f"{current_heading['text']} ({current_heading['id']})"
                    heading_to_paragraphs[heading_key].append({
                        "p_number": paragraph_count,
                        "text": paragraph_text
                    })
                    paragraph_count += 1
        
        # Convert defaultdict to regular dict for JSON serialization
        result = dict(heading_to_paragraphs)
        
        # Save to file if output file is specified
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            print(f"Saved mapping to {output_file}")
        
        return result
    
    except ET.ParseError as e:
        print(f"Error parsing XML file {xml_file_path}: {e}")
        return {}
    except Exception as e:
        print(f"Error processing {xml_file_path}: {e}")
        return {}

def process_directory(directory_path, output_dir=None):
    """
    Process all XML files in the given directory
    
    Args:
        directory_path (str): Directory containing XML files
        output_dir (str, optional): Directory to save output files
    """
    if not output_dir:
        output_dir = directory_path
    
    os.makedirs(output_dir, exist_ok=True)
    
    xml_files = [f for f in os.listdir(directory_path) if f.endswith('.xml')]
    
    if not xml_files:
        print(f"No XML files found in {directory_path}")
        return
    
    print(f"Found {len(xml_files)} XML files")
    
    for xml_file in xml_files:
        print(f"Processing {xml_file}...")
        xml_path = os.path.join(directory_path, xml_file)
        output_file = os.path.join(output_dir, f"{os.path.splitext(xml_file)[0]}_headings_to_paragraphs.json")
        map_paragraphs_to_headings(xml_path, output_file)

# Example usage
if __name__ == "__main__":
    # Process a single file
    xml_file_path = "EP22914805W1A9.xml"
    output_file = "EP22914805W1A9_mapping.json"
    map_paragraphs_to_headings(xml_file_path, output_file)
    
    # Process all XML files in a directory
    # process_directory("langechain/notebooks")

<xml.etree.ElementTree.ElementTree object at 0xffff873fe3e0>
Saved mapping to EP22914805W1A9_mapping.json


In [5]:
import xml.etree.ElementTree as ET
import json
import os
from collections import defaultdict

def extract_text_from_element(element):
    """
    Extract all text from an element, including text in nested elements.
    """
    text = element.text or ""
    for child in element:
        text += extract_text_from_element(child)
        if child.tail:
            text += child.tail
    return text

def map_paragraphs_to_headings(xml_file_path, output_file=None):
    """
    Process patent XML file and map each paragraph to its most recent heading
    
    Args:
        xml_file_path (str): Path to the XML file
        output_file (str, optional): Path to save the output JSON file
    
    Returns:
        dict: Mapping of headings to their paragraphs
    """
    try:
        # Parse the XML file
        tree = ET.parse(xml_file_path)
        root = tree.getroot()
        
        # Find the description section
        description = None
        for elem in root.iter():
            if elem.tag == 'description':
                description = elem
                break
        
        if not description:
            print(f"No description section found in {xml_file_path}")
            return {}
        
        # Extract all headings and paragraphs in order
        elements = []
        for elem in description:
            if elem.tag == 'heading' or elem.tag == 'p':
                elements.append(elem)
        
        # Map paragraphs to headings
        heading_to_paragraphs = defaultdict(list)
        current_heading = {"id": None, "text": "No Heading"}
        paragraph_count = 0
        
        for elem in elements:
            if elem.tag == 'heading':
                # Extract heading ID
                heading_id = elem.get('id')
                
                # Extract text from element and all its children
                heading_text = extract_text_from_element(elem)
                
                if heading_text:
                    heading_text = heading_text.strip()
                else:
                    heading_text = f"Untitled Heading ({heading_id})"
                
                current_heading = {"id": heading_id, "text": heading_text}
            
            elif elem.tag == 'p':
                paragraph_text = elem.text.strip() if elem.text else ""
                if paragraph_text:
                    heading_key = f"{current_heading['text']} ({current_heading['id']})"
                    heading_to_paragraphs[heading_key].append({
                        "p_number": paragraph_count,
                        "text": paragraph_text
                    })
                    paragraph_count += 1
        
        # Convert defaultdict to regular dict for JSON serialization
        result = dict(heading_to_paragraphs)
        
        # Save to file if output file is specified
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            print(f"Saved mapping to {output_file}")
        
        return result
    
    except ET.ParseError as e:
        print(f"Error parsing XML file {xml_file_path}: {e}")
        return {}
    except Exception as e:
        print(f"Error processing {xml_file_path}: {e}")
        return {}

def process_directory(directory_path, output_dir=None):
    """
    Process all XML files in the given directory
    
    Args:
        directory_path (str): Directory containing XML files
        output_dir (str, optional): Directory to save output files
    """
    if not output_dir:
        output_dir = directory_path
    
    os.makedirs(output_dir, exist_ok=True)
    
    xml_files = [f for f in os.listdir(directory_path) if f.endswith('.xml')]
    
    if not xml_files:
        print(f"No XML files found in {directory_path}")
        return
    
    print(f"Found {len(xml_files)} XML files")
    
    for xml_file in xml_files:
        print(f"Processing {xml_file}...")
        xml_path = os.path.join(directory_path, xml_file)
        output_file = os.path.join(output_dir, f"{os.path.splitext(xml_file)[0]}_headings_to_paragraphs.json")
        map_paragraphs_to_headings(xml_path, output_file)

# Example usage

# Process a single file
xml_file_path = "EP22914805W1A9.xml"
output_file = "EP22914805W1A9_mapping.json"
map_paragraphs_to_headings(xml_file_path, output_file)

Saved mapping to EP22914805W1A9_mapping.json


{'No Heading (None)': [{'p_number': 0,
   'text': 'The present application claims the priority of the following patent application filed on December 27, 2021:'},
  {'p_number': 1,
   'text': 'The above contents are incorporated herein by reference to the extent consistent with the present invention.'}],
 'Field of Technology (h0001)': [{'p_number': 2,
   'text': 'The present invention relates to the field of nucleic acid sequence detection, and in particular to a method for detecting a target nucleic acid. More specifically, the present invention relates to a method for detecting a single or multiple target nucleic acid on digital PCR by using a universal probe.'}],
 'Background (h0002)': [{'p_number': 3,
   'text': 'Polymerase chain reaction (PCR) is a molecular biological technique for enzymatic replication of DNA without using a living organism. PCR is commonly used in medical and biological research laboratories to undertake a variety of tasks, such as diagnosis of infectious disea

In [9]:
import xml.etree.ElementTree as ET
import json
import os
from collections import defaultdict
import re

def extract_text_from_heading(heading_elem):
    """
    Special function to extract text from heading elements with nested b and u tags.
    """
    # Convert the element to a string and use regex to extract text
    elem_str = ET.tostring(heading_elem, encoding='unicode')
    
    # Try to match patterns with nested b and u tags
    patterns = [
        r'<b><u>(.*?)</u></b>',  # <b><u>Text</u></b>
        r'<u><b>(.*?)</b></u>',  # <u><b>Text</b></u>
        r'<b>(.*?)</b>',         # <b>Text</b>
        r'<u>(.*?)</u>',         # <u>Text</u>
        # r'<heading[^>]*>(.*?)</heading>'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, elem_str)
        if match:
            return match.group(1).strip()
    
    # If nothing matched, try the direct text
    return (heading_elem.text or "").strip()

def map_paragraphs_to_headings(xml_file_path, output_file=None):
    """
    Process patent XML file and map each paragraph to its most recent heading
    """
    try:
        # Parse the XML file
        tree = ET.parse(xml_file_path)
        root = tree.getroot()
        
        # Find the description section
        description = None
        for elem in root.iter():
            if elem.tag == 'description':
                description = elem
                break
        
        if not description:
            print(f"No description section found in {xml_file_path}")
            return {}
        
        # Extract all headings and paragraphs in order
        elements = []
        for elem in description:
            if elem.tag == 'heading' or elem.tag == 'p':
                elements.append(elem)
        
        # Map paragraphs to headings
        heading_to_paragraphs = defaultdict(list)
        current_heading = {"id": None, "text": "No Heading"}
        paragraph_count = 0
        
        for elem in elements:
            if elem.tag == 'heading':
                # Extract heading ID
                heading_id = elem.get('id')
                
                # Extract heading text using the special function
                heading_text = extract_text_from_heading(elem)
                
                if not heading_text:
                    heading_text = f"Untitled Heading ({heading_id})"
                
                # Debug print to verify
                print(f"Found heading: ID={heading_id}, Text='{heading_text}'")
                
                current_heading = {"id": heading_id, "text": heading_text}
            
            elif elem.tag == 'p':
                # Get paragraph ID and number
                p_id = elem.get('id')
                p_num = elem.get('num')
                
                # Get paragraph text (strip XML comments)
                paragraph_text = elem.text or ""
                if paragraph_text:
                    paragraph_text = re.sub(r'<!--.*?-->', '', paragraph_text)
                    paragraph_text = paragraph_text.strip()
                
                if paragraph_text:
                    heading_key = f"{current_heading['text']} ({current_heading['id']})"
                    heading_to_paragraphs[heading_key].append({
                        "p_id": p_id,
                        "p_number": p_num or paragraph_count,
                        "text": paragraph_text
                    })
                    paragraph_count += 1
        
        # Convert defaultdict to regular dict for JSON serialization
        result = dict(heading_to_paragraphs)
        
        # Save to file if output file is specified
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            print(f"Saved mapping to {output_file}")
        
        return result
    
    except ET.ParseError as e:
        print(f"Error parsing XML file {xml_file_path}: {e}")
        return {}
    except Exception as e:
        print(f"Error processing {xml_file_path}: {e}")
        import traceback
        traceback.print_exc()
        return {}

# Example usage


In [10]:
# Process a single file
xml_file_path = "EP18823397W1B9.xml"  # Update this path
output_file = "EP18823397W1B9_mapping_1.json"
map_paragraphs_to_headings(xml_file_path, output_file)

Found heading: ID=h0001, Text='Technical field'
Found heading: ID=h0002, Text='Technical background'
Found heading: ID=h0003, Text='Summary of the Invention'
Found heading: ID=h0004, Text='Action and effect of the invention:'
Found heading: ID=h0005, Text='Detailed description of the invention'
Found heading: ID=h0006, Text='Example 1'
Found heading: ID=h0007, Text='(2s)-2-(2,6-dichloro-4-(2-(hydroxy(phenyl)phosphoryl)ethyl)benzamido)-3-(3-(methylsulfon yl)phenyl)propionic acid'
Found heading: ID=h0008, Text='Step A: methoxyphenylphosphoryl chloride (Compound 1.1)'
Found heading: ID=h0009, Text='Step B: Methyl 2,6-dichloro-4-((phenyl(methoxy)phosphoryl)ethynyl)benzoate (Compound 1.2)'
Found heading: ID=h0010, Text='Step C: 2,6-dichloro-4-((hydroxy(phenyl)phosphoryl)ethynyl)benzoic acid (Compound 1.3)'
Found heading: ID=h0011, Text='Step D:'
Found heading: ID=h0012, Text='(2s)-2-(2,6-dichloro-4-((hydroxy(phenyl)phosphoryl)ethynyl)benzamido)-3-(3-(methylsulfon yl)phenyl)benzyl propionate

{'Technical field (h0001)': [{'p_id': 'p0001',
   'p_number': '0001',
   'text': 'The present invention relates to the field of pharmaceuticals, and in particular to phosphorus-containing compound, preparation method thereof, and use for treating dry eye.'}],
 'Technical background (h0002)': [{'p_id': 'p0002',
   'p_number': '0002',
   'text': 'Tears provide long-lasting moisturization and lubrication to the eyes, which is the key to maintaining vision and eye comfort. Tears are composed of water, lipids, mucus, antibodies, and specific proteins with anti-infective properties. These components are secreted by specific glands located around the eyes. When there is an imbalance in the tear system, people will feel dry eyes.'},
  {'p_id': 'p0003',
   'p_number': '0003',
   'text': 'Dry eye syndrome is a common ocular surface inflammatory disease. People with dry eye may experience eye pain, photosensitivity, itching, redness and blurred vision. Dry eye syndrome is caused by multiple induc

In [17]:
import xml.etree.ElementTree as ET
import json
import os
import re

def is_compound_heading(heading_text):
    """Detect if a heading is actually a chemical compound identifier"""
    # Check for typical patterns in compound identifiers
    compound_patterns = [
        r'^\([0-9][sS]\)-',          # Starts with stereochemical descriptor like (2S)-
        r'^[Ss]tep [A-Z]:',          # Step A:, Step B:, etc.
        r'^[Cc]ompound [0-9]',       # Compound 1, Compound 2, etc.
        r'acid$',                    # Ends with "acid"
        r'phosphoryl',               # Contains phosphoryl 
        r'benzamido',                # Contains benzamido
        r'phenyl',                   # Contains phenyl
        r'LCMS ESI',                 # LCMS data
        r'\([A-Z][a-z]?[0-9]\)'      # Contains (C1), (N2), etc.
    ]
    
    # If heading is very long, it's likely a compound
    if len(heading_text) > 30:
        return True
        
    # Check for compound patterns
    for pattern in compound_patterns:
        if re.search(pattern, heading_text):
            return True
            
    return False

def map_paragraphs_to_headings(xml_file_path, output_file=None):
    """
    Process patent XML file and map each paragraph to its most recent heading,
    distinguishing between true section headings and compound identifiers
    """
    try:
        # Parse the XML file
        tree = ET.parse(xml_file_path)
        root = tree.getroot()
        
        # Find the description section
        description = None
        for elem in root.iter():
            if elem.tag == 'description':
                description = elem
                break
        
        if not description:
            print(f"No description section found in {xml_file_path}")
            return {}
        
        # Extract all headings and paragraphs in order
        elements = []
        for elem in description:
            if elem.tag == 'heading' or elem.tag == 'p':
                elements.append(elem)
        
        # Map paragraphs to headings
        document_structure = {
            "section_headings": {},
            "compound_headings": {}
        }
        current_section = {"id": None, "text": "No Heading"}
        current_compound = None
        paragraph_count = 0
        
        for elem in elements:
            if elem.tag == 'heading':
                # Extract heading ID
                heading_id = elem.get('id')
                
                # Extract text from element using regex for nested tags
                elem_str = ET.tostring(elem, encoding='unicode')
                heading_text = ""
                
                # Try to extract from nested tags
                match = re.search(r'<[bu]><[bu]>(.*?)</[bu]></[bu]>', elem_str)
                if match:
                    heading_text = match.group(1)
                else:
                    match = re.search(r'<[bu]>(.*?)</[bu]>', elem_str)
                    if match:
                        heading_text = match.group(1)
                    else:
                        # Get text directly if no nested tags
                        heading_text = elem.text or ""
                
                heading_text = heading_text.strip()
                
                if not heading_text:
                    heading_text = f"Untitled Heading ({heading_id})"
                
                # Determine if this is a compound heading or section heading
                if is_compound_heading(heading_text):
                    current_compound = {"id": heading_id, "text": heading_text}
                else:
                    current_section = {"id": heading_id, "text": heading_text}
                    current_compound = None
            
            elif elem.tag == 'p':
                # Get paragraph ID and number
                p_id = elem.get('id')
                p_num = elem.get('num')
                
                # Extract paragraph text
                paragraph_text = elem.text.strip() if elem.text else ""
                
                if paragraph_text:
                    # Decide which heading this paragraph belongs to
                    if current_compound:
                        heading_key = f"{current_compound['text']} ({current_compound['id']})"
                        container = document_structure["compound_headings"]
                    else:
                        heading_key = f"{current_section['text']} ({current_section['id']})"
                        container = document_structure["section_headings"]
                    
                    if heading_key not in container:
                        container[heading_key] = []
                        
                    container[heading_key].append({
                        "p_id": p_id,
                        "p_number": p_num or paragraph_count,
                        "text": paragraph_text
                    })
                    paragraph_count += 1
        
        # Save to file if output file is specified
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(document_structure, f, indent=2, ensure_ascii=False)
            print(f"Saved mapping to {output_file}")
        
        return document_structure
    
    except Exception as e:
        print(f"Error processing {xml_file_path}: {e}")
        import traceback
        traceback.print_exc()
        return {}

# Example usage
if __name__ == "__main__":
    xml_file_path = "EP23838664W1A9.xml"
    output_file = "EP23838664W1A9_structured.json"
    map_paragraphs_to_headings(xml_file_path, output_file)

Saved mapping to EP23838664W1A9_structured.json


In [16]:
import xml.etree.ElementTree as ET
import json
import os
import re

def extract_heading_text(elem):
    """Extract text from heading element handling nested tags"""
    elem_str = ET.tostring(elem, encoding='unicode')
    for pattern in [r'<[bu]><[bu]>(.*?)</[bu]></[bu]>', r'<[bu]>(.*?)</[bu]>']:
        match = re.search(pattern, elem_str)
        if match:
            return match.group(1).strip()
    return (elem.text or "").strip()

def categorize_heading(heading_text):
    """Categorize heading as main section, example, or compound"""
    # Main section headings
    if re.match(r'^(Technical|Background|Summary|Description|Field|Introduction|Claims)', heading_text):
        return "main_section"
    
    # Example headings
    if re.match(r'^Example\s+\d+', heading_text):
        return "example"
    
    # Look for compound patterns
    compound_patterns = [
        r'^\([0-9][sS]\)-',          # (2S)-
        r'^[Ss]tep [A-Z]:',          # Step A:
        r'^[Cc]ompound [0-9]',       # Compound 1
        r'acid$',                    # ends with "acid"
        r'phosphoryl',               # Contains phosphoryl
        r'benzamido'                 # Contains benzamido
    ]
    for pattern in compound_patterns:
        if re.search(pattern, heading_text):
            return "compound"
    
    # Default to subsection
    return "subsection"

def map_hierarchical_structure(xml_file_path, output_file=None):
    """Create hierarchical structure of patent document"""
    try:
        tree = ET.parse(xml_file_path)
        root = tree.getroot()
        
        # Find description section
        description = None
        for elem in root.iter():
            if elem.tag == 'description':
                description = elem
                break
        
        if not description:
            print(f"No description section found in {xml_file_path}")
            return {}
        
        # Extract all headings and paragraphs
        elements = []
        for elem in description:
            if elem.tag == 'heading' or elem.tag == 'p':
                elements.append(elem)
        
        # Build hierarchical structure
        document_structure = {
            "main_sections": {}
        }
        
        current_main_section = None
        current_example = None
        current_heading = None
        
        for elem in elements:
            if elem.tag == 'heading':
                heading_id = elem.get('id')
                heading_text = extract_heading_text(elem)
                heading_type = categorize_heading(heading_text)
                
                if heading_type == "main_section":
                    # This is a main section heading
                    current_main_section = {
                        "id": heading_id,
                        "title": heading_text,
                        "paragraphs": [],
                        "examples": {},
                        "subsections": {}
                    }
                    document_structure["main_sections"][heading_text] = current_main_section
                    current_example = None
                    current_heading = current_main_section
                
                elif heading_type == "example" and current_main_section:
                    # This is an example heading
                    current_example = {
                        "id": heading_id,
                        "title": heading_text,
                        "paragraphs": [],
                        "compounds": {}
                    }
                    current_main_section["examples"][heading_text] = current_example
                    current_heading = current_example
                
                elif heading_type == "compound" and current_example:
                    # This is a compound within an example
                    compound = {
                        "id": heading_id,
                        "title": heading_text,
                        "paragraphs": []
                    }
                    current_example["compounds"][heading_text] = compound
                    current_heading = compound
                
                elif heading_type == "compound" and current_main_section:
                    # This is a compound directly under a main section
                    compound = {
                        "id": heading_id,
                        "title": heading_text,
                        "paragraphs": []
                    }
                    if "compounds" not in current_main_section:
                        current_main_section["compounds"] = {}
                    current_main_section["compounds"][heading_text] = compound
                    current_heading = compound
                
                elif current_main_section:
                    # This is a regular subsection
                    subsection = {
                        "id": heading_id,
                        "title": heading_text,
                        "paragraphs": []
                    }
                    current_main_section["subsections"][heading_text] = subsection
                    current_heading = subsection
            
            elif elem.tag == 'p' and current_heading:
                p_id = elem.get('id')
                p_num = elem.get('num')
                paragraph_text = elem.text.strip() if elem.text else ""
                
                if paragraph_text:
                    current_heading["paragraphs"].append({
                        "p_id": p_id,
                        "p_number": p_num,
                        "text": paragraph_text
                    })
        
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(document_structure, f, indent=2, ensure_ascii=False)
            print(f"Saved hierarchical structure to {output_file}")
        
        return document_structure
    
    except Exception as e:
        print(f"Error processing {xml_file_path}: {e}")
        import traceback
        traceback.print_exc()
        return {}

# Example usage

xml_file_path = "EP23838664W1A9.xml"
output_file = "EP23838664W1A9_hierarchical_.json"
map_hierarchical_structure(xml_file_path, output_file)

Saved hierarchical structure to EP23838664W1A9_hierarchical_.json


{'main_sections': {}}

In [18]:
import xml.etree.ElementTree as ET
import json
import re
import os

def extract_heading_text(elem):
    """Extract text from heading element handling nested tags"""
    elem_str = ET.tostring(elem, encoding='unicode')
    for pattern in [r'<[bu]><[bu]>(.*?)</[bu]></[bu]>', r'<[bu]>(.*?)</[bu]>']:
        match = re.search(pattern, elem_str)
        if match:
            return match.group(1).strip()
    return (elem.text or "").strip()

def is_main_heading(text):
    """Determine if this is a main section heading"""
    main_heading_patterns = [
        r'^(Technical|Field|Background|Summary|Introduction|Description)',
        r'(^|of\s+)(the\s+)?[Ii]nvention',
        r'^[Cc]laims',
        r'^[Aa]bstract'
    ]
    
    for pattern in main_heading_patterns:
        if re.search(pattern, text):
            return True
    return False

def extract_text_from_element(elem):
    """Extract all text from element and its children"""
    if elem is None:
        return ""
    text = elem.text or ""
    for child in elem:
        text += extract_text_from_element(child)
        if child.tail:
            text += child.tail
    return text

def extract_patent_data(xml_file_path, output_file=None):
    """Extract patent data including bibliographic information, main sections and claims"""
    try:
        tree = ET.parse(xml_file_path)
        root = tree.getroot()
        
        # Prepare structure for extraction
        patent_data = {
            "bibliographic_data": {},
            "main_sections": [],
            "claims": []
        }
        
        # Extract basic attributes from root
        if root.tag == 'ep-patent-document':
            patent_data["bibliographic_data"].update({
                "doc_id": root.get('id'),
                "file": root.get('file'),
                "language": root.get('lang'),
                "country": root.get('country'),
                "doc_number": root.get('doc-number'),
                "kind_code": root.get('kind'),
                "publication_date": root.get('date-publ')
            })
        
        # Process SDOBI (Standard Document BIbliography) section
        sdobi = root.find('.//SDOBI')
        if sdobi:
            # Extract title
            title_elem = sdobi.find('.//B541/STIENG') or sdobi.find('.//B540/STIENG')
            if title_elem is not None:
                patent_data["bibliographic_data"]["title"] = extract_text_from_element(title_elem)
            
            # Extract inventors
            inventors = []
            for inv_elem in sdobi.findall('.//B721/INNOG'):
                name_elem = inv_elem.find('.//NAM')
                if name_elem is not None:
                    inventor_name = ""
                    snm = name_elem.find('SNM')
                    fnm = name_elem.find('FNM')
                    if snm is not None:
                        inventor_name += (snm.text or "")
                    if fnm is not None:
                        inventor_name += ", " + (fnm.text or "")
                    if inventor_name:
                        inventors.append(inventor_name.strip())
            
            patent_data["bibliographic_data"]["inventors"] = inventors
            
            # Extract applicants/assignees
            applicants = []
            for app_elem in sdobi.findall('.//B731/ASGNG'):
                name_elem = app_elem.find('.//NAM')
                if name_elem is not None:
                    applicant_name = extract_text_from_element(name_elem)
                    if applicant_name:
                        applicants.append(applicant_name.strip())
            
            patent_data["bibliographic_data"]["applicants"] = applicants
            
            # Extract application date
            app_date = sdobi.find('.//B220/DATE')
            if app_date is not None and app_date.text:
                patent_data["bibliographic_data"]["application_date"] = app_date.text
            
            # Extract IPC classifications
            ipcs = []
            for ipc_elem in sdobi.findall('.//B510/IPCR'):
                ipc_text = extract_text_from_element(ipc_elem)
                if ipc_text:
                    ipcs.append(ipc_text.strip())
            
            patent_data["bibliographic_data"]["ipc_classes"] = ipcs
            
            # Extract application number
            app_num = sdobi.find('.//B210/DNUM')
            if app_num is not None and app_num.text:
                patent_data["bibliographic_data"]["application_number"] = app_num.text
        
        # Process abstract
        abstract = root.find('.//abstract')
        if abstract:
            abstract_text = ""
            for p in abstract.findall('.//p'):
                if p.text:
                    abstract_text += p.text.strip() + " "
            
            if abstract_text.strip():
                patent_data["bibliographic_data"]["abstract"] = abstract_text.strip()
        
        # Process description section
        description = None
        for elem in root.iter():
            if elem.tag == 'description':
                description = elem
                break
        
        if description:
            current_main_section = None
            
            # Process elements in description
            for elem in description:
                if elem.tag == 'heading':
                    heading_text = extract_heading_text(elem)
                    heading_id = elem.get('id')
                    
                    if is_main_heading(heading_text):
                        # This is a main section heading
                        current_main_section = {
                            "heading_id": heading_id,
                            "heading_text": heading_text,
                            "paragraphs": []
                        }
                        patent_data["main_sections"].append(current_main_section)
                
                elif elem.tag == 'p' and current_main_section:
                    # Add paragraph to current main section
                    p_text = elem.text.strip() if elem.text else ""
                    if p_text:
                        current_main_section["paragraphs"].append({
                            "p_id": elem.get('id'),
                            "text": p_text
                        })
        
        # Process claims section
        for claims_elem in root.findall('.//claims'):
            # Extract claims - handle different claim structures
            for claim_elem in claims_elem.findall('.//claim'):
                claim_text = ""
                claim_num = claim_elem.get('num')
                
                # Extract claim text from nested structure
                for text_elem in claim_elem.iter():
                    if text_elem.tag in ['claim-text', 'p'] and text_elem.text:
                        claim_text += text_elem.text.strip() + " "
                
                if claim_text.strip():
                    patent_data["claims"].append({
                        "claim_num": claim_num,
                        "text": claim_text.strip()
                    })
        
        # Save to file if needed
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(patent_data, f, indent=2, ensure_ascii=False)
            print(f"Saved patent data to {output_file}")
        
        return patent_data
    
    except Exception as e:
        print(f"Error extracting patent data: {e}")
        import traceback
        traceback.print_exc()
        return {}

def process_directory(directory_path, output_dir=None):
    """Process all XML files in the given directory"""
    if output_dir is None:
        output_dir = directory_path
    
    os.makedirs(output_dir, exist_ok=True)
    
    xml_files = [f for f in os.listdir(directory_path) if f.endswith('.xml')]
    
    print(f"Found {len(xml_files)} XML files")
    
    for xml_file in xml_files:
        print(f"Processing {xml_file}...")
        xml_path = os.path.join(directory_path, xml_file)
        output_file = os.path.join(output_dir, f"{os.path.splitext(xml_file)[0]}_data.json")
        extract_patent_data(xml_path, output_file)

# Example usage

xml_file_path = "EP18823397W1B9.xml"  # Change to your file
output_file = "EP18823397W1B9_complete.json"
extract_patent_data(xml_file_path, output_file)
    
    # To process all XML files in a directory:
    # process_directory("path/to/your/directory")

Saved patent data to EP18823397W1B9_complete.json


{'bibliographic_data': {'doc_id': 'EP18823397B9W1',
  'file': 'EP18823397W1B9.xml',
  'language': 'en',
  'country': 'EP',
  'doc_number': '3647315',
  'kind_code': 'B9',
  'publication_date': '20250604',
  'inventors': [],
  'applicants': [],
  'ipc_classes': []},
 'main_sections': [{'heading_id': 'h0001',
   'heading_text': 'Technical field',
   'paragraphs': [{'p_id': 'p0001',
     'text': 'The present invention relates to the field of pharmaceuticals, and in particular to phosphorus-containing compound, preparation method thereof, and use for treating dry eye.'}]},
  {'heading_id': 'h0002',
   'heading_text': 'Technical background',
   'paragraphs': [{'p_id': 'p0002',
     'text': 'Tears provide long-lasting moisturization and lubrication to the eyes, which is the key to maintaining vision and eye comfort. Tears are composed of water, lipids, mucus, antibodies, and specific proteins with anti-infective properties. These components are secreted by specific glands located around the 

In [25]:
import xml.etree.ElementTree as ET
import re
import json
import os
from collections import defaultdict

def extract_patent_data(xml_file_path, output_file=None):
    """
    Extract complete patent data including detailed bibliographic information,
    headers, and claims from an EPO XML patent document.
    """
    try:
        tree = ET.parse(xml_file_path)
        root = tree.getroot()
        
        # Initialize data structure
        patent_data = {
            "bibliographic_data": {},
            "main_sections": [],
            "claims": []
        }
        
        # Extract root document attributes
        if root.tag == 'ep-patent-document':
            patent_data["bibliographic_data"].update({
                "doc_id": root.get('id'),
                "file": root.get('file'),
                "language": root.get('lang'),
                "country": root.get('country'),
                "doc_number": root.get('doc-number'),
                "kind_code": root.get('kind'),
                "correction_code": root.get('correction-code'),
                "publication_date": root.get('date-publ'),
                "status": root.get('status'),
                "dtd_version": root.get('dtd-version')
            })
        
        # Process SDOBI (Standard Document BIbliography) section
        sdobi = root.find('.//SDOBI')
        if sdobi:
            # Extract document type/title (B121)
            doc_type_elem = sdobi.find('.//B121')
            if doc_type_elem is not None and doc_type_elem.text:
                patent_data["bibliographic_data"]["document_type"] = doc_type_elem.text
            
            # Extract document kind code (B130)
            kind_elem = sdobi.find('.//B130')
            if kind_elem is not None and kind_elem.text:
                patent_data["bibliographic_data"]["kind"] = kind_elem.text

            # Extract publication date (B140)
            pub_date_elem = sdobi.find('.//B140/date')
            if pub_date_elem is not None and pub_date_elem.text:
                patent_data["bibliographic_data"]["publication_date_full"] = pub_date_elem.text
            
            # Extract correction information (B150)
            if sdobi.find('.//B150') is not None:
                correction_info = {}
                correction_code = sdobi.find('.//B151')
                if correction_code is not None and correction_code.text:
                    correction_info["correction_code"] = correction_code.text
                
                # Extract correction details from B154 and B155
                for section in ['B154', 'B155']:
                    details = []
                    for i, lang_elem in enumerate(sdobi.findall(f'.//{section}/B{section[1:]}1')):
                        if i % 2 == 0 and lang_elem.text:
                            lang = lang_elem.text
                            # Get corresponding text from next element
                            if i+1 < len(sdobi.findall(f'.//{section}/B{section[1:]}2')):
                                text_elem = sdobi.findall(f'.//{section}/B{section[1:]}2')[i//2]
                                if text_elem is not None and text_elem.text:
                                    details.append({
                                        "language": lang,
                                        "text": text_elem.text
                                    })
                    
                    if details:
                        correction_info[f"correction_details_{section.lower()}"] = details
                
                patent_data["bibliographic_data"]["correction_information"] = correction_info
            
            # Extract application information
            app_num = sdobi.find('.//B210')
            if app_num is not None and app_num.text:
                patent_data["bibliographic_data"]["application_number"] = app_num.text
            
            app_date = sdobi.find('.//B220/date')
            if app_date is not None and app_date.text:
                patent_data["bibliographic_data"]["application_date"] = app_date.text
            
            # Extract priority information (B300)
            priorities = []
            priority_section = sdobi.find('.//B300')
            if priority_section is not None:
                for i in range(len(priority_section.findall('.//B310'))):
                    priority = {}
                    number = priority_section.findall('.//B310')[i] if i < len(priority_section.findall('.//B310')) else None
                    date = priority_section.findall('.//B320/date')[i] if i < len(priority_section.findall('.//B320/date')) else None
                    country = priority_section.findall('.//B330/ctry')[i] if i < len(priority_section.findall('.//B330/ctry')) else None
                    
                    if number is not None and number.text:
                        priority["number"] = number.text
                    if date is not None and date.text:
                        priority["date"] = date.text
                    if country is not None and country.text:
                        priority["country"] = country.text
                    
                    if priority:
                        priorities.append(priority)
            
            if priorities:
                patent_data["bibliographic_data"]["priorities"] = priorities
            
            # Extract bulletin information
            for field in ['B405', 'B430', 'B450']:
                bulletin_elem = sdobi.find(f'.//{field}')
                if bulletin_elem is not None:
                    date_elem = bulletin_elem.find('./date')
                    bnum_elem = bulletin_elem.find('./bnum')
                    
                    info = {}
                    if date_elem is not None and date_elem.text:
                        info["date"] = date_elem.text
                    if bnum_elem is not None and bnum_elem.text:
                        info["bulletin_number"] = bnum_elem.text
                    
                    if info:
                        field_name = {
                            'B405': 'corrigendum_bulletin',
                            'B430': 'publication_bulletin',
                            'B450': 'grant_bulletin'
                        }.get(field)
                        patent_data["bibliographic_data"][field_name] = info
            
            # Extract IPC (International Patent Classification) information
            ipc_classes = []
            for ipc_elem in sdobi.findall('.//classification-ipcr'):
                text_elem = ipc_elem.find('./text')
                if text_elem is not None and text_elem.text:
                    ipc_text = text_elem.text.strip()
                    ipc_classes.append(ipc_text)
            
            if ipc_classes:
                patent_data["bibliographic_data"]["ipc_classes"] = ipc_classes
            
            # Extract CPC (Cooperative Patent Classification) information
            cpc_classes = []
            for cpc_elem in sdobi.findall('.//classification-cpc'):
                text_elem = cpc_elem.find('./text')
                if text_elem is not None and text_elem.text:
                    cpc_text = text_elem.text.strip()
                    cpc_classes.append(cpc_text)
            
            if cpc_classes:
                patent_data["bibliographic_data"]["cpc_classes"] = cpc_classes
            
            # Extract title information
            title_info = {}
            title_section = sdobi.find('.//B540')
            if title_section is not None:
                for i in range(len(title_section.findall('.//B541'))):
                    lang = title_section.findall('.//B541')[i] if i < len(title_section.findall('.//B541')) else None
                    title = title_section.findall('.//B542')[i] if i < len(title_section.findall('.//B542')) else None
                    
                    if lang is not None and lang.text and title is not None and title.text:
                        title_info[lang.text] = title.text
            
            if title_info:
                patent_data["bibliographic_data"]["title"] = title_info
            
            # Extract inventors information
            inventors = []
            for inv_elem in sdobi.findall('.//B721'):
                inventor = {}
                
                name_elem = inv_elem.find('./snm')
                if name_elem is not None and name_elem.text:
                    inventor["name"] = name_elem.text
                
                # Extract address information
                addr_elem = inv_elem.find('./adr')
                if addr_elem is not None:
                    address = {}
                    
                    street_elem = addr_elem.find('./str')
                    if street_elem is not None and street_elem.text:
                        address["street"] = street_elem.text
                    
                    city_elem = addr_elem.find('./city')
                    if city_elem is not None and city_elem.text:
                        address["city"] = city_elem.text
                    
                    country_elem = addr_elem.find('./ctry')
                    if country_elem is not None and country_elem.text:
                        address["country"] = country_elem.text
                    
                    if address:
                        inventor["address"] = address
                
                if inventor:
                    inventors.append(inventor)
            
            if inventors:
                patent_data["bibliographic_data"]["inventors"] = inventors
            
            # Extract applicant information
            applicants = []
            for app_elem in sdobi.findall('.//B731'):
                applicant = {}
                
                name_elem = app_elem.find('./snm')
                if name_elem is not None and name_elem.text:
                    applicant["name"] = name_elem.text
                
                id_elem = app_elem.find('./iid')
                if id_elem is not None and id_elem.text:
                    applicant["id"] = id_elem.text
                
                ref_elem = app_elem.find('./irf')
                if ref_elem is not None and ref_elem.text:
                    applicant["reference"] = ref_elem.text
                
                # Extract address information
                addr_elem = app_elem.find('./adr')
                if addr_elem is not None:
                    address = {}
                    
                    street_elem = addr_elem.find('./str')
                    if street_elem is not None and street_elem.text:
                        address["street"] = street_elem.text
                    
                    city_elem = addr_elem.find('./city')
                    if city_elem is not None and city_elem.text:
                        address["city"] = city_elem.text
                    
                    country_elem = addr_elem.find('./ctry')
                    if country_elem is not None and country_elem.text:
                        address["country"] = country_elem.text
                    
                    if address:
                        applicant["address"] = address
                
                if applicant:
                    applicants.append(applicant)
            
            if applicants:
                patent_data["bibliographic_data"]["applicants"] = applicants
            
            # Extract representative/agent information
            representatives = []
            for rep_elem in sdobi.findall('.//B741'):
                representative = {}
                
                name_elem = rep_elem.find('./snm')
                if name_elem is not None and name_elem.text:
                    representative["name"] = name_elem.text
                
                id_elem = rep_elem.find('./iid')
                if id_elem is not None and id_elem.text:
                    representative["id"] = id_elem.text
                
                # Extract address information
                addr_elem = rep_elem.find('./adr')
                if addr_elem is not None:
                    address = {}
                    
                    street_elem = addr_elem.find('./str')
                    if street_elem is not None and street_elem.text:
                        address["street"] = street_elem.text
                    
                    city_elem = addr_elem.find('./city')
                    if city_elem is not None and city_elem.text:
                        address["city"] = city_elem.text
                    
                    country_elem = addr_elem.find('./ctry')
                    if country_elem is not None and country_elem.text:
                        address["country"] = country_elem.text
                    
                    if address:
                        representative["address"] = address
                
                if representative:
                    representatives.append(representative)
            
            if representatives:
                patent_data["bibliographic_data"]["representatives"] = representatives
            
            # Extract designated states information
            designated_states = []
            states_elem = sdobi.find('.//B840')
            if states_elem is not None:
                for state_elem in states_elem.findall('./ctry'):
                    if state_elem.text:
                        designated_states.append(state_elem.text)
            
            if designated_states:
                patent_data["bibliographic_data"]["designated_states"] = designated_states
        
        # Process abstract
        abstract = root.find('.//abstract')
        if abstract:
            abstract_text = ""
            for p in abstract.findall('.//p'):
                p_text = p.text or ""
                # Remove XML comments
                p_text = re.sub(r'<!--.*?-->', '', p_text)
                if p_text.strip():
                    abstract_text += p_text.strip() + " "
            
            if abstract_text.strip():
                patent_data["bibliographic_data"]["abstract"] = abstract_text.strip()
        
        # Extract main sections (description)
        description = root.find('.//description')
        if description:
            current_heading = None
            
            for elem in description.iter():
                if elem.tag == 'heading':
                    heading_id = elem.get('id')
                    heading_text = ""
                    
                    # Extract heading text from nested tags
                    if len(elem) > 0:
                        if elem[0].tag in ['u', 'b', 'i']:
                            if len(elem[0]) > 0 and elem[0][0].tag in ['u', 'b', 'i']:
                                heading_text = elem[0][0].text or ""
                            else:
                                heading_text = elem[0].text or ""
                        else:
                            heading_text = elem[0].text or ""
                    else:
                        heading_text = elem.text or ""
                    
                    # If still no text, try converting to string and regex
                    if not heading_text.strip():
                        elem_str = ET.tostring(elem, encoding='unicode')
                        match = re.search(r'<[bu]><[bu]>(.*?)</[bu]></[bu]>', elem_str)
                        if match:
                            heading_text = match.group(1)
                        else:
                            match = re.search(r'<[bu]>(.*?)</[bu]>', elem_str)
                            if match:
                                heading_text = match.group(1)
                    
                    heading_text = heading_text.strip()
                    
                    # Only add main section headings (filtering out compound descriptors)
                    if heading_text and not re.search(r'^\([0-9][sS]\)-|^[Ss]tep [A-Z]:|^[Cc]ompound [0-9]', heading_text):
                        current_heading = {
                            "heading_id": heading_id,
                            "heading_text": heading_text,
                            "paragraphs": []
                        }
                        patent_data["main_sections"].append(current_heading)
                
                elif elem.tag == 'p' and current_heading:
                    p_id = elem.get('id')
                    p_num = elem.get('num')
                    p_text = elem.text or ""
                    
                    # Remove XML comments
                    p_text = re.sub(r'<!--.*?-->', '', p_text)
                    
                    if p_text.strip():
                        current_heading["paragraphs"].append({
                            "p_id": p_id,
                            "p_number": p_num,
                            "text": p_text.strip()
                        })
        
        # Extract claims
        claims_section = root.find('.//claims')
        if claims_section:
            for claim_elem in claims_section.findall('.//claim'):
                claim_num = claim_elem.get('num')
                claim_text = ""
                
                # Extract claim text from various possible structures
                for text_elem in claim_elem.iter():
                    if text_elem.tag in ['claim-text', 'p'] and text_elem.text:
                        text = text_elem.text.strip()
                        if text:
                            claim_text += text + " "
                
                if claim_text.strip():
                    patent_data["claims"].append({
                        "claim_number": claim_num,
                        "text": claim_text.strip()
                    })
        
        # Save to file if needed
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(patent_data, f, indent=2, ensure_ascii=False)
            print(f"Saved patent data to {output_file}")
        
        return patent_data
    
    except Exception as e:
        print(f"Error extracting patent data: {e}")
        import traceback
        traceback.print_exc()
        return {}

def process_directory(directory_path, output_dir=None):
    """Process all XML files in the given directory"""
    if output_dir is None:
        output_dir = directory_path
    
    os.makedirs(output_dir, exist_ok=True)
    
    xml_files = [f for f in os.listdir(directory_path) if f.endswith('.xml')]
    
    print(f"Found {len(xml_files)} XML files")
    
    for xml_file in xml_files:
        print(f"Processing {xml_file}...")
        xml_path = os.path.join(directory_path, xml_file)
        output_file = os.path.join(output_dir, f"{os.path.splitext(xml_file)[0]}_full_data.json")
        extract_patent_data(xml_path, output_file)

# Run the extraction
# Process a single file
xml_file_path = "EP18823397W1B9.xml"  # Change to your file path
output_file = "EP18823397W1B9_full_data.json"
extract_patent_data(xml_file_path, output_file)


Saved patent data to EP18823397W1B9_full_data.json


{'bibliographic_data': {'doc_id': 'EP18823397B9W1',
  'file': 'EP18823397W1B9.xml',
  'language': 'en',
  'country': 'EP',
  'doc_number': '3647315',
  'kind_code': 'B9',
  'correction_code': 'W1',
  'publication_date': '20250604',
  'status': 'c',
  'dtd_version': 'ep-patent-document-v1-7',
  'document_type': 'CORRECTED EUROPEAN PATENT SPECIFICATION',
  'kind': 'B9',
  'publication_date_full': '20250604',
  'correction_information': {'correction_code': 'W1',
   'correction_details_b155': [{'language': 'de', 'text': 'Ansprüche DE'},
    {'language': 'de', 'text': 'Ansprüche EN'},
    {'language': 'en', 'text': 'Ansprüche FR'},
    {'language': 'fr', 'text': 'Claims DE'}]},
  'application_number': '18823397.7',
  'application_date': '20180521',
  'priorities': [{'number': '201710502653',
    'date': '20170627',
    'country': 'CN'},
   {'number': '201810291023', 'date': '20180403', 'country': 'CN'}],
  'corrigendum_bulletin': {'date': '20250604', 'bulletin_number': '202523'},
  'publica

In [27]:
import xml.etree.ElementTree as ET
import re
import json
import os
from collections import defaultdict

def is_main_heading(heading_text):
    """
    Determine if a heading is a main section heading (not a compound or example).
    """
    # List of common main section heading patterns
    main_heading_patterns = [
        r'^(Technical|Field|Background|Summary|Introduction|Description)',
        r'^(Action|Effect|Detailed|Brief|Abstract)',
        r'^(Advantage|Claim[s]?$)',  # Example as a main section, not specific examples
        r'^(Embodiment[s]?|Disclosure|Object|Figure|Drawing)',
        r'of\s+([Tt]he\s+)?[Ii]nvention',
    ]
    
    # Check if it matches any main heading pattern
    for pattern in main_heading_patterns:
        if re.search(pattern, heading_text):
            return True
    
    # Check for compound or example patterns to exclude
    compound_patterns = [
        r'^\([0-9][sS]\)-',            # (2S)-
        r'^[Ss]tep [A-Z]:',            # Step A:
        r'^[Cc]ompound [0-9]',         # Compound 1
        r'acid$',                      # Ends with "acid"
        r'phosphoryl',                 # Contains phosphoryl
        r'benzamido',                  # Contains benzamido
        r'phenyl',                     # Contains phenyl
        r'Example\s+\d+[^$]',          # Example 1 (with something after it)
        r'[Cc]hemical\s+[Ff]ormula'    # Chemical formula
    ]
    
    for pattern in compound_patterns:
        if re.search(pattern, heading_text):
            return False
    
    # If the heading is very long, it's probably a compound
    if len(heading_text) > 30:
        return False
        
    # Assume it's a main heading if it's short and doesn't match compound patterns
    return True

def extract_heading_text(elem):
    """Extract text from heading element handling nested tags"""
    # Try to convert the element to a string and use regex
    elem_str = ET.tostring(elem, encoding='unicode')
    
    # Try various patterns to extract the text
    for pattern in [r'<[bu]><[bu]>(.*?)</[bu]></[bu]>', r'<[bu]>(.*?)</[bu]>']:
        match = re.search(pattern, elem_str)
        if match:
            return match.group(1).strip()
    
    # If regex didn't work, try direct extraction
    if len(elem) > 0:
        if elem[0].tag in ['u', 'b', 'i']:
            if len(elem[0]) > 0 and elem[0][0].tag in ['u', 'b', 'i']:
                return (elem[0][0].text or "").strip()
            else:
                return (elem[0].text or "").strip()
        else:
            return (elem[0].text or "").strip()
    else:
        return (elem.text or "").strip()

def extract_patent_data(xml_file_path, output_file=None, debug=False):
    """
    Extract complete patent data including detailed bibliographic information,
    main headers (without compounds/examples), and claims.
    """
    try:
        tree = ET.parse(xml_file_path)
        root = tree.getroot()
        
        # Initialize data structure
        patent_data = {
            "bibliographic_data": {},
            "main_sections": [],
            "claims": []
        }
        
        # Extract root document attributes
        if root.tag == 'ep-patent-document':
            patent_data["bibliographic_data"].update({
                "doc_id": root.get('id'),
                "file": root.get('file'),
                "language": root.get('lang'),
                "country": root.get('country'),
                "doc_number": root.get('doc-number'),
                "kind_code": root.get('kind'),
                "correction_code": root.get('correction-code'),
                "publication_date": root.get('date-publ'),
                "status": root.get('status'),
                "dtd_version": root.get('dtd-version')
            })
        
        # Process SDOBI (Standard Document BIbliography) section
        # [BIBLIOGRAPHIC DATA EXTRACTION - SAME AS BEFORE]
        # ...
        
        # Extract main sections (description) - FIXED APPROACH
        description = root.find('.//description')
        if description:
            # First, collect all elements in order
            ordered_elements = []
            
            for elem in description:
                if elem.tag in ['heading', 'p']:
                    ordered_elements.append(elem)
            
            # Process elements and associate paragraphs with headings
            current_main_section = None
            
            for elem in ordered_elements:
                if elem.tag == 'heading':
                    heading_id = elem.get('id')
                    heading_text = extract_heading_text(elem)
                    
                    if heading_text and is_main_heading(heading_text):
                        current_main_section = {
                            "heading_id": heading_id,
                            "heading_text": heading_text,
                            "paragraphs": []
                        }
                        patent_data["main_sections"].append(current_main_section)
                        
                        if debug:
                            print(f"Main heading found: {heading_text} (ID: {heading_id})")
                    elif debug:
                        print(f"Skipping non-main heading: {heading_text} (ID: {heading_id})")
                
                elif elem.tag == 'p' and current_main_section:
                    p_id = elem.get('id')
                    p_num = elem.get('num')
                    p_text = elem.text or ""
                    
                    # Remove XML comments
                    p_text = re.sub(r'<!--.*?-->', '', p_text)
                    
                    if p_text.strip():
                        current_main_section["paragraphs"].append({
                            "p_id": p_id,
                            "p_number": p_num,
                            "text": p_text.strip()
                        })
        
        # Extract claims
        claims_section = root.find('.//claims')
        if claims_section:
            for claim_elem in claims_section.findall('.//claim'):
                claim_num = claim_elem.get('num')
                claim_text = ""
                
                # Extract claim text from various possible structures
                for text_elem in claim_elem.iter():
                    if text_elem.tag in ['claim-text', 'p'] and text_elem.text:
                        text = text_elem.text.strip()
                        if text:
                            claim_text += text + " "
                
                if claim_text.strip():
                    patent_data["claims"].append({
                        "claim_number": claim_num,
                        "text": claim_text.strip()
                    })
        
        # Save to file if needed
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(patent_data, f, indent=2, ensure_ascii=False)
            print(f"Saved patent data to {output_file}")
        
        return patent_data
    
    except Exception as e:
        print(f"Error extracting patent data: {e}")
        import traceback
        traceback.print_exc()
        return {}

# Run the extraction
    # Process a single file with debug info
xml_file_path = "EP18823397W1B9.xml"  # Change to your file path
output_file = "EP18823397W1B9_full_data_1.json"
extract_patent_data(xml_file_path, output_file, debug=True)

Main heading found: Technical field (ID: h0001)
Main heading found: Technical background (ID: h0002)
Main heading found: Summary of the Invention (ID: h0003)
Main heading found: Action and effect of the invention: (ID: h0004)
Main heading found: Detailed description of the invention (ID: h0005)
Main heading found: Example 1 (ID: h0006)
Skipping non-main heading: (2s)-2-(2,6-dichloro-4-(2-(hydroxy(phenyl)phosphoryl)ethyl)benzamido)-3-(3-(methylsulfon yl)phenyl)propionic acid (ID: h0007)
Skipping non-main heading: Step A: methoxyphenylphosphoryl chloride (Compound 1.1) (ID: h0008)
Skipping non-main heading: Step B: Methyl 2,6-dichloro-4-((phenyl(methoxy)phosphoryl)ethynyl)benzoate (Compound 1.2) (ID: h0009)
Skipping non-main heading: Step C: 2,6-dichloro-4-((hydroxy(phenyl)phosphoryl)ethynyl)benzoic acid (Compound 1.3) (ID: h0010)
Skipping non-main heading: Step D: (ID: h0011)
Skipping non-main heading: (2s)-2-(2,6-dichloro-4-((hydroxy(phenyl)phosphoryl)ethynyl)benzamido)-3-(3-(methylsul

{'bibliographic_data': {'doc_id': 'EP18823397B9W1',
  'file': 'EP18823397W1B9.xml',
  'language': 'en',
  'country': 'EP',
  'doc_number': '3647315',
  'kind_code': 'B9',
  'correction_code': 'W1',
  'publication_date': '20250604',
  'status': 'c',
  'dtd_version': 'ep-patent-document-v1-7'},
 'main_sections': [{'heading_id': 'h0001',
   'heading_text': 'Technical field',
   'paragraphs': [{'p_id': 'p0001',
     'p_number': '0001',
     'text': 'The present invention relates to the field of pharmaceuticals, and in particular to phosphorus-containing compound, preparation method thereof, and use for treating dry eye.'}]},
  {'heading_id': 'h0002',
   'heading_text': 'Technical background',
   'paragraphs': [{'p_id': 'p0002',
     'p_number': '0002',
     'text': 'Tears provide long-lasting moisturization and lubrication to the eyes, which is the key to maintaining vision and eye comfort. Tears are composed of water, lipids, mucus, antibodies, and specific proteins with anti-infective pr

In [30]:
import xml.etree.ElementTree as ET
import re
import json
import os
from collections import defaultdict

def extract_heading_text(elem):
    """Extract text from heading element handling nested formatting tags"""
    # Try to convert the element to a string and use regex
    elem_str = ET.tostring(elem, encoding='unicode')
    
    # Try various patterns to extract the text
    for pattern in [r'<[bu]><[bu]>(.*?)</[bu]></[bu]>', r'<[bu]>(.*?)</[bu]>']:
        match = re.search(pattern, elem_str)
        if match:
            return match.group(1).strip()
    
    # If regex didn't work, try direct extraction
    if len(elem) > 0:
        if elem[0].tag in ['u', 'b', 'i']:
            if len(elem[0]) > 0 and elem[0][0].tag in ['u', 'b', 'i']:
                return (elem[0][0].text or "").strip()
            else:
                return (elem[0].text or "").strip()
        else:
            return (elem[0].text or "").strip()
    else:
        return (elem.text or "").strip()

def is_main_heading(heading_text):
    """
    Determine if a heading is a main section heading (not a compound or example).
    """
    # List of common main section heading patterns
    main_heading_patterns = [
        r'^(Technical|Field|Background|Summary|Introduction|Description)',
        r'^(Action|Effect|Detailed|Brief|Abstract)',
        r'^(Advantage|Example[s]?$|Claim[s]?$)',  # Only match "Example" or "Examples" as whole words
        r'^(Embodiment[s]?|Disclosure|Object|Figure|Drawing)',
        r'of\s+([Tt]he\s+)?[Ii]nvention',
    ]
    
    # Check if it matches any main heading pattern
    for pattern in main_heading_patterns:
        if re.search(pattern, heading_text):
            return True
    
    # Check for compound or example patterns to exclude
    compound_patterns = [
        r'^\([0-9][sS]\)-',            # (2S)-
        r'^[Ss]tep [A-Z]:',            # Step A:
        r'^[Cc]ompound [0-9]',         # Compound 1
        r'acid$',                      # Ends with "acid"
        r'phosphoryl',                 # Contains phosphoryl
        r'benzamido',                  # Contains benzamido
        r'phenyl',                     # Contains phenyl
        r'Example\s+\d+[^$]',          # Example 1 (with something after it)
        r'[Cc]hemical\s+[Ff]ormula'    # Chemical formula
    ]
    
    for pattern in compound_patterns:
        if re.search(pattern, heading_text):
            return False
    
    # If the heading is very long, it's probably a compound
    if len(heading_text) > 30:
        return False
        
    # Assume it's a main heading if it's short and doesn't match compound patterns
    return True

def extract_bibliographic_data(root):
    """
    Extract bibliographic data from patent XML.
    
    Args:
        root: Root element of the XML document
        
    Returns:
        dict: Bibliographic data
    """
    bibliographic_data = {}
    
    # Extract root document attributes
    if root.tag == 'ep-patent-document':
        bibliographic_data.update({
            "doc_id": root.get('id'),
            "file": root.get('file'),
            "language": root.get('lang'),
            "country": root.get('country'),
            "doc_number": root.get('doc-number'),
            "kind_code": root.get('kind'),
            "correction_code": root.get('correction-code'),
            "publication_date": root.get('date-publ'),
            "status": root.get('status'),
            "dtd_version": root.get('dtd-version')
        })
    
    # Process SDOBI (Standard Document BIbliography) section
    sdobi = root.find('.//SDOBI')
    if sdobi:
        # Extract document type/title (B121)
        doc_type_elem = sdobi.find('.//B121')
        if doc_type_elem is not None and doc_type_elem.text:
            bibliographic_data["document_type"] = doc_type_elem.text
        
        # Extract document kind code (B130)
        kind_elem = sdobi.find('.//B130')
        if kind_elem is not None and kind_elem.text:
            bibliographic_data["kind"] = kind_elem.text

        # Extract publication date (B140)
        pub_date_elem = sdobi.find('.//B140/date')
        if pub_date_elem is not None and pub_date_elem.text:
            bibliographic_data["publication_date_full"] = pub_date_elem.text
        
        # Extract correction information (B150)
        if sdobi.find('.//B150') is not None:
            correction_info = {}
            correction_code = sdobi.find('.//B151')
            if correction_code is not None and correction_code.text:
                correction_info["correction_code"] = correction_code.text
            
            # Extract correction details from B154 and B155
            for section in ['B154', 'B155']:
                details = []
                section_elems = sdobi.findall(f'.//{section}/*')
                for i in range(0, len(section_elems), 2):
                    if i+1 < len(section_elems):
                        lang_elem = section_elems[i]
                        text_elem = section_elems[i+1]
                        if lang_elem.text and text_elem.text:
                            details.append({
                                "language": lang_elem.text,
                                "text": text_elem.text
                            })
                
                if details:
                    correction_info[f"correction_details_{section.lower()}"] = details
            
            if correction_info:
                bibliographic_data["correction_information"] = correction_info
        
        # Extract application information
        app_num = sdobi.find('.//B210')
        if app_num is not None and app_num.text:
            bibliographic_data["application_number"] = app_num.text
        
        app_date = sdobi.find('.//B220/date')
        if app_date is not None and app_date.text:
            bibliographic_data["application_date"] = app_date.text
        
        # Extract priority information (B300)
        priorities = []
        for i, priority_num in enumerate(sdobi.findall('.//B310')):
            if priority_num.text:
                priority = {"number": priority_num.text}
                
                # Find corresponding date and country
                if i < len(sdobi.findall('.//B320/date')):
                    date_elem = sdobi.findall('.//B320/date')[i]
                    if date_elem is not None and date_elem.text:
                        priority["date"] = date_elem.text
                
                if i < len(sdobi.findall('.//B330/ctry')):
                    country_elem = sdobi.findall('.//B330/ctry')[i]
                    if country_elem is not None and country_elem.text:
                        priority["country"] = country_elem.text
                
                priorities.append(priority)
        
        if priorities:
            bibliographic_data["priorities"] = priorities
        
        # Extract bulletin information
        for field in ['B405', 'B430', 'B450']:
            bulletin_elem = sdobi.find(f'.//{field}')
            if bulletin_elem is not None:
                date_elem = bulletin_elem.find('./date')
                bnum_elem = bulletin_elem.find('./bnum')
                
                info = {}
                if date_elem is not None and date_elem.text:
                    info["date"] = date_elem.text
                if bnum_elem is not None and bnum_elem.text:
                    info["bulletin_number"] = bnum_elem.text
                
                if info:
                    field_name = {
                        'B405': 'corrigendum_bulletin',
                        'B430': 'publication_bulletin',
                        'B450': 'grant_bulletin'
                    }.get(field)
                    bibliographic_data[field_name] = info
        
        # Extract IPC (International Patent Classification) information
        ipc_classes = []
        for ipc_elem in sdobi.findall('.//classification-ipcr'):
            text_elem = ipc_elem.find('./text')
            if text_elem is not None and text_elem.text:
                ipc_classes.append(text_elem.text.strip())
        
        if ipc_classes:
            bibliographic_data["ipc_classes"] = ipc_classes
        
        # Extract CPC (Cooperative Patent Classification) information
        cpc_classes = []
        for cpc_elem in sdobi.findall('.//classification-cpc'):
            text_elem = cpc_elem.find('./text')
            if text_elem is not None and text_elem.text:
                cpc_classes.append(text_elem.text.strip())
        
        if cpc_classes:
            bibliographic_data["cpc_classes"] = cpc_classes
        
        # Extract title information
        title_info = {}
        title_section = sdobi.find('.//B540')
        if title_section is not None:
            for i, lang_elem in enumerate(title_section.findall('.//B541')):
                if lang_elem.text and i < len(title_section.findall('.//B542')):
                    title_elem = title_section.findall('.//B542')[i]
                    if title_elem is not None and title_elem.text:
                        title_info[lang_elem.text] = title_elem.text
        
        if title_info:
            bibliographic_data["title"] = title_info
        
        # Extract inventors information
        inventors = []
        for inv_elem in sdobi.findall('.//B721'):
            inventor = {}
            
            name_elem = inv_elem.find('./snm')
            if name_elem is not None and name_elem.text:
                inventor["name"] = name_elem.text
            
            # Extract address information
            addr_elem = inv_elem.find('./adr')
            if addr_elem is not None:
                address = {}
                
                street_elem = addr_elem.find('./str')
                if street_elem is not None and street_elem.text:
                    address["street"] = street_elem.text
                
                city_elem = addr_elem.find('./city')
                if city_elem is not None and city_elem.text:
                    address["city"] = city_elem.text
                
                country_elem = addr_elem.find('./ctry')
                if country_elem is not None and country_elem.text:
                    address["country"] = country_elem.text
                
                if address:
                    inventor["address"] = address
            
            if inventor:
                inventors.append(inventor)
        
        if inventors:
            bibliographic_data["inventors"] = inventors
        
        # Extract applicant information
        applicants = []
        for app_elem in sdobi.findall('.//B731'):
            applicant = {}
            
            name_elem = app_elem.find('./snm')
            if name_elem is not None and name_elem.text:
                applicant["name"] = name_elem.text
            
            id_elem = app_elem.find('./iid')
            if id_elem is not None and id_elem.text:
                applicant["id"] = id_elem.text
            
            ref_elem = app_elem.find('./irf')
            if ref_elem is not None and ref_elem.text:
                applicant["reference"] = ref_elem.text
            
            # Extract address information
            addr_elem = app_elem.find('./adr')
            if addr_elem is not None:
                address = {}
                
                street_elem = addr_elem.find('./str')
                if street_elem is not None and street_elem.text:
                    address["street"] = street_elem.text
                
                city_elem = addr_elem.find('./city')
                if city_elem is not None and city_elem.text:
                    address["city"] = city_elem.text
                
                country_elem = addr_elem.find('./ctry')
                if country_elem is not None and country_elem.text:
                    address["country"] = country_elem.text
                
                if address:
                    applicant["address"] = address
            
            if applicant:
                applicants.append(applicant)
        
        if applicants:
            bibliographic_data["applicants"] = applicants
        
        # Extract representative/agent information
        representatives = []
        for rep_elem in sdobi.findall('.//B741'):
            representative = {}
            
            name_elem = rep_elem.find('./snm')
            if name_elem is not None and name_elem.text:
                representative["name"] = name_elem.text
            
            id_elem = rep_elem.find('./iid')
            if id_elem is not None and id_elem.text:
                representative["id"] = id_elem.text
            
            # Extract address information
            addr_elem = rep_elem.find('./adr')
            if addr_elem is not None:
                address = {}
                
                street_elem = addr_elem.find('./str')
                if street_elem is not None and street_elem.text:
                    address["street"] = street_elem.text
                
                city_elem = addr_elem.find('./city')
                if city_elem is not None and city_elem.text:
                    address["city"] = city_elem.text
                
                country_elem = addr_elem.find('./ctry')
                if country_elem is not None and country_elem.text:
                    address["country"] = country_elem.text
                
                if address:
                    representative["address"] = address
            
            if representative:
                representatives.append(representative)
        
        if representatives:
            bibliographic_data["representatives"] = representatives
        
        # Extract designated states information
        designated_states = []
        states_elem = sdobi.find('.//B840')
        if states_elem is not None:
            for state_elem in states_elem.findall('./ctry'):
                if state_elem.text:
                    designated_states.append(state_elem.text)
        
        if designated_states:
            bibliographic_data["designated_states"] = designated_states
    
    # Process abstract
    abstract = root.find('.//abstract')
    if abstract:
        abstract_text = ""
        for p in abstract.findall('.//p'):
            p_text = p.text or ""
            # Remove XML comments
            p_text = re.sub(r'<!--.*?-->', '', p_text)
            if p_text.strip():
                abstract_text += p_text.strip() + " "
        
        if abstract_text.strip():
            bibliographic_data["abstract"] = abstract_text.strip()
    
    return bibliographic_data

def extract_main_sections(root):
    """
    Extract main sections from patent XML, excluding compound headers and examples.
    
    Args:
        root: Root element of the XML document
        
    Returns:
        list: Main sections with their headers and paragraphs
    """
    main_sections = []
    
    # Find description section
    description = root.find('.//description')
    if description:
        # First, collect all elements in order
        ordered_elements = []
        for elem in description:
            if elem.tag in ['heading', 'p']:
                ordered_elements.append(elem)
        
        # Process elements and associate paragraphs with headings
        current_main_section = None
        
        for elem in ordered_elements:
            if elem.tag == 'heading':
                heading_id = elem.get('id')
                heading_text = extract_heading_text(elem)
                
                if heading_text and is_main_heading(heading_text):
                    current_main_section = {
                        "heading_id": heading_id,
                        "heading_text": heading_text,
                        "paragraphs": []
                    }
                    main_sections.append(current_main_section)
            
            elif elem.tag == 'p' and current_main_section:
                p_id = elem.get('id')
                p_num = elem.get('num')
                p_text = elem.text or ""
                
                # Remove XML comments
                p_text = re.sub(r'<!--.*?-->', '', p_text)
                
                if p_text.strip():
                    current_main_section["paragraphs"].append({
                        "p_id": p_id,
                        "p_number": p_num,
                        "text": p_text.strip()
                    })
    
    return main_sections

def extract_claims(root):
    """
    Extract claims from patent XML.
    
    Args:
        root: Root element of the XML document
        
    Returns:
        list: Claims with their numbers and text
    """
    claims = []
    
    # Find claims section
    claims_section = root.find('.//claims')
    if claims_section:
        for claim_elem in claims_section.findall('.//claim'):
            claim_num = claim_elem.get('num')
            claim_text = ""
            
            # Extract claim text from various possible structures
            for text_elem in claim_elem.iter():
                if text_elem.tag in ['claim-text', 'p'] and text_elem.text:
                    text = text_elem.text.strip()
                    if text:
                        claim_text += text + " "
            
            claim_text = claim_text.strip()
            if claim_text:
                claims.append({
                    "claim_number": claim_num,
                    "text": claim_text
                })
    
    return claims

def process_patent_xml(xml_file_path, output_file=None):
    """
    Process patent XML and extract structured data including bibliographic data,
    main sections, and claims.
    
    Args:
        xml_file_path: Path to the XML file
        output_file: Path where to save the JSON output (optional)
    
    Returns:
        dict: Structured patent data
    """
    try:
        # Parse XML file
        tree = ET.parse(xml_file_path)
        root = tree.getroot()
        
        # Initialize result structure
        patent_data = {
            "bibliographic_data": {},
            "main_sections": [],
            "claims": []
        }
        
        # Extract data using the specialized functions
        patent_data["bibliographic_data"] = extract_bibliographic_data(root)
        patent_data["main_sections"] = extract_main_sections(root)
        patent_data["claims"] = extract_claims(root)
        
        # Save to JSON file if specified
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(patent_data, f, indent=2, ensure_ascii=False)
            print(f"Patent data saved to {output_file}")
        
        return patent_data
    
    except Exception as e:
        print(f"Error processing patent XML: {e}")
        import traceback
        traceback.print_exc()
        return {
            "bibliographic_data": {},
            "main_sections": [],
            "claims": []
        }

def process_directory(directory_path, output_dir=None):
    """
    Process all XML files in a directory.
    
    Args:
        directory_path: Directory containing XML files
        output_dir: Directory where to save the JSON outputs (optional)
    """
    if output_dir is None:
        output_dir = directory_path
    
    os.makedirs(output_dir, exist_ok=True)
    
    xml_files = [f for f in os.listdir(directory_path) if f.endswith('.xml')]
    print(f"Found {len(xml_files)} XML files")
    
    for xml_file in xml_files:
        print(f"Processing {xml_file}...")
        xml_path = os.path.join(directory_path, xml_file)
        output_file = os.path.join(output_dir, f"{os.path.splitext(xml_file)[0]}_data.json")
        process_patent_xml(xml_path, output_file)

# if __name__ == "__main__":
# Process a single file
xml_file_path = "EP18823397W1B9.xml"  # Change to your file path
output_file = "EP18823397W1B9_data_2.json"
process_patent_xml(xml_file_path, output_file)
    
    # Uncomment to process all XML files in a directory
    # process_directory("path/to/your/xml/files", "path/to/output/directory")

Patent data saved to EP18823397W1B9_data_2.json


{'bibliographic_data': {'doc_id': 'EP18823397B9W1',
  'file': 'EP18823397W1B9.xml',
  'language': 'en',
  'country': 'EP',
  'doc_number': '3647315',
  'kind_code': 'B9',
  'correction_code': 'W1',
  'publication_date': '20250604',
  'status': 'c',
  'dtd_version': 'ep-patent-document-v1-7',
  'document_type': 'CORRECTED EUROPEAN PATENT SPECIFICATION',
  'kind': 'B9',
  'publication_date_full': '20250604',
  'correction_information': {'correction_code': 'W1',
   'correction_details_b155': [{'language': 'de', 'text': 'Ansprüche DE'},
    {'language': 'de', 'text': 'Ansprüche EN'},
    {'language': 'de', 'text': 'Ansprüche FR'},
    {'language': 'en', 'text': 'Claims DE'},
    {'language': 'en', 'text': 'Claims EN'},
    {'language': 'en', 'text': 'Claims FR'},
    {'language': 'fr', 'text': 'Revendications DE'},
    {'language': 'fr', 'text': 'Revendications EN'},
    {'language': 'fr', 'text': 'Revendications FR'}]},
  'application_number': '18823397.7',
  'application_date': '20180521

In [41]:
import xml.etree.ElementTree as ET
import re
import json
import os
from collections import defaultdict

def extract_heading_text(elem):
    """Extract text from heading element handling nested formatting tags"""
    # Try to convert the element to a string and use regex
    elem_str = ET.tostring(elem, encoding='unicode')
    
    # Try various patterns to extract the text
    for pattern in [r'<[bu]><[bu]>(.*?)</[bu]></[bu]>', r'<[bu]>(.*?)</[bu]>']:
        match = re.search(pattern, elem_str)
        if match:
            return match.group(1).strip()
    
    # If regex didn't work, try direct extraction
    if len(elem) > 0:
        if elem[0].tag in ['u', 'b', 'i']:
            if len(elem[0]) > 0 and elem[0][0].tag in ['u', 'b', 'i']:
                return (elem[0][0].text or "").strip()
            else:
                return (elem[0].text or "").strip()
        else:
            return (elem[0].text or "").strip()
    else:
        return (elem.text or "").strip()

def is_main_heading(heading_text, debug=False):
    """
    Stricter determination if a heading is a main section heading (not a compound or example).
    
    Args:
        heading_text: The heading text to analyze
        debug: Whether to print debug info
        
    Returns:
        bool: True if it's a main heading, False otherwise
    """
    # Strip and clean the heading text
    if not heading_text:
        return False
        
    # Convert to lowercase for case-insensitive matching
    heading_lower = heading_text.lower()
    
    # Explicit whitelist of main heading patterns (more precise than before)
    main_heading_whitelist = [
        r'^technical field$',
        r'^technical background$',
        r'^field of the invention$',
        r'^background$', 
        r'^background of the invention$',
        r'^summary$',
        r'^summary of the invention$',
        r'^brief summary$',
        r'^introduction$',
        r'^brief description of the drawings$',
        r'^detailed description$',
        r'^detailed description of the invention$',
        r'^abstract$',
        r'^description of embodiments$',
        r'^description of the embodiments$',
        r'^description of preferred embodiments$',
        r'^advantages$',
        r'^advantages of the invention$',
        r'^industrial applicability$',
        r'^brief description$',
        r'^objects of the invention$',
        r'^disclosure of the invention$',
        r'^action and effect of the invention$',
        r'^action and effect$'
    ]
    
    # Check exact matches for main headings first
    for pattern in main_heading_whitelist:
        if re.match(pattern, heading_lower):
            if debug:
                print(f"MAIN HEADING (whitelist): {heading_text}")
            return True
    
    # Extended patterns that require more context for verification
    extended_main_patterns = [
        r'^(figure|figures|drawings)',
        r'^embodiment',
        r'^description of',
        r'of the( present)? invention$',
    ]
    
    # Check extended patterns if no exact match was found
    for pattern in extended_main_patterns:
        if re.search(pattern, heading_lower):
            # Further verify it's not an example or compound
            if not re.search(r'example|compound|step|acid|\d+\s*[a-zA-Z]|\([0-9][a-zA-Z]+\)', heading_lower):
                if debug:
                    print(f"MAIN HEADING (extended): {heading_text}")
                return True
    
    # Extensive blacklist to catch all examples, compounds, and specific structures
    blacklist_patterns = [
        # Examples
        r'example\s*\d+',
        r'examples?\s*\d*[^$]',
        r'experimental example',
        r'comparative example',
        r'reference example',
        r'preparative example',
        
        # Compounds, chemicals, and formulas
        r'compound',
        r'chemical',
        r'formula',
        r'synthesis',
        r'preparation of',
        r'structure',
        r'moiety',
        r'\b[a-z]*acid',
        r'phosphoryl',
        r'benzamido',
        r'phenyl',
        r'methyl',
        r'ethyl',
        
        # Steps and procedures
        r'step [a-z0-9]',
        r'procedure',
        r'stage',
        
        # Statistical patterns
        r'\d+\s*\.?\s*\d*\s*[a-zA-Z]',  # Numbers with letters like "1.2a"
        r'^\([a-z0-9]+\)',              # Parenthetical labels like "(2S)"
        r'^\[[a-z0-9]+\]',              # Bracket labels
        
        # Very specific chemical names
        r'butyl',
        r'propyl',
        r'amino',
        r'hydroxy',
        r'glycol',
        r'ester',
        r'ether',
        r'oxide',
        r'polymer',
        r'peptide'
    ]
    
    # Check if heading matches any blacklist pattern
    for pattern in blacklist_patterns:
        if re.search(pattern, heading_lower):
            if debug:
                print(f"EXCLUDED (blacklist): {heading_text} - Pattern: {pattern}")
            return False
    
    # Length-based criteria
    if len(heading_text) > 25:
        if debug:
            print(f"EXCLUDED (too long): {heading_text}")
        return False
    
    # If nothing matched specifically, check if it has characteristics of a main heading
    if heading_text.istitle() and len(heading_text.split()) <= 4:
        if debug:
            print(f"MAIN HEADING (fallback): {heading_text}")
        return True
        
    # Default to exclusion for anything that doesn't match our positive criteria
    if debug:
        print(f"EXCLUDED (default): {heading_text}")
    return False

def extract_bibliographic_data(root):
    """Extract bibliographic data from patent XML."""
    bibliographic_data = {}
    
    # Extract root document attributes
    if root.tag == 'ep-patent-document':
        bibliographic_data.update({
            "doc_id": root.get('id'),
            "file": root.get('file'),
            "language": root.get('lang'),
            "country": root.get('country'),
            "doc_number": root.get('doc-number'),
            "kind_code": root.get('kind'),
            "correction_code": root.get('correction-code'),
            "publication_date": root.get('date-publ'),
            "status": root.get('status'),
            "dtd_version": root.get('dtd-version')
        })
    
    # Process SDOBI (Standard Document BIbliography) section
    sdobi = root.find('.//SDOBI')
    if sdobi:
        # Document type/title (B121)
        doc_type_elem = sdobi.find('.//B121')
        if doc_type_elem is not None and doc_type_elem.text:
            bibliographic_data["document_type"] = doc_type_elem.text
        
        # Document kind code (B130)
        kind_elem = sdobi.find('.//B130')
        if kind_elem is not None and kind_elem.text:
            bibliographic_data["kind"] = kind_elem.text

        # Publication date (B140)
        pub_date_elem = sdobi.find('.//B140/date')
        if pub_date_elem is not None and pub_date_elem.text:
            bibliographic_data["publication_date_full"] = pub_date_elem.text
        
        # Correction information (B150)
        if sdobi.find('.//B150') is not None:
            correction_info = {}
            correction_code = sdobi.find('.//B151')
            if correction_code is not None and correction_code.text:
                correction_info["correction_code"] = correction_code.text
            
            # Extract correction details from B154 and B155
            for section in ['B154', 'B155']:
                details = []
                section_elems = sdobi.findall(f'.//{section}/*')
                for i in range(0, len(section_elems), 2):
                    if i+1 < len(section_elems):
                        lang_elem = section_elems[i]
                        text_elem = section_elems[i+1]
                        if lang_elem.text and text_elem.text:
                            details.append({
                                "language": lang_elem.text,
                                "text": text_elem.text
                            })
                
                if details:
                    correction_info[f"correction_details_{section.lower()}"] = details
            
            if correction_info:
                bibliographic_data["correction_information"] = correction_info
        
        # Application information
        app_num = sdobi.find('.//B210')
        if app_num is not None and app_num.text:
            bibliographic_data["application_number"] = app_num.text
        
        app_date = sdobi.find('.//B220/date')
        if app_date is not None and app_date.text:
            bibliographic_data["application_date"] = app_date.text
        
        # Priority information (B300)
        priorities = []
        for i, priority_num in enumerate(sdobi.findall('.//B310')):
            if priority_num.text:
                priority = {"number": priority_num.text}
                
                # Find corresponding date and country
                if i < len(sdobi.findall('.//B320/date')):
                    date_elem = sdobi.findall('.//B320/date')[i]
                    if date_elem is not None and date_elem.text:
                        priority["date"] = date_elem.text
                
                if i < len(sdobi.findall('.//B330/ctry')):
                    country_elem = sdobi.findall('.//B330/ctry')[i]
                    if country_elem is not None and country_elem.text:
                        priority["country"] = country_elem.text
                
                priorities.append(priority)
        
        if priorities:
            bibliographic_data["priorities"] = priorities
        
        # Bulletin information
        for field in ['B405', 'B430', 'B450']:
            bulletin_elem = sdobi.find(f'.//{field}')
            if bulletin_elem is not None:
                date_elem = bulletin_elem.find('./date')
                bnum_elem = bulletin_elem.find('./bnum')
                
                info = {}
                if date_elem is not None and date_elem.text:
                    info["date"] = date_elem.text
                if bnum_elem is not None and bnum_elem.text:
                    info["bulletin_number"] = bnum_elem.text
                
                if info:
                    field_name = {
                        'B405': 'corrigendum_bulletin',
                        'B430': 'publication_bulletin',
                        'B450': 'grant_bulletin'
                    }.get(field)
                    bibliographic_data[field_name] = info
        
        # IPC (International Patent Classification) information
        ipc_classes = []
        for ipc_elem in sdobi.findall('.//classification-ipcr'):
            text_elem = ipc_elem.find('./text')
            if text_elem is not None and text_elem.text:
                ipc_classes.append(text_elem.text.strip())
        
        if ipc_classes:
            bibliographic_data["ipc_classes"] = ipc_classes
        
        # CPC (Cooperative Patent Classification) information
        cpc_classes = []
        for cpc_elem in sdobi.findall('.//classification-cpc'):
            text_elem = cpc_elem.find('./text')
            if text_elem is not None and text_elem.text:
                cpc_classes.append(text_elem.text.strip())
        
        if cpc_classes:
            bibliographic_data["cpc_classes"] = cpc_classes
        
        # Title information
        title_info = {}
        title_section = sdobi.find('.//B540')
        if title_section is not None:
            for i, lang_elem in enumerate(title_section.findall('.//B541')):
                if lang_elem.text and i < len(title_section.findall('.//B542')):
                    title_elem = title_section.findall('.//B542')[i]
                    if title_elem is not None and title_elem.text:
                        title_info[lang_elem.text] = title_elem.text
        
        if title_info:
            bibliographic_data["title"] = title_info
        
        # Extract inventors, applicants, representatives, etc. (code unchanged)
        # ... [keep the rest of your existing bibliographic extraction code] ...
        
        # Extract inventors information
        inventors = []
        for inv_elem in sdobi.findall('.//B721'):
            inventor = {}
            
            name_elem = inv_elem.find('./snm')
            if name_elem is not None and name_elem.text:
                inventor["name"] = name_elem.text
            
            # Extract address information
            addr_elem = inv_elem.find('./adr')
            if addr_elem is not None:
                address = {}
                
                street_elem = addr_elem.find('./str')
                if street_elem is not None and street_elem.text:
                    address["street"] = street_elem.text
                
                city_elem = addr_elem.find('./city')
                if city_elem is not None and city_elem.text:
                    address["city"] = city_elem.text
                
                country_elem = addr_elem.find('./ctry')
                if country_elem is not None and country_elem.text:
                    address["country"] = country_elem.text
                
                if address:
                    inventor["address"] = address
            
            if inventor:
                inventors.append(inventor)
        
        if inventors:
            bibliographic_data["inventors"] = inventors
        
        # Extract applicant information
        applicants = []
        for app_elem in sdobi.findall('.//B731'):
            applicant = {}
            
            name_elem = app_elem.find('./snm')
            if name_elem is not None and name_elem.text:
                applicant["name"] = name_elem.text
            
            id_elem = app_elem.find('./iid')
            if id_elem is not None and id_elem.text:
                applicant["id"] = id_elem.text
            
            ref_elem = app_elem.find('./irf')
            if ref_elem is not None and ref_elem.text:
                applicant["reference"] = ref_elem.text
            
            # Extract address information
            addr_elem = app_elem.find('./adr')
            if addr_elem is not None:
                address = {}
                
                street_elem = addr_elem.find('./str')
                if street_elem is not None and street_elem.text:
                    address["street"] = street_elem.text
                
                city_elem = addr_elem.find('./city')
                if city_elem is not None and city_elem.text:
                    address["city"] = city_elem.text
                
                country_elem = addr_elem.find('./ctry')
                if country_elem is not None and country_elem.text:
                    address["country"] = country_elem.text
                
                if address:
                    applicant["address"] = address
            
            if applicant:
                applicants.append(applicant)
        
        if applicants:
            bibliographic_data["applicants"] = applicants
        
        # Extract representative/agent information
        representatives = []
        for rep_elem in sdobi.findall('.//B741'):
            representative = {}
            
            name_elem = rep_elem.find('./snm')
            if name_elem is not None and name_elem.text:
                representative["name"] = name_elem.text
            
            id_elem = rep_elem.find('./iid')
            if id_elem is not None and id_elem.text:
                representative["id"] = id_elem.text
            
            # Extract address information
            addr_elem = rep_elem.find('./adr')
            if addr_elem is not None:
                address = {}
                
                street_elem = addr_elem.find('./str')
                if street_elem is not None and street_elem.text:
                    address["street"] = street_elem.text
                
                city_elem = addr_elem.find('./city')
                if city_elem is not None and city_elem.text:
                    address["city"] = city_elem.text
                
                country_elem = addr_elem.find('./ctry')
                if country_elem is not None and country_elem.text:
                    address["country"] = country_elem.text
                
                if address:
                    representative["address"] = address
            
            if representative:
                representatives.append(representative)
        
        if representatives:
            bibliographic_data["representatives"] = representatives
        
        # Extract designated states information
        designated_states = []
        states_elem = sdobi.find('.//B840')
        if states_elem is not None:
            for state_elem in states_elem.findall('./ctry'):
                if state_elem.text:
                    designated_states.append(state_elem.text)
        
        if designated_states:
            bibliographic_data["designated_states"] = designated_states
    
    # Process abstract
    abstract = root.find('.//abstract')
    if abstract:
        abstract_text = ""
        for p in abstract.findall('.//p'):
            p_text = p.text or ""
            # Remove XML comments
            p_text = re.sub(r'<!--.*?-->', '', p_text)
            if p_text.strip():
                abstract_text += p_text.strip() + " "
        
        if abstract_text.strip():
            bibliographic_data["abstract"] = abstract_text.strip()
    
    return bibliographic_data

def extract_main_sections(root, debug=False):
    """
    Extract main sections from patent XML, with stricter filtering to exclude 
    ALL compounds and examples.
    """
    main_sections = []
    
    # Find description section
    description = root.find('.//description')
    if description:
        # First, collect all elements in order
        ordered_elements = []
        for elem in description:
            if elem.tag in ['heading', 'p']:
                ordered_elements.append(elem)
        
        # Process elements and associate paragraphs with headings
        current_main_section = None
        
        for elem in ordered_elements:
            if elem.tag == 'heading':
                heading_id = elem.get('id')
                heading_text = extract_heading_text(elem)
                
                # Use the enhanced is_main_heading function with debug option
                if heading_text and is_main_heading(heading_text, debug=debug):
                    current_main_section = {
                        "heading_id": heading_id,
                        "heading_text": heading_text,
                        "paragraphs": []
                    }
                    main_sections.append(current_main_section)
            
            elif elem.tag == 'p' and current_main_section:
                p_id = elem.get('id')
                p_num = elem.get('num')
                p_text = elem.text or ""
                
                # Remove XML comments
                p_text = re.sub(r'<!--.*?-->', '', p_text)
                
                if p_text.strip():
                    current_main_section["paragraphs"].append({
                        "p_id": p_id,
                        "p_number": p_num,
                        "text": p_text.strip()
                    })
    
    return main_sections

def extract_claims(root):
    """Extract claims from patent XML."""
    claims = []
    
    # Find claims section
    claims_section = root.find('.//claims')
    if claims_section:
        for claim_elem in claims_section.findall('.//claim'):
            claim_num = claim_elem.get('num')
            claim_text = ""
            
            # Extract claim text from various possible structures
            for text_elem in claim_elem.iter():
                if text_elem.tag in ['claim-text', 'p'] and text_elem.text:
                    text = text_elem.text.strip()
                    if text:
                        claim_text += text + " "
            
            claim_text = claim_text.strip()
            if claim_text:
                claims.append({
                    "claim_number": claim_num,
                    "text": claim_text
                })
    
    return claims

def process_patent_xml(xml_file_path, output_file=None, debug=False):
    """
    Process patent XML and extract structured data with enhanced filtering.
    """
    try:
        # Parse XML file
        tree = ET.parse(xml_file_path)
        root = tree.getroot()
        
        # Initialize result structure
        patent_data = {
            "bibliographic_data": {},
            "main_sections": [],
            "claims": []
        }
        
        # Extract data using the specialized functions
        patent_data["bibliographic_data"] = extract_bibliographic_data(root)
        patent_data["main_sections"] = extract_main_sections(root, debug=debug)
        patent_data["claims"] = extract_claims(root)
        
        # Save to JSON file if specified
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(patent_data, f, indent=2, ensure_ascii=False)
            print(f"Patent data saved to {output_file}")
        
        return patent_data
    
    except Exception as e:
        print(f"Error processing patent XML: {e}")
        import traceback
        traceback.print_exc()
        return {
            "bibliographic_data": {},
            "main_sections": [],
            "claims": []
        }

def process_directory(directory_path, output_dir=None, debug=False):
    """Process all XML files in a directory with enhanced filtering."""
    if output_dir is None:
        output_dir = directory_path
    
    os.makedirs(output_dir, exist_ok=True)
    
    xml_files = [f for f in os.listdir(directory_path) if f.endswith('.xml')]
    print(f"Found {len(xml_files)} XML files")
    
    for xml_file in xml_files:
        print(f"Processing {xml_file}...")
        xml_path = os.path.join(directory_path, xml_file)
        output_file = os.path.join(output_dir, f"{os.path.splitext(xml_file)[0]}_data.json")
        process_patent_xml(xml_path, output_file, debug=debug)

if __name__ == "__main__":
    # Process a single file with debug output
    xml_file_path = "EP22952569NWA1.xml"  # Change to your file path
    output_file = "EP16731796W1B8_data_6.json"
    process_patent_xml(xml_file_path, output_file, debug=True)
    
    # Uncomment to process all XML files in a directory
    # process_directory("path/to/your/xml/files", "path/to/output/directory", debug=True)

MAIN HEADING (whitelist): TECHNICAL FIELD
MAIN HEADING (whitelist): BACKGROUND
MAIN HEADING (whitelist): SUMMARY
MAIN HEADING (whitelist): BRIEF DESCRIPTION OF THE DRAWINGS
MAIN HEADING (whitelist): DETAILED DESCRIPTION
Patent data saved to EP16731796W1B8_data_6.json
