In [1]:
import xml.etree.ElementTree as ET
import json
import os
from collections import defaultdict
import re  # This line is missing!

In [2]:
# EPO Field Definitions - Clean Structure
document_root_attributes = {
    'id': 'Document ID',
    'file': 'File reference',
    'lang': 'Document language',
    'country': 'Country code',
    'doc-number': 'Document number',
    'kind': 'Kind code',
    'correction-code': 'Correction code',
    'date-publ': 'Publication date',
    'status': 'Document status',
    'dtd-version': 'DTD version'
}

sdobi_document_info = {
    'B121': 'Document type/title',
    'B130': 'Document kind code',
    'B140': 'Publication date',
    'B150': 'Correction information',
    'B151': 'Correction code',
    'B154': 'Correction details section 1',
    'B155': 'Correction details section 2'
}

sdobi_application_info = {
    'B210': 'Application number',
    'B220': 'Application date'
}

sdobi_priority_info = {
    'B310': 'Priority number',
    'B320': 'Priority date', 
    'B330': 'Priority country'
}

sdobi_bulletin_info = {
    'B405': 'Corrigendum bulletin',
    'B430': 'Publication bulletin',
    'B450': 'Grant bulletin'
}

sdobi_title_info = {
    'B540': 'Title section',
    'B541': 'Title language',
    'B542': 'Title text'
}

sdobi_inventor_info = {
    'B721': 'Inventor section',
    'snm': 'Name',
    'adr': 'Address section',
    'str': 'Street address',
    'city': 'City',
    'ctry': 'Country'
}

sdobi_applicant_info = {
    'B731': 'Applicant section',
    'snm': 'Name',
    'iid': 'ID',
    'irf': 'Reference',
    'adr': 'Address section',
    'str': 'Street address',
    'city': 'City',
    'ctry': 'Country'
}

sdobi_representative_info = {
    'B741': 'Representative section',
    'snm': 'Name',
    'iid': 'ID',
    'adr': 'Address section',
    'str': 'Street address',
    'city': 'City',
    'ctry': 'Country'
}

classification_info = {
    'classification-ipcr': 'International Patent Classification',
    'classification-cpc': 'Cooperative Patent Classification',
    'text': 'Classification text'
}

designated_states_info = {
    'B840': 'Designated states section',
    'ctry': 'Country code'
}


In [4]:

# Main extraction function using the clean structure
def extract_bibliographic_data(root):
    """Extract bibliographic data from patent XML using clean field definitions."""
    bibliographic_data = {}
    
    # Extract root document attributes using field definitions
    if root.tag == 'ep-patent-document':
        for field_code, field_description in document_root_attributes.items():
            value = root.get(field_code)
            if value is not None:
                # Convert field code to clean field name
                clean_field_name = field_code.replace('-', '_')
                bibliographic_data[clean_field_name] = value
    
    # Process SDOBI section using field definitions
    sdobi = root.find('.//SDOBI')
    if sdobi:
        
        # Extract document information using field definitions
        for field_code, field_description in sdobi_document_info.items():
            if field_code == 'B121':  # Document type
                elem = sdobi.find(f'.//{field_code}')
                if elem is not None and elem.text:
                    bibliographic_data["document_type"] = elem.text
                    
            elif field_code == 'B130':  # Document kind code
                elem = sdobi.find(f'.//{field_code}')
                if elem is not None and elem.text:
                    bibliographic_data["kind"] = elem.text
                    
            elif field_code == 'B140':  # Publication date
                elem = sdobi.find(f'.//{field_code}/date')
                if elem is not None and elem.text:
                    bibliographic_data["publication_date_full"] = elem.text
        
        # Extract application information using field definitions
        for field_code, field_description in sdobi_application_info.items():
            if field_code == 'B210':  # Application number
                elem = sdobi.find(f'.//{field_code}')
                if elem is not None and elem.text:
                    bibliographic_data["application_number"] = elem.text
                    
            elif field_code == 'B220':  # Application date
                elem = sdobi.find(f'.//{field_code}/date')
                if elem is not None and elem.text:
                    bibliographic_data["application_date"] = elem.text
        
        # Extract priority information using field definitions
        priorities = []
        priority_numbers = sdobi.findall('.//B310')  # Using B310 from field definitions
        priority_dates = sdobi.findall('.//B320/date')  # Using B320 from field definitions
        priority_countries = sdobi.findall('.//B330/ctry')  # Using B330 from field definitions
        
        for i, priority_num_elem in enumerate(priority_numbers):
            if priority_num_elem.text:
                priority = {"number": priority_num_elem.text}
                
                # Add corresponding date if available
                if i < len(priority_dates) and priority_dates[i] is not None and priority_dates[i].text:
                    priority["date"] = priority_dates[i].text
                
                # Add corresponding country if available
                if i < len(priority_countries) and priority_countries[i] is not None and priority_countries[i].text:
                    priority["country"] = priority_countries[i].text
                
                priorities.append(priority)
        
        if priorities:
            bibliographic_data["priorities"] = priorities
        
        # Extract bulletin information using field definitions
        bulletin_mapping = {
            'B405': 'corrigendum_bulletin',
            'B430': 'publication_bulletin', 
            'B450': 'grant_bulletin'
        }
        
        for field_code, output_field in bulletin_mapping.items():
            if field_code in sdobi_bulletin_info:  # Verify field exists in our definitions
                bulletin_elem = sdobi.find(f'.//{field_code}')
                if bulletin_elem is not None:
                    info = {}
                    date_elem = bulletin_elem.find('./date')
                    if date_elem is not None and date_elem.text:
                        info["date"] = date_elem.text
                    
                    bnum_elem = bulletin_elem.find('./bnum')
                    if bnum_elem is not None and bnum_elem.text:
                        info["bulletin_number"] = bnum_elem.text
                    
                    if info:
                        bibliographic_data[output_field] = info
        
        # Extract classification information using field definitions
        for class_type, class_description in classification_info.items():
            if class_type == 'classification-ipcr':
                classes = []
                for class_elem in sdobi.findall(f'.//{class_type}'):
                    text_elem = class_elem.find('./text')
                    if text_elem is not None and text_elem.text:
                        classes.append(text_elem.text.strip())
                if classes:
                    bibliographic_data["ipc_classes"] = classes
                    
            elif class_type == 'classification-cpc':
                classes = []
                for class_elem in sdobi.findall(f'.//{class_type}'):
                    text_elem = class_elem.find('./text')
                    if text_elem is not None and text_elem.text:
                        classes.append(text_elem.text.strip())
                if classes:
                    bibliographic_data["cpc_classes"] = classes
        
        # Extract title information using field definitions
        title_section = sdobi.find('.//B540')  # Using B540 from field definitions
        if title_section is not None:
            title_info = {}
            lang_elems = title_section.findall('.//B541')  # Using B541 from field definitions
            text_elems = title_section.findall('.//B542')  # Using B542 from field definitions
            
            for i, lang_elem in enumerate(lang_elems):
                if lang_elem.text and i < len(text_elems):
                    text_elem = text_elems[i]
                    if text_elem is not None and text_elem.text:
                        title_info[lang_elem.text] = text_elem.text
            
            if title_info:
                bibliographic_data["title"] = title_info
        
        # Extract inventors using field definitions
        inventors = []
        for inv_elem in sdobi.findall('.//B721'):  # Using B721 from field definitions
            inventor = {}
            
            # Extract name using field definitions
            name_elem = inv_elem.find('./snm')  # Using 'snm' from field definitions
            if name_elem is not None and name_elem.text:
                inventor["name"] = name_elem.text
            
            # Extract address using field definitions
            addr_elem = inv_elem.find('./adr')  # Using 'adr' from field definitions
            if addr_elem is not None:
                address = {}
                
                # Use field definitions for address components
                for addr_field, addr_description in sdobi_inventor_info.items():
                    if addr_field == 'str':  # Street
                        street_elem = addr_elem.find('./str')
                        if street_elem is not None and street_elem.text:
                            address["street"] = street_elem.text
                    elif addr_field == 'city':  # City
                        city_elem = addr_elem.find('./city')
                        if city_elem is not None and city_elem.text:
                            address["city"] = city_elem.text
                    elif addr_field == 'ctry':  # Country
                        country_elem = addr_elem.find('./ctry')
                        if country_elem is not None and country_elem.text:
                            address["country"] = country_elem.text
                
                if address:
                    inventor["address"] = address
            
            if inventor:
                inventors.append(inventor)
        
        if inventors:
            bibliographic_data["inventors"] = inventors
        
        # Extract applicants using field definitions
        applicants = []
        for app_elem in sdobi.findall('.//B731'):  # Using B731 from field definitions
            applicant = {}
            
            # Use field definitions for applicant fields
            name_elem = app_elem.find('./snm')  # Using 'snm' from field definitions
            if name_elem is not None and name_elem.text:
                applicant["name"] = name_elem.text
            
            id_elem = app_elem.find('./iid')  # Using 'iid' from field definitions
            if id_elem is not None and id_elem.text:
                applicant["id"] = id_elem.text
            
            ref_elem = app_elem.find('./irf')  # Using 'irf' from field definitions
            if ref_elem is not None and ref_elem.text:
                applicant["reference"] = ref_elem.text
            
            # Extract address using field definitions
            addr_elem = app_elem.find('./adr')  # Using 'adr' from field definitions
            if addr_elem is not None:
                address = {}
                
                for addr_field, addr_description in sdobi_applicant_info.items():
                    if addr_field == 'str':
                        street_elem = addr_elem.find('./str')
                        if street_elem is not None and street_elem.text:
                            address["street"] = street_elem.text
                    elif addr_field == 'city':
                        city_elem = addr_elem.find('./city')
                        if city_elem is not None and city_elem.text:
                            address["city"] = city_elem.text
                    elif addr_field == 'ctry':
                        country_elem = addr_elem.find('./ctry')
                        if country_elem is not None and country_elem.text:
                            address["country"] = country_elem.text
                
                if address:
                    applicant["address"] = address
            
            if applicant:
                applicants.append(applicant)
        
        if applicants:
            bibliographic_data["applicants"] = applicants
        
        # Extract representatives using field definitions
        representatives = []
        for rep_elem in sdobi.findall('.//B741'):  # Using B741 from field definitions
            representative = {}
            
            # Use field definitions for representative fields
            name_elem = rep_elem.find('./snm')  # Using 'snm' from field definitions
            if name_elem is not None and name_elem.text:
                representative["name"] = name_elem.text
            
            id_elem = rep_elem.find('./iid')  # Using 'iid' from field definitions
            if id_elem is not None and id_elem.text:
                representative["id"] = id_elem.text
            
            # Extract address using field definitions
            addr_elem = rep_elem.find('./adr')  # Using 'adr' from field definitions
            if addr_elem is not None:
                address = {}
                
                for addr_field, addr_description in sdobi_representative_info.items():
                    if addr_field == 'str':
                        street_elem = addr_elem.find('./str')
                        if street_elem is not None and street_elem.text:
                            address["street"] = street_elem.text
                    elif addr_field == 'city':
                        city_elem = addr_elem.find('./city')
                        if city_elem is not None and city_elem.text:
                            address["city"] = city_elem.text
                    elif addr_field == 'ctry':
                        country_elem = addr_elem.find('./ctry')
                        if country_elem is not None and country_elem.text:
                            address["country"] = country_elem.text
                
                if address:
                    representative["address"] = address
            
            if representative:
                representatives.append(representative)
        
        if representatives:
            bibliographic_data["representatives"] = representatives
        
        # Extract designated states using field definitions
        states_elem = sdobi.find('.//B840')  # Using B840 from field definitions
        if states_elem is not None:
            designated_states = []
            for state_elem in states_elem.findall('./ctry'):  # Using 'ctry' from field definitions
                if state_elem.text:
                    designated_states.append(state_elem.text)
            
            if designated_states:
                bibliographic_data["designated_states"] = designated_states
    
    # Process abstract (separate from SDOBI)
    abstract = root.find('.//abstract')
    if abstract:
        abstract_text = ""
        for p in abstract.findall('.//p'):
            p_text = p.text or ""
            # Remove XML comments
            p_text = re.sub(r'<!--.*?-->', '', p_text)
            if p_text.strip():
                abstract_text += p_text.strip() + " "
        
        if abstract_text.strip():
            bibliographic_data["abstract"] = abstract_text.strip()
    
    return bibliographic_data

In [11]:
xml_file_path = "EP22914805W1A9.xml"
output_file = "EP22914805W1A9_mapping.json"


TypeError: extract_bibliographic_data() missing 1 required positional argument: 'root'

In [None]:
# Quick test script
import xml.etree.ElementTree as ET

# Parse XML and extract biblio data directly
xml_file = "EP22914805W1A9.xml"
tree = ET.parse(xml_file)
root = tree.getroot()

In [18]:
# Get bibliographic data
biblio_data  = extract_bibliographic_data(root)

print("Bibliographic Data:")
print(biblio_data)

Bibliographic Data:
{'id': 'EP22914805A9W1', 'file': 'EP22914805W1A9.xml', 'lang': 'en', 'country': 'EP', 'doc_number': '4458984', 'kind': 'A9', 'correction_code': 'W1', 'date_publ': '20250604', 'status': 'c', 'dtd_version': 'ep-patent-document-v1-7', 'document_type': 'CORRECTED EUROPEAN PATENT APPLICATION', 'publication_date_full': '20250604', 'application_number': '22914805.1', 'application_date': '20221227', 'priorities': [{'number': '202111614026', 'date': '20211227', 'country': 'CN'}, {'number': '202211271772', 'date': '20221018', 'country': 'CN'}, {'number': '202211539752', 'date': '20221202', 'country': 'CN'}], 'corrigendum_bulletin': {'date': '20250604', 'bulletin_number': '202523'}, 'publication_bulletin': {'date': '20241106', 'bulletin_number': '202445'}, 'ipc_classes': ['C12Q   1/686       20180101AFI20230707BHEP', 'C12N  15/11        20060101ALI20230707BHEP'], 'cpc_classes': ['C12Q   1/686       20130101 LI20241111BHEP', 'C12Q2563/159       20130101 LI20241111BHEP', 'C12Q25

In [5]:
# Define field structures for description sections
description_fields = {
    'heading_patterns': [
        r'<[bu]><[bu]>(.*?)</[bu]></[bu]>',  # Bold/underline nested
        r'<[bu]>(.*?)</[bu]>'                # Single bold/underline
    ],
    'main_heading_whitelist': [
        r'^technical field$',
        r'^technical background$',
        r'^field of the invention$',
        r'^background$', 
        r'^background of the invention$',
        r'^summary$',
        r'^summary of the invention$',
        r'^brief summary$',
        r'^introduction$',
        r'^brief description of the drawings$',
        r'^detailed description$',
        r'^detailed description of the invention$',
        r'^abstract$',
        r'^description of embodiments$',
        r'^description of the embodiments$',
        r'^description of preferred embodiments$',
        r'^advantages$',
        r'^advantages of the invention$',
        r'^industrial applicability$',
        r'^brief description$',
        r'^objects of the invention$',
        r'^disclosure of the invention$',
        r'^action and effect of the invention$',
        r'^action and effect$'
    ],
    'extended_main_patterns': [
        r'^(figure|figures|drawings)',
        r'^embodiment',
        r'^description of',
        r'of the( present)? invention$',
    ],
    'blacklist_patterns': [
        r'example\s*\d+',
        r'examples?\s*\d*[^$]',
        r'experimental example',
        r'comparative example',
        r'reference example',
        r'preparative example',
        
        # Compounds, chemicals, and formulas
        r'compound',
        r'chemical',
        r'formula',
        r'synthesis',
        r'preparation of',
        r'structure',
        r'moiety',
        r'\b[a-z]*acid',
        r'phosphoryl',
        r'benzamido',
        r'phenyl',
        r'methyl',
        r'ethyl',
        
        # Steps and procedures
        r'step [a-z0-9]',
        r'procedure',
        r'stage',
        
        # Statistical patterns
        r'\d+\s*\.?\s*\d*\s*[a-zA-Z]',  # Numbers with letters like "1.2a"
        r'^\([a-z0-9]+\)',              # Parenthetical labels like "(2S)"
        r'^\[[a-z0-9]+\]',              # Bracket labels
        
        # Very specific chemical names
        r'butyl',
        r'propyl',
        r'amino',
        r'hydroxy',
        r'glycol',
        r'ester',
        r'ether',
        r'oxide',
        r'polymer',
        r'peptide'
    ]
}

# Define field structures for claims
claims_fields = {
    'path': './/claims',
    'item_path': './/claim',
    'num_attribute': 'num',
    'text_elements': ['claim-text', 'p']
}

In [6]:
"""EPO patent field definitions and extraction helpers."""
import re

# Root document attributes mapping
document_root_attributes = {
    'id': {'output_field': 'doc_id', 'description': 'Document ID'},
    'file': {'output_field': 'file', 'description': 'File reference'},
    'lang': {'output_field': 'language', 'description': 'Document language'},
    'country': {'output_field': 'country', 'description': 'Country code'},
    'doc-number': {'output_field': 'doc_number', 'description': 'Document number'},
    'kind': {'output_field': 'kind_code', 'description': 'Kind code'},
    'correction-code': {'output_field': 'correction_code', 'description': 'Correction code'},
    'date-publ': {'output_field': 'publication_date', 'description': 'Publication date'},
    'status': {'output_field': 'status', 'description': 'Document status'},
    'dtd-version': {'output_field': 'dtd_version', 'description': 'DTD version'}
}

# Comprehensive SDOBI field structure with paths and output mappings
sdobi_fields = {
    # Document information
    'document_info': {
        'path': './/SDOBI',
        'fields': {
            'B121': {'xpath': './/B121', 'output_field': 'document_type'},
            'B130': {'xpath': './/B130', 'output_field': 'kind'},
            'B140': {'xpath': './/B140/date', 'output_field': 'publication_date_full'}
        }
    },
    
    # Application information
    'application_info': {
        'path': './/SDOBI',
        'fields': {
            'B210': {'xpath': './/B210', 'output_field': 'application_number'},
            'B220': {'xpath': './/B220/date', 'output_field': 'application_date'}
        }
    },
    
    # Priority information - special handling for multiple entries
    'priority_info': {
        'path': './/SDOBI',
        'type': 'collection',
        'output_field': 'priorities',
        'collection_item': {
            'B310': {'xpath': './/B310', 'field': 'number'},
            'B320': {'xpath': './/B320/date', 'field': 'date'},
            'B330': {'xpath': './/B330/ctry', 'field': 'country'}
        }
    },
    
    # Bulletin information
    'bulletin_info': {
        'path': './/SDOBI',
        'type': 'mapping',
        'fields': {
            'B405': {
                'xpath': './/B405',
                'output_field': 'corrigendum_bulletin',
                'subfields': {
                    'date': {'xpath': './date'},
                    'bulletin_number': {'xpath': './bnum'}
                }
            },
            'B430': {
                'xpath': './/B430',
                'output_field': 'publication_bulletin',
                'subfields': {
                    'date': {'xpath': './date'},
                    'bulletin_number': {'xpath': './bnum'}
                }
            },
            'B450': {
                'xpath': './/B450',
                'output_field': 'grant_bulletin',
                'subfields': {
                    'date': {'xpath': './date'},
                    'bulletin_number': {'xpath': './bnum'}
                }
            }
        }
    },
    
    # Classification information
    'classification_info': {
        'path': './/SDOBI',
        'fields': {
            'ipc_classes': {
                'xpath': './/classification-ipcr',
                'output_field': 'ipc_classes',
                'multiple': True,
                'text_path': './text'
            },
            'cpc_classes': {
                'xpath': './/classification-cpc',
                'output_field': 'cpc_classes',
                'multiple': True,
                'text_path': './text'
            }
        }
    },
    
    # Title information - needs special handling for multiple languages
    'title_info': {
        'path': './/SDOBI',
        'type': 'language_collection',
        'xpath': './/B540',
        'output_field': 'title',
        'language_tag': './/B541',
        'text_tag': './/B542'
    },
    
    # People and organizations
    'people': {
        'inventors': {
            'path': './/SDOBI',
            'type': 'entity_collection',
            'xpath': './/B721',
            'output_field': 'inventors',
            'fields': {
                'name': {'xpath': './snm'},
                'address': {
                    'type': 'address',
                    'path': './adr',
                    'fields': {
                        'street': {'xpath': './str'},
                        'city': {'xpath': './city'},
                        'country': {'xpath': './ctry'}
                    }
                }
            }
        },
        'applicants': {
            'path': './/SDOBI',
            'type': 'entity_collection',
            'xpath': './/B731',
            'output_field': 'applicants',
            'fields': {
                'name': {'xpath': './snm'},
                'id': {'xpath': './iid'},
                'reference': {'xpath': './irf'},
                'address': {
                    'type': 'address',
                    'path': './adr',
                    'fields': {
                        'street': {'xpath': './str'},
                        'city': {'xpath': './city'},
                        'country': {'xpath': './ctry'}
                    }
                }
            }
        },
        'representatives': {
            'path': './/SDOBI',
            'type': 'entity_collection',
            'xpath': './/B741',
            'output_field': 'representatives',
            'fields': {
                'name': {'xpath': './snm'},
                'id': {'xpath': './iid'},
                'address': {
                    'type': 'address',
                    'path': './adr',
                    'fields': {
                        'street': {'xpath': './str'},
                        'city': {'xpath': './city'},
                        'country': {'xpath': './ctry'}
                    }
                }
            }
        }
    },
    
    # Designated states
    'designated_states': {
        'path': './/SDOBI',
        'type': 'simple_collection',
        'xpath': './/B840/ctry',
        'output_field': 'designated_states'
    }
}

# Abstract section (outside SDOBI)
abstract_field = {
    'path': './/abstract',
    'type': 'text_collection',
    'xpath': './/p',
    'output_field': 'abstract'
}

def extract_bibliographic_data(root):
    """Extract bibliographic data using the structured field definitions."""
    bibliographic_data = {}
    
    # 1. Extract root document attributes
    extract_root_attributes(root, bibliographic_data)
    
    # 2. Extract SDOBI section data
    extract_sdobi_data(root, bibliographic_data)
    
    # 3. Extract abstract (outside SDOBI)
    extract_abstract(root, bibliographic_data)
    
    return bibliographic_data

def extract_root_attributes(root, output_dict):
    """Extract document root attributes."""
    if root.tag == 'ep-patent-document':
        for attr, config in document_root_attributes.items():
            value = root.get(attr)
            if value is not None:
                output_dict[config['output_field']] = value

def extract_sdobi_data(root, output_dict):
    """Extract all SDOBI section data."""
    sdobi = root.find('.//SDOBI')
    if not sdobi:
        return
    
    # Process simple field sections
    for section in ['document_info', 'application_info', 'classification_info']:
        if section in sdobi_fields:
            config = sdobi_fields[section]
            extract_simple_fields(sdobi, config.get('fields', {}), output_dict)
    
    # Process priority information (collection type)
    extract_priority_info(sdobi, output_dict)
    
    # Process bulletin information
    extract_bulletin_info(sdobi, output_dict)
    
    # Process title information (language collection)
    extract_title_info(sdobi, output_dict)
    
    # Process people information (inventors, applicants, representatives)
    extract_people_info(sdobi, output_dict)
    
    # Process designated states (simple collection)
    extract_designated_states(sdobi, output_dict)

def extract_simple_fields(parent, fields_config, output_dict):
    """Extract simple field values from parent element."""
    for field_code, config in fields_config.items():
        xpath = config.get('xpath')
        output_field = config.get('output_field')
        
        if xpath and output_field:
            # Handle multiple field case
            if config.get('multiple'):
                values = []
                elems = parent.findall(xpath)
                for elem in elems:
                    text_elem = elem.find(config.get('text_path', '.'))
                    if text_elem is not None and text_elem.text:
                        values.append(text_elem.text.strip())
                if values:
                    output_dict[output_field] = values
            else:
                # Handle single field case
                elem = parent.find(xpath)
                if elem is not None and elem.text:
                    output_dict[output_field] = elem.text.strip()

def extract_priority_info(parent, output_dict):
    """Extract priority information."""
    config = sdobi_fields['priority_info']
    if config['type'] != 'collection':
        return
    
    # Get the collection items data
    priorities = []
    priority_numbers = parent.findall(config['collection_item']['B310']['xpath'])
    
    for i, priority_num_elem in enumerate(priority_numbers):
        if priority_num_elem.text:
            priority = {"number": priority_num_elem.text}
            
            # Find corresponding date
            date_elems = parent.findall(config['collection_item']['B320']['xpath'])
            if i < len(date_elems) and date_elems[i].text:
                priority["date"] = date_elems[i].text
            
            # Find corresponding country
            country_elems = parent.findall(config['collection_item']['B330']['xpath'])
            if i < len(country_elems) and country_elems[i].text:
                priority["country"] = country_elems[i].text
            
            priorities.append(priority)
    
    if priorities:
        output_dict[config['output_field']] = priorities

def extract_bulletin_info(parent, output_dict):
    """Extract bulletin information."""
    config = sdobi_fields['bulletin_info']
    if config['type'] != 'mapping':
        return
    
    for field_code, field_config in config['fields'].items():
        elem = parent.find(field_config['xpath'])
        if elem is not None:
            info = {}
            for subfield, subfield_config in field_config['subfields'].items():
                subfield_elem = elem.find(subfield_config['xpath'])
                if subfield_elem is not None and subfield_elem.text:
                    info[subfield] = subfield_elem.text
            
            if info:
                output_dict[field_config['output_field']] = info

def extract_title_info(parent, output_dict):
    """Extract title information in multiple languages."""
    config = sdobi_fields['title_info']
    if config['type'] != 'language_collection':
        return
    
    title_section = parent.find(config['xpath'])
    if title_section is not None:
        title_info = {}
        lang_elems = title_section.findall(config['language_tag'])
        text_elems = title_section.findall(config['text_tag'])
        
        for i, lang_elem in enumerate(lang_elems):
            if lang_elem.text and i < len(text_elems) and text_elems[i].text:
                title_info[lang_elem.text] = text_elems[i].text
        
        if title_info:
            output_dict[config['output_field']] = title_info

def extract_people_info(parent, output_dict):
    """Extract information about people and organizations."""
    people_config = sdobi_fields['people']
    
    for entity_type, config in people_config.items():
        if config['type'] != 'entity_collection':
            continue
        
        entities = []
        entity_elems = parent.findall(config['xpath'])
        
        for entity_elem in entity_elems:
            entity = {}
            
            # Extract simple fields
            for field_name, field_config in config['fields'].items():
                if isinstance(field_config, dict) and field_config.get('type') != 'address':
                    elem = entity_elem.find(field_config['xpath'])
                    if elem is not None and elem.text:
                        entity[field_name] = elem.text.strip()
            
            # Extract address fields
            if 'address' in config['fields']:
                addr_config = config['fields']['address']
                addr_elem = entity_elem.find(addr_config['path'])
                
                if addr_elem is not None:
                    address = {}
                    for addr_field, addr_field_config in addr_config['fields'].items():
                        field_elem = addr_elem.find(addr_field_config['xpath'])
                        if field_elem is not None and field_elem.text:
                            address[addr_field] = field_elem.text.strip()
                    
                    if address:
                        entity['address'] = address
            
            if entity:
                entities.append(entity)
        
        if entities:
            output_dict[config['output_field']] = entities

def extract_designated_states(parent, output_dict):
    """Extract designated states."""
    config = sdobi_fields['designated_states']
    if config['type'] != 'simple_collection':
        return
    
    states = []
    state_elems = parent.findall(config['xpath'])
    
    for state_elem in state_elems:
        if state_elem.text:
            states.append(state_elem.text)
    
    if states:
        output_dict[config['output_field']] = states

def extract_abstract(root, output_dict):
    """Extract abstract from patent document."""
    config = abstract_field
    if config['type'] != 'text_collection':
        return
    
    abstract_section = root.find(config['path'])
    if abstract_section:
        abstract_text = ""
        for p in abstract_section.findall(config['xpath']):
            p_text = p.text or ""
            p_text = re.sub(r'<!--.*?-->', '', p_text)  # Remove XML comments
            if p_text.strip():
                abstract_text += p_text.strip() + " "
        
        if abstract_text.strip():
            output_dict[config['output_field']] = abstract_text.strip()

In [7]:
def extract_heading_text(elem):
    """Extract text from heading element using defined patterns."""
    elem_str = ET.tostring(elem, encoding='unicode')
    
    # Try patterns from the field definition
    for pattern in description_fields['heading_patterns']:
        match = re.search(pattern, elem_str)
        if match:
            return match.group(1).strip()
    
    # Fallback extraction logic
    if len(elem) > 0:
        if elem[0].tag in ['u', 'b', 'i']:
            if len(elem[0]) > 0 and elem[0][0].tag in ['u', 'b', 'i']:
                return (elem[0][0].text or "").strip()
            else:
                return (elem[0].text or "").strip()
        else:
            return (elem[0].text or "").strip()
    else:
        return (elem.text or "").strip()

def is_main_heading(heading_text, debug=False):
    """Determine if a heading is a main section heading using field definitions."""
    if not heading_text:
        return False
        
    # Convert to lowercase for case-insensitive matching
    heading_lower = heading_text.lower()
    
    # Check against whitelist from field definitions
    for pattern in description_fields['main_heading_whitelist']:
        if re.match(pattern, heading_lower):
            if debug:
                print(f"MAIN HEADING (whitelist): {heading_text}")
            return True
    
    # Check against extended patterns
    for pattern in description_fields['extended_main_patterns']:
        if re.search(pattern, heading_lower):
            # Verify it's not in the blacklist
            blacklisted = False
            for bl_pattern in description_fields['blacklist_patterns']:
                if re.search(bl_pattern, heading_lower):
                    blacklisted = True
                    break
            
            if not blacklisted:
                if debug:
                    print(f"MAIN HEADING (extended): {heading_text}")
                return True
    
    # Additional checks from your existing implementation
    if len(heading_text) > 25:
        if debug:
            print(f"EXCLUDED (too long): {heading_text}")
        return False
    
    if heading_text.istitle() and len(heading_text.split()) <= 4:
        if debug:
            print(f"MAIN HEADING (fallback): {heading_text}")
        return True
    
    if debug:
        print(f"EXCLUDED (default): {heading_text}")
    return False

def extract_main_sections(root, debug=False):
    """
    Extract main sections from patent XML, with stricter filtering to exclude 
    ALL compounds and examples.
    """
    main_sections = []
    
    # Find description section
    description = root.find('.//description')
    if description:
        # First, collect all elements in order
        ordered_elements = []
        for elem in description:
            if elem.tag in ['heading', 'p']:
                ordered_elements.append(elem)
        
        # Process elements and associate paragraphs with headings
        current_main_section = None
        
        for elem in ordered_elements:
            if elem.tag == 'heading':
                heading_id = elem.get('id')
                heading_text = extract_heading_text(elem)
                
                # Use the enhanced is_main_heading function with debug option
                if heading_text and is_main_heading(heading_text, debug=debug):
                    current_main_section = {
                        "heading_id": heading_id,
                        "heading_text": heading_text,
                        "paragraphs": []
                    }
                    main_sections.append(current_main_section)
            
            elif elem.tag == 'p' and current_main_section:
                p_id = elem.get('id')
                p_num = elem.get('num')
                p_text = elem.text or ""
                
                # Remove XML comments
                p_text = re.sub(r'<!--.*?-->', '', p_text)
                
                if p_text.strip():
                    current_main_section["paragraphs"].append({
                        "p_id": p_id,
                        "p_number": p_num,
                        "text": p_text.strip()
                    })
    
    return main_sections


def extract_claims(root):
    """Extract claims from patent XML."""
    claims = []
    
    # Find claims section
    claims_section = root.find('.//claims')
    if claims_section:
        for claim_elem in claims_section.findall('.//claim'):
            claim_num = claim_elem.get('num')
            claim_text = ""
            
            # Extract claim text from various possible structures
            for text_elem in claim_elem.iter():
                if text_elem.tag in ['claim-text', 'p'] and text_elem.text:
                    text = text_elem.text.strip()
                    if text:
                        claim_text += text + " "
            
            claim_text = claim_text.strip()
            if claim_text:
                claims.append({
                    "claim_number": claim_num,
                    "text": claim_text
                })
    
    return claims


In [8]:
def process_patent_xml(xml_file_path, output_file=None, debug=False):
    """Process patent XML and extract structured data."""
    try:
        tree = ET.parse(xml_file_path)
        root = tree.getroot()
        
        patent_data = {
            "bibliographic_data": extract_bibliographic_data(root),
            "main_sections": extract_main_sections(root, debug),
            "claims": extract_claims(root)
        }
        
        # Save output if needed
        if output_file:
            import json
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(patent_data, f, indent=2, ensure_ascii=False)
        
        return patent_data
    
    except Exception as e:
        print(f"Error processing patent XML: {e}")
        import traceback
        traceback.print_exc()
        return {"bibliographic_data": {}, "main_sections": [], "claims": []}

In [10]:
import os
import sys
# from xml_loader import process_patent_xml

def process_epo_xml_to_json(xml_file_path, output_file=None):
    """
    Process an EPO patent XML file and save the result as JSON
    
    Args:
        xml_file_path: Path to the XML file
        output_file: Path to save the output JSON (if None, will use the same name as XML with .json extension)
    
    Returns:
        Path to the saved JSON file
    """
    # Create default output path if not provided
    if output_file is None:
        base_name = os.path.splitext(os.path.basename(xml_file_path))[0]
        output_dir = os.path.dirname(xml_file_path)
        output_file = os.path.join(output_dir, f"{base_name}_parsed.json")
    
    # Ensure output directory exists
    os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True)
    
    print(f"🔍 Processing {os.path.basename(xml_file_path)}...")
    print(f"💾 Saving output to {output_file}")
    
    # Process the patent and save to JSON
    result = process_patent_xml(xml_file_path, output_file=output_file)
    
    # Output summary
    print("\n✅ Processing complete!")
    print(f"  • Bibliographic fields: {len(result['bibliographic_data'])}")
    print(f"  • Main sections: {len(result['main_sections'])}")
    print(f"  • Claims: {len(result['claims'])}")
    
    return output_file

# # Use command line argument if provided, otherwise prompt for file path
# if len(sys.argv) > 1:
#     xml_path = sys.argv[1]
# else:
#     xml_path = input("Enter path to EPO XML file: ")

# # Optional output path
# out_path = input("Enter output JSON path (leave blank for default): ")
# if not out_path.strip():
#     out_path = None
    
# json_file = process_epo_xml_to_json(xml_path, out_path)

# print(f"\n📄 JSON file saved: {json_file}")

In [30]:
process_patent_xml("EP22914805W1A9.xml")

{'bibliographic_data': {'doc_id': 'EP22914805A9W1',
  'file': 'EP22914805W1A9.xml',
  'language': 'en',
  'country': 'EP',
  'doc_number': '4458984',
  'kind_code': 'A9',
  'correction_code': 'W1',
  'publication_date': '20250604',
  'status': 'c',
  'dtd_version': 'ep-patent-document-v1-7',
  'document_type': 'CORRECTED EUROPEAN PATENT APPLICATION',
  'kind': 'A9',
  'publication_date_full': '20250604',
  'application_number': '22914805.1',
  'application_date': '20221227',
  'ipc_classes': ['C12Q   1/686       20180101AFI20230707BHEP',
   'C12N  15/11        20060101ALI20230707BHEP'],
  'cpc_classes': ['C12Q   1/686       20130101 LI20241111BHEP',
   'C12Q2563/159       20130101 LI20241111BHEP',
   'C12Q2563/107       20130101 LI20241111BHEP',
   'C12Q2527/101       20130101 LI20241111BHEP',
   'C12Q2537/143       20130101 LI20241111BHEP',
   'C12Q2561/113       20130101 LI20241111BHEP',
   'C12Q   1/686       20130101 FI20230724BHEP',
   'C12Q   1/6886      20130101 LA20241111BHEP']

In [15]:
# Process multiple files
xml_files = [
    'EP18823397W1B9.xml',
]

output_dir = './output'
os.makedirs(output_dir, exist_ok=True)

for xml_file in xml_files:
    base_name = os.path.splitext(os.path.basename(xml_file))[0]
    output_file = os.path.join(output_dir, f"{base_name}.json")
    process_epo_xml_to_json(xml_file, output_file)

🔍 Processing EP18823397W1B9.xml...
💾 Saving output to ./output/EP18823397W1B9.json
Error processing patent XML: mismatched tag: line 5, column 1701

✅ Processing complete!
  • Bibliographic fields: 0
  • Main sections: 0
  • Claims: 0


Traceback (most recent call last):
  File "/tmp/ipykernel_20/1692404564.py", line 4, in process_patent_xml
    tree = ET.parse(xml_file_path)
  File "/usr/local/lib/python3.10/xml/etree/ElementTree.py", line 1222, in parse
    tree.parse(source, parser)
  File "/usr/local/lib/python3.10/xml/etree/ElementTree.py", line 580, in parse
    self._root = parser._parse_whole(source)
xml.etree.ElementTree.ParseError: mismatched tag: line 5, column 1701


In [14]:
def examine_xml_structure(xml_file):
    """Debug helper to see what sections exist in the XML"""
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    sections = {
        "description": root.find('.//description') is not None,
        "headings": len(root.findall('.//description/heading')) if root.find('.//description') else 0,
        "paragraphs": len(root.findall('.//description/p')) if root.find('.//description') else 0,
        "claims": root.find('.//claims') is not None,
        "claim_count": len(root.findall('.//claims/claim')) if root.find('.//claims') else 0
    }
    
    print(f"\n🔍 XML Structure for {os.path.basename(xml_file)}:")
    for section, found in sections.items():
        print(f"  • {section}: {found}")
        
    # Try printing the first heading if available
    if sections["headings"] > 0:
        first_heading = root.find('.//description/heading')
        print(f"\nSample heading structure:")
        print(ET.tostring(first_heading, encoding='unicode')[:200])
    
    return sections

# Use this for debugging
examine_xml_structure('EP18823397W1B9.xml')

ParseError: mismatched tag: line 5, column 1701 (<string>)