In [2]:
import xml.etree.ElementTree as ET
import json
import os

def xml_to_json(xml_file_path, json_file_path=None):
    """
    Convert an XML file to a JSON file.

    The document is turned into nested dictionaries using these conventions:

    * element attributes are stored under the ``'@attributes'`` key;
    * an element with only text (no attributes, no children) becomes a
      plain string;
    * an element with text plus attributes and/or children stores the
      stripped text under ``'#text'``;
    * repeated child tags are collected into a list, a tag that occurs once
      stays a single value;
    * an element with no attributes, text or children becomes ``{}``.

    Text that follows a child element (``element.tail``) is not read, so
    mixed content is only partially preserved.

    Args:
        xml_file_path (str): Path to XML file
        json_file_path (str): Path to output JSON file (optional). When
            omitted, the XML path with its extension replaced by ``.json``
            is used.

    Returns:
        str | None: The path of the written JSON file, or ``None`` when
        parsing or writing failed (the error is printed, not raised).
    """

    def xml_to_dict(element):
        """Convert one XML element (recursively) to a dict or string."""
        node = {}

        # Copy the attributes so the result is a plain dict that does not
        # alias the parsed tree.
        if element.attrib:
            node['@attributes'] = dict(element.attrib)

        text = (element.text or '').strip()
        if text:
            # Only collapse to a bare string when nothing else would be
            # lost; a leaf that also carries attributes keeps both.
            if len(element) == 0 and not element.attrib:
                return text
            node['#text'] = text

        for child in element:
            child_data = xml_to_dict(child)

            if child.tag in node:
                # Second or later occurrence of this tag: promote to a list.
                if not isinstance(node[child.tag], list):
                    node[child.tag] = [node[child.tag]]
                node[child.tag].append(child_data)
            else:
                node[child.tag] = child_data

        return node

    try:
        # Parse XML
        root = ET.parse(xml_file_path).getroot()

        # Keep the root tag as the single top-level key.
        json_data = {root.tag: xml_to_dict(root)}

        # Generate output filename if not provided
        if json_file_path is None:
            base_name = os.path.splitext(xml_file_path)[0]
            json_file_path = f"{base_name}.json"

        # Save as JSON; ensure_ascii=False keeps non-ASCII characters readable.
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(json_data, json_file, indent=2, ensure_ascii=False)

        print(f"✅ Successfully converted {xml_file_path} to {json_file_path}")
        return json_file_path

    except ET.ParseError as e:
        print(f"❌ Error parsing XML: {e}")
    except Exception as e:
        # Deliberate best-effort: report the problem and fall through to None.
        print(f"❌ Error during conversion: {e}")

    return None

def batch_convert_xml_to_json(directory_path="."):
    """
    Run ``xml_to_json`` on every entry of a directory whose name ends in ``.xml``.

    Only the directory itself is listed (no recursion) and the suffix match
    is case-sensitive. Each file is converted with the default output path.

    Args:
        directory_path (str): Directory to scan; defaults to the current one.

    Returns:
        None. Progress is reported with ``print``.
    """
    candidates = []
    for entry in os.listdir(directory_path):
        if entry.endswith('.xml'):
            candidates.append(entry)

    # Nothing to do: say so and stop.
    if len(candidates) == 0:
        print("No XML files found in the directory.")
        return

    print(f"Found {len(candidates)} XML file(s):")

    for entry in candidates:
        source_path = os.path.join(directory_path, entry)
        print(f"\nConverting: {entry}")
        xml_to_json(source_path)



In [9]:
# Example usage: convert one patent XML file. No output path is passed, so
# the JSON is written to the same path with the extension replaced by .json.

xml_to_json("EP22914805W1A9.xml")

# Or convert all XML files in current directory
# batch_convert_xml_to_json()

✅ Successfully converted EP22914805W1A9.xml to EP22914805W1A9.json


'EP22914805W1A9.json'

In [4]:
import json

def extract_patent_structure(json_file_path):
    """
    Extract structured data from a patent JSON file.

    The file is expected to hold a dict with an ``'ep-patent-document'`` key
    whose value uses ``'@attributes'`` / ``'#text'`` keys and stores repeated
    tags as lists. A tag that occurs only once may appear as a single dict
    or string instead of a list, so every repeated field read here is
    normalised to a list first.

    Args:
        json_file_path (str): Path to the JSON file to read.

    Returns:
        dict: with the keys
            ``'basic_info'``  - id, doc number, country, kind and publication
                                date taken from the root attributes;
            ``'headings'``    - list of ``{'id', 'text'}`` dicts;
            ``'examples'``    - headings whose text contains 'Example' or 'Step';
            ``'compounds'``   - remaining headings matching a compound indicator;
            ``'claims'``      - every ``claim`` entry from every claims group;
            ``'total_paragraphs'`` - number of ``p`` entries in the description.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """

    def as_list(value):
        """Return ``value`` as a list: None -> [], list -> itself, other -> [value]."""
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Navigate to the patent document
    patent_doc = data.get('ep-patent-document', {})
    attributes = patent_doc.get('@attributes', {})

    # Extract basic information
    basic_info = {
        'patent_id': attributes.get('id'),
        'doc_number': attributes.get('doc-number'),
        'country': attributes.get('country'),
        'kind': attributes.get('kind'),
        'date_published': attributes.get('date-publ')
    }

    # Extract description and headings
    description = patent_doc.get('description', {})
    if not isinstance(description, dict):
        # A description that is plain text has no headings or paragraphs to read.
        description = {}
    headings = as_list(description.get('heading'))
    paragraphs = as_list(description.get('p'))

    # Structure headings
    structured_headings = []
    for heading in headings:
        if isinstance(heading, dict):
            # Prefer underlined, then bold, then direct text. This applies
            # whether or not the heading carries attributes, so a dict is
            # never stringified wholesale.
            heading_text = heading.get('u') or heading.get('b') or heading.get('#text', '')
            structured_headings.append({
                'id': heading.get('@attributes', {}).get('id'),
                'text': heading_text
            })
        else:
            # String heading
            structured_headings.append({
                'id': None,
                'text': str(heading)
            })

    # Extract claims: tolerate a single claims group and a single claim.
    extracted_claims = []
    for claim_group in as_list(patent_doc.get('claims')):
        if isinstance(claim_group, dict) and 'claim' in claim_group:
            extracted_claims.extend(as_list(claim_group['claim']))

    # Extract examples and compounds
    examples = []
    compounds = []

    # Look for examples in headings
    for heading in structured_headings:
        text = heading.get('text', '')
        if 'Example' in text or 'Step' in text:
            examples.append({
                'title': text,
                'id': heading.get('id')
            })
        # NOTE(review): 'Step A:' / 'Step B:' can never match here because any
        # text containing 'Step' is already taken by the branch above - confirm
        # whether step headings were meant to be listed as compounds.
        elif any(compound_indicator in text for compound_indicator in ['Compound', '(2s)-2-', 'Step A:', 'Step B:']):
            compounds.append({
                'name': text,
                'id': heading.get('id')
            })

    return {
        'basic_info': basic_info,
        'headings': structured_headings,
        'examples': examples,
        'compounds': compounds,
        'claims': extracted_claims,
        'total_paragraphs': len(paragraphs)
    }

# Usage: summarise a patent JSON file; the path is opened directly, so the
# file must exist relative to the working directory.
structured_data = extract_patent_structure('EP18823397W1B9.json')

# Pretty print the results
# (json is already imported at the top of this cell; the import below is redundant)
import json
print(json.dumps(structured_data, indent=2, ensure_ascii=False))

{
  "basic_info": {
    "patent_id": "EP18823397B9W1",
    "doc_number": "3647315",
    "country": "EP",
    "kind": "B9",
    "date_published": "20250604"
  },
  "headings": [
    {
      "id": "h0001",
      "text": "Technical field"
    },
    {
      "id": "h0002",
      "text": "Technical background"
    },
    {
      "id": "h0003",
      "text": "Summary of the Invention"
    },
    {
      "id": "h0004",
      "text": "Action and effect of the invention:"
    },
    {
      "id": "h0005",
      "text": "Detailed description of the invention"
    },
    {
      "id": "h0006",
      "text": "Example 1"
    },
    {
      "id": null,
      "text": "(2s)-2-(2,6-dichloro-4-(2-(hydroxy(phenyl)phosphoryl)ethyl)benzamido)-3-(3-(methylsulfon yl)phenyl)propionic acid"
    },
    {
      "id": null,
      "text": "Step A: methoxyphenylphosphoryl chloride (Compound 1.1)"
    },
    {
      "id": null,
      "text": "Step B: Methyl 2,6-dichloro-4-((phenyl(methoxy)phosphoryl)ethynyl)benzoate

In [8]:
!pip install pandas
import json
import pandas as pd

def create_patent_dataframes(json_file_path):
    """
    Create structured DataFrames from a patent JSON file.

    The file is expected to hold a dict with an ``'ep-patent-document'`` key
    whose value uses ``'@attributes'`` / ``'#text'`` keys. A heading that
    occurs only once may be stored as a single dict or string rather than a
    list; it is normalised to a one-element list here.

    Args:
        json_file_path (str): Path to the JSON file to read.

    Returns:
        dict[str, pandas.DataFrame]: with the keys
            ``'basic_info'`` - one row built from the root attributes;
            ``'headings'``   - columns sequence, id, text, type;
            ``'examples'``   - headings whose text starts with 'Example'
                               (columns example_number, title, sequence, id);
            ``'compounds'``  - headings containing '(2s)-2-' or 'Compound', or
                               starting with 'Step' (columns compound_name,
                               sequence, id, category).
        Frames with no rows still carry their column names.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    heading_columns = ['sequence', 'id', 'text', 'type']
    example_columns = ['example_number', 'title', 'sequence', 'id']
    compound_columns = ['compound_name', 'sequence', 'id', 'category']

    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    patent_doc = data.get('ep-patent-document', {})
    attributes = patent_doc.get('@attributes', {})
    description = patent_doc.get('description', {})
    if not isinstance(description, dict):
        # A description that is plain text has no headings to read.
        description = {}

    # 1. Basic Info DataFrame
    basic_info = pd.DataFrame([{
        'patent_id': attributes.get('id'),
        'doc_number': attributes.get('doc-number'),
        'country': attributes.get('country'),
        'kind': attributes.get('kind'),
        'date_published': attributes.get('date-publ'),
        'language': attributes.get('lang')
    }])

    # 2. Headings DataFrame
    headings = description.get('heading')
    if headings is None:
        headings = []
    elif not isinstance(headings, list):
        # A lone heading must not be iterated as dict keys or characters.
        headings = [headings]

    headings_data = []
    for sequence, heading in enumerate(headings, start=1):
        if isinstance(heading, dict):
            # Prefer underlined, then bold, then direct text.
            heading_text = heading.get('u') or heading.get('b') or heading.get('#text', '')
            headings_data.append({
                'sequence': sequence,
                'id': heading.get('@attributes', {}).get('id'),
                'text': heading_text,
                'type': 'formatted' if ('@attributes' in heading) else 'simple'
            })
        else:
            headings_data.append({
                'sequence': sequence,
                'id': None,
                'text': str(heading),
                'type': 'text'
            })

    headings_df = pd.DataFrame(headings_data, columns=heading_columns)

    # 3 + 4. Classify headings in one pass over the records already in memory.
    examples_data = []
    compounds_data = []
    for record in headings_data:
        text = record['text']
        if not isinstance(text, str):
            # Nested markup can leave a dict/list here; it cannot be classified.
            continue

        if text.startswith('Example'):
            words = text.split()
            examples_data.append({
                'example_number': words[1] if len(words) > 1 else 'Unknown',
                'title': text,
                'sequence': record['sequence'],
                'id': record['id']
            })

        # Independent of the example check, as in the separate loops before.
        if '(2s)-2-' in text or 'Compound' in text or text.startswith('Step'):
            compounds_data.append({
                'compound_name': text,
                'sequence': record['sequence'],
                'id': record['id'],
                'category': 'step' if text.startswith('Step') else 'compound'
            })

    examples_df = pd.DataFrame(examples_data, columns=example_columns)
    compounds_df = pd.DataFrame(compounds_data, columns=compound_columns)

    return {
        'basic_info': basic_info,
        'headings': headings_df,
        'examples': examples_df,
        'compounds': compounds_df
    }

# Usage: build the DataFrames from a patent JSON file. The path is opened
# directly, so a missing file raises FileNotFoundError.
dataframes = create_patent_dataframes('EP18823397W1B9.json')

# Display results: the single-row basic info, the example headings, and the
# first ten headings in document order.
print("Basic Info:")
print(dataframes['basic_info'])
print("\nExamples:")
print(dataframes['examples'])
print("\nFirst 10 Headings:")
print(dataframes['headings'].head(10))

[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: pip install --upgrade pip


FileNotFoundError: [Errno 2] No such file or directory: 'EP22914805W1A9.json'