### Input File extraction and XSLT transformation with lxml

In [148]:
import copy
from pathlib import Path

from lxml import etree as et

# Parse the two input schemas and the shared XSLT stylesheet, then run the
# stylesheet over each schema.
#
# The files are opened in binary mode so that lxml receives raw bytes and can
# honour the encoding declared in each document's XML prolog. Text mode would
# first decode the content with the platform's default locale encoding.
input_doc_simple = Path.cwd() / 'files' / 'schema' / 'input' / 'AccountAttributes.xsd'
with open(input_doc_simple, 'rb') as input_file_sim:
    schema_sim = et.parse(input_file_sim)

input_doc_complex = Path.cwd() / 'files' / 'schema' / 'input' / 'AccountsComplex.xsd'
with open(input_doc_complex, 'rb') as input_file_comp:
    schema_comp = et.parse(input_file_comp)

xslt_doc = Path.cwd() / 'files' / 'common' / 'xslt_doc.xml'
with open(xslt_doc, 'rb') as xslt_file:
    xslt = et.XSLT(et.parse(xslt_file))

# Apply the same stylesheet to both parsed schemas.
transformed_data_sim = xslt(schema_sim)
transformed_data_comp = xslt(schema_comp)

### Methods to build a consolidated inline Schema (No indirect refs)

In [149]:
def clone_element(elem):
    """
    Create an independent copy of an <element> node.

    The copy has the same tag and attributes as ``elem`` and holds a deep copy
    of each direct child element. ``elem`` itself is left unchanged.

    Children are copied instead of being appended directly because an lxml
    element can have only one parent: appending the original child would move
    it out of ``elem`` and leave the source definition empty after its first
    use.

    Args:
        elem: The element to copy.

    Returns:
        A new, parentless element. Comment and processing-instruction
        children are skipped, and the text of ``elem`` itself is not copied.
    """
    new_elem = et.Element(elem.tag, attrib=dict(elem.attrib))
    for child in elem:
        # Comments / processing instructions expose a non-string tag; skip them.
        if isinstance(child.tag, str):
            new_elem.append(copy.deepcopy(child))
    return new_elem


In [150]:
def resolve_refs(node, definitions, _active_refs=()):
    """
    Replace every <element ref="..."> under ``node`` with an inline copy of
    the definition it points to. The tree is modified in place.

    Args:
        node: Element whose subtree (including ``node`` itself) is scanned
            for <element> nodes carrying a ``ref`` attribute.
        definitions: Mapping of reference name -> defining element. Refs
            whose name is missing from the mapping are left untouched.
        _active_refs: Internal bookkeeping for the recursion: names of the
            references currently being expanded. Callers should not pass it.

    Returns:
        None.

    Notes:
        * A ref element without a parent (the root of its tree, or an element
          already detached by an earlier replacement) cannot be swapped out
          and is skipped.
        * A ref that points back to a definition currently being expanded is
          left unresolved, so that cyclic definitions cannot recurse forever.
    """
    # Snapshot the matches first: the tree is modified while we walk it.
    for elem in list(node.iter("element")):
        ref_name = elem.get("ref")
        if not ref_name:
            continue

        ref_def = definitions.get(ref_name)
        if ref_def is None:
            continue

        parent = elem.getparent()
        if parent is None:
            # Nothing to replace the element in.
            continue

        if ref_name in _active_refs:
            # Cyclic reference: expanding it again would never terminate.
            continue

        # Step 1: clone the referenced definition
        new_elem = clone_element(ref_def)

        # Step 2: preserve attributes from the referring node (except 'ref')
        for key, value in elem.attrib.items():
            if key != "ref":
                new_elem.attrib[key] = value

        # Step 3: replace the <element ref="..."> with the resolved <element>
        parent.replace(elem, new_elem)

        # Step 4: resolve any nested refs inside the new element, remembering
        # which definition is being expanded on this path
        resolve_refs(new_elem, definitions, _active_refs + (ref_name,))

### Reading through the Schema and resolving element references

In [151]:
# Root elements of the two XSLT-transformed schema documents
schema_root = transformed_data_comp.getroot()
schema_root_sim = transformed_data_sim.getroot()

# Index every named <element> of the complex schema by its name, so that a
# ref="..." can be looked up and inlined (no refs left afterwards).
# If a name occurs more than once, the last match wins.
named_elements = dict(
    (definition.get("name"), definition)
    for definition in schema_root.xpath(".//element[@name]")
)

# Rewrite both trees in place; the simple schema is resolved against the
# definitions collected from the complex schema.
resolve_refs(schema_root, named_elements)
resolve_refs(schema_root_sim, named_elements)

### Method to dump JSON contents into a file

In [152]:
import json

def write_json(content, path):
    """
    Serialise ``content`` as pretty-printed JSON into the file at ``path``.

    The file is created or overwritten and encoded as UTF-8. Output is
    indented by four spaces, and non-ASCII characters are written as-is
    rather than escaped.
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open(path, mode="w", encoding="utf-8") as target:
        json.dump(content, target, **dump_options)
### Create flattened Python Dictionary for inline Schema

In [153]:
def flatten_schema(elem, parent_path="", result=None):
    """
    Flatten a nested <element> tree into a dict keyed by slash-separated path.

    Args:
        elem: Node to start from.
        parent_path: Path of the enclosing element; empty for the top level.
        result: Dict to add entries to; a new one is created when omitted.

    Returns:
        The dict, with one entry per named <element>. Each entry holds the
        element's "type", "minOccurs" and "maxOccurs". The type falls back to
        "complexType" when a complexType descendant exists and to "unknown"
        otherwise; both occurrence values fall back to "1".
    """
    collected = {} if result is None else result

    # Path of this node: unnamed nodes simply inherit the parent's path
    own_name = elem.get("name")
    if not own_name:
        current_path = parent_path
    elif parent_path:
        current_path = f"{parent_path}/{own_name}"
    else:
        current_path = own_name

    # Only named <element> nodes get an entry
    if elem.tag.endswith("element") and own_name:
        has_complex_type = elem.find(".//complexType") is not None
        fallback_type = "complexType" if has_complex_type else "unknown"
        collected[current_path] = dict(
            type=elem.get("type", fallback_type),
            minOccurs=elem.get("minOccurs", "1"),
            maxOccurs=elem.get("maxOccurs", "1"),
        )

    # Descend into child elements of each content model group, looking both
    # under a complexType wrapper and directly under this node
    for group in ("sequence", "choice", "all"):
        nested_elements = elem.xpath(f"./complexType/{group}/element | ./{group}/element")
        for nested in nested_elements:
            flatten_schema(nested, current_path, collected)

    return collected

# Flatten the AcctDetails subtree of each resolved schema and write it as JSON.
output_comp = Path.cwd() / 'files' / 'schema' / 'generated' / 'AccountsComplex.json'
output_sim = Path.cwd() / 'files' / 'schema' / 'generated' / 'AccountsSimple.json'

acct_root_comp = schema_root.find(".//element[@name='AcctDetails']")
acct_root_sim = schema_root_sim.find(".//element[@name='AcctDetails']")

# find() returns None when nothing matches; fail with a clear message here
# instead of an AttributeError from inside flatten_schema.
if acct_root_comp is None:
    raise ValueError("No <element name='AcctDetails'> found in the complex schema")
if acct_root_sim is None:
    raise ValueError("No <element name='AcctDetails'> found in the simple schema")

acctDetails_comp = flatten_schema(acct_root_comp)
acctDetails_sim = flatten_schema(acct_root_sim)

# open(..., "w") does not create missing folders, so make sure the output
# directories exist before writing (e.g. on a fresh checkout).
output_comp.parent.mkdir(parents=True, exist_ok=True)
output_sim.parent.mkdir(parents=True, exist_ok=True)

write_json(acctDetails_comp, output_comp)
write_json(acctDetails_sim, output_sim)