### Input File extraction and XSLT transformation with lxml

In [132]:
from lxml import etree as et
from pathlib import Path

# Source schemas. The files are opened in binary mode so the XML parser
# receives raw bytes and takes the encoding from each document's XML
# declaration instead of the platform's default text encoding.
input_doc_simple = Path.cwd() / 'files' / 'schema' / 'input' / 'AccountAttributes.xsd'
with open(input_doc_simple, 'rb') as input_file_sim:
    schema_sim = et.parse(input_file_sim)

input_doc_complex = Path.cwd() / 'files' / 'schema' / 'input' / 'AccountsComplex.xsd'
with open(input_doc_complex, 'rb') as input_file_comp:
    schema_comp = et.parse(input_file_comp)


# Compile the stylesheet once; the same transformer is applied to both schemas.
xslt_doc =  Path.cwd() / 'files' / 'common' / 'xslt_doc.xml'
with open(xslt_doc, 'rb') as xslt_file:
    xslt = et.XSLT(et.parse(xslt_file))

transformed_data_sim = xslt(schema_sim)
transformed_data_comp = xslt(schema_comp)

# Persist both transformation results next to each other under 'generated'.
transformed_doc_sim = Path.cwd() / 'files' / 'schema' / 'generated' / 'transformed_doc_simple.xml'
transformed_doc_comp = Path.cwd() / 'files' / 'schema' / 'generated' / 'transformed_doc_complex.xml'
transformed_data_sim.write(transformed_doc_sim)
transformed_data_comp.write(transformed_doc_comp)

### Methods to build a consolidated inline Schema (No indirect refs)

In [133]:
def clone_element(elem):
    """
    Create an independent copy of an <element> node.

    The copy has the same tag and attributes as ``elem`` and a deep copy of
    each child element. Comments and processing instructions are skipped,
    and the text of ``elem`` itself is not carried over.

    The children are copied rather than appended directly because an lxml
    element can only have one parent: appending a child to the new element
    would move it out of ``elem``, leaving the source definition empty for
    every later reference to it.

    Parameters:
        elem: the element to copy; it is left unmodified.

    Returns:
        A new, parentless element that shares no nodes with ``elem``.
    """
    import copy

    new_elem = et.Element(elem.tag, attrib=dict(elem.attrib))
    for child in elem:
        # Only real elements have a string tag (comments/PIs do not).
        if isinstance(child.tag, str):
            new_elem.append(copy.deepcopy(child))
    return new_elem


In [134]:
def resolve_refs(node, definitions, _expanding=frozenset()):
    """
    Replace every <element ref="..."> under ``node`` with an inline copy of
    the definition it points to. The tree is modified in place.

    Parameters:
        node: element whose subtree (including ``node`` itself) is scanned
            for ``element`` nodes carrying a ``ref`` attribute.
        definitions: mapping of element name -> defining element. A ref whose
            name is not in the mapping is left untouched.
        _expanding: internal; names of the definitions currently being
            inlined further up the recursion. Callers should not pass it.

    Returns:
        None.

    Notes:
        * Attributes on the referring node (e.g. minOccurs/maxOccurs) override
          those of the copied definition; the ``ref`` attribute is dropped.
        * A ref that points back to a definition that is already being
          expanded is left as a ref, so recursive definitions cannot cause
          unbounded expansion.
        * A ref element without a parent cannot be replaced and is skipped.
    """
    # Snapshot the matches first: the tree is mutated while we walk it.
    for elem in list(node.iter("element")):
        ref_name = elem.get("ref")
        if not ref_name:
            continue

        ref_def = definitions.get(ref_name)
        if ref_def is None:
            continue

        # Recursive definition: stop here instead of expanding forever.
        if ref_name in _expanding:
            continue

        # No parent means there is nothing to replace the node in
        # (e.g. ``node`` itself is the ref, or the element was detached).
        parent = elem.getparent()
        if parent is None:
            continue

        # Step 1: clone the referenced definition
        new_elem = clone_element(ref_def)

        # Step 2: preserve attributes from the referring node (except 'ref')
        for k, v in elem.attrib.items():
            if k != "ref":
                new_elem.set(k, v)

        # Step 3: replace the <element ref="..."> with resolved <element>
        parent.replace(elem, new_elem)

        # Step 4: recursively resolve any nested refs inside the new element
        resolve_refs(new_elem, definitions, _expanding | {ref_name})

### Reading through the Schema

In [135]:
# Re-load the transformed complex schema. Binary mode lets the XML parser
# pick the encoding from the document itself rather than from the platform's
# default text encoding.
with open(transformed_doc_comp, 'rb') as complex_file:
    complex_doc = et.parse(complex_file)

schema_root = complex_doc.getroot()

# Retrieving all the named elements together to prepare an inline schema (No refs)
named_elements = {
    elem.get("name"): elem
    for elem in schema_root.xpath(".//element[@name]")
}
# In XSD a ref targets a top-level declaration, so when a nested element shares
# its name with a direct child of the root, the direct child must win.
named_elements.update(
    (elem.get("name"), elem)
    for elem in schema_root.xpath("./element[@name]")
)

# Update the schema_root to build the inline schema structure
resolve_refs(schema_root, named_elements)

output_path = Path.cwd() / 'files' / 'schema' / 'generated' / 'consolidated_schema.xml'
with open(output_path, "w", encoding="utf-8") as conso_file:
    conso_file.write(et.tostring(schema_root, encoding="unicode"))

### Create flattened Python Dictionary for inline Schema

In [136]:
# Model-group tags whose <element> children belong to the enclosing element.
_CONTENT_BLOCKS = ("sequence", "choice", "all")


def _flatten_model_group(group, path, result):
    """Flatten each <element> in a model group, descending into nested groups."""
    for child in group:
        if child.tag == "element":
            flatten_schema(child, path, result)
        elif child.tag in _CONTENT_BLOCKS:
            # e.g. a <choice> inside a <sequence>: same owner, same path.
            _flatten_model_group(child, path, result)


def flatten_schema(elem, parent_path="", result=None):
    """
    Flatten an inline schema subtree into a ``{path: metadata}`` dictionary.

    Parameters:
        elem: node to start from; usually an <element>.
        parent_path: slash-separated path of the enclosing elements
            ("" for the starting node).
        result: dictionary to fill; a new one is created when omitted.

    Returns:
        ``result``. Each named <element> contributes one entry keyed by its
        path ("Parent/Child/..."), holding:
            "type"      - the ``type`` attribute, else "complexType" when a
                          <complexType> descendant exists, else "unknown"
            "minOccurs" - the attribute value, "1" when absent
            "maxOccurs" - the attribute value, "1" when absent

    Child elements are collected from the sequence/choice/all groups found
    directly under ``elem`` or under its <complexType>, including groups
    nested inside those groups. Elements that share a path overwrite each
    other (last one wins).
    """
    if result is None:
        result = {}

    # Build current element path
    name = elem.get("name")
    if name:
        path = f"{parent_path}/{name}" if parent_path else name
    else:
        path = parent_path

    # Register metadata for <element>
    if elem.tag.endswith("element") and name:
        declared_type = elem.get("type")
        if declared_type is None:
            # Only search the subtree when there is no explicit type.
            has_complex = elem.find(".//complexType") is not None
            declared_type = "complexType" if has_complex else "unknown"
        result[path] = {
            "type": declared_type,
            "minOccurs": elem.get("minOccurs", "1"),
            "maxOccurs": elem.get("maxOccurs", "1")
        }

    # Recurse into the model groups owned by this node, one tag at a time
    for tag in _CONTENT_BLOCKS:
        for child in elem:
            if child.tag == tag:
                _flatten_model_group(child, path, result)
            elif child.tag == "complexType":
                for group in child.findall(tag):
                    _flatten_model_group(group, path, result)

    return result

# Look up the AcctDetails declaration in the resolved schema, flatten it into
# a path -> metadata dictionary, and print the result.
acct_details_decl = schema_root.find(".//element[@name='AcctDetails']")
acctDetails = flatten_schema(acct_details_decl)
print(acctDetails)

{'AcctDetails': {'type': 'complexType', 'minOccurs': '1', 'maxOccurs': '1'}, 'AcctDetails/RecordIndicator': {'type': 'xs:string', 'minOccurs': '1', 'maxOccurs': '1'}, 'AcctDetails/IdType': {'type': 'xs:string', 'minOccurs': '1', 'maxOccurs': '1'}, 'AcctDetails/AcctNumber': {'type': 'xs:string', 'minOccurs': '1', 'maxOccurs': '1'}, 'AcctDetails/TxnInfo': {'type': 'complexType', 'minOccurs': '1', 'maxOccurs': '1'}, 'AcctDetails/TxnInfo/IdType': {'type': 'xs:string', 'minOccurs': '1', 'maxOccurs': '1'}, 'AcctDetails/TxnInfo/AmountType': {'type': 'xs:decimal', 'minOccurs': '1', 'maxOccurs': '1'}, 'AcctDetails/TxnInfo/AcctNumber': {'type': 'xs:string', 'minOccurs': '1', 'maxOccurs': '1'}, 'AcctDetails/TxnInfo/Currency': {'type': 'xs:string', 'minOccurs': '1', 'maxOccurs': '1'}, 'AcctDetails/TxnInfo/TxnDate': {'type': 'xs:string', 'minOccurs': '1', 'maxOccurs': '1'}, 'AcctDetails/TxnTypeCode': {'type': 'xs:string', 'minOccurs': '0', 'maxOccurs': '1'}, 'AcctDetails/TxnTime': {'type': 'xs:stri