In [23]:
import random
import guardrails.datatypes as dt

from lxml.builder import E
from lxml.etree import tostring
from lxml import etree as ET

from rich import print



In [5]:
def random_scalar_datatype():
    selected_datatype = random.choice(
        [
            dt.String,
            dt.Integer,
            dt.Float,
            dt.Boolean,
            dt.Date,
            dt.Time,
        ]
    )
    return selected_datatype(None, None, None).rail_alias

In [6]:
depth = 4
width = 10

def generate_schema(curr_depth):
    if curr_depth < depth:

        # Type of current node is choice between "object", "list", "scalar"
        node_type = random.choice(["object", "list", "scalar"])

        if node_type == "object":
            # If "object", then generate random number of children
            num_children = random.randint(1, width)
            children = []
            for _ in range(num_children):
                children.append(generate_schema(curr_depth + 1))
            return E.object(*children, name=f"random_object_{random.randint(0, 1000)}")
        elif node_type == "list":
            # If "list", then generate a single child
            return E.list(generate_schema(curr_depth + 1), name=f"random_list_{random.randint(0, 1000)}")
        else:
            # If "scalar", then return a random primitive type
            datatype = random_scalar_datatype()
            return E(datatype, name=f"random_{datatype}_{random.randint(0, 1000)}")

    else:
        datatype = random_scalar_datatype()
        return E(datatype, name=f"random_{datatype}_{random.randint(0, 1000)}")

In [7]:
output = generate_schema(curr_depth=0)

In [8]:
print(tostring(output, pretty_print=True, encoding="unicode"))

In [9]:
from typing import List


def generate_random_schemas(n: int, depth: int = 4, width: int = 10) -> List[str]:
    """Generate random schemas that represent a valid schema.

    Args:
        n: The number of schemas to generate.
        depth: The depth of nesting
    """

    def random_scalar_datatype():
        selected_datatype = random.choice(
            [
                dt.String,
                dt.Integer,
                dt.Float,
                dt.Boolean,
                dt.Date,
                dt.Time,
            ]
        )
        return selected_datatype(None, None, None).rail_alias

    def generate_schema(curr_depth):
        if curr_depth < depth:
            # Type of current node is choice between "object", "list", "scalar"
            node_type = random.choice(["object", "list", "scalar"])

            if node_type == "object":
                # If "object", then generate random number of children
                num_children = random.randint(1, width)
                children = []
                for _ in range(num_children):
                    children.append(generate_schema(curr_depth + 1))
                return E.object(
                    *children, name=f"random_object_{random.randint(0, 1000)}"
                )
            elif node_type == "list":
                # If "list", then generate a single child
                return E.list(
                    generate_schema(curr_depth + 1),
                    name=f"random_list_{random.randint(0, 1000)}",
                )
            else:
                # If "scalar", then return a random primitive type
                datatype = random_scalar_datatype()
                return E(datatype, name=f"random_{datatype}_{random.randint(0, 1000)}")

        else:
            datatype = random_scalar_datatype()
            return E(datatype, name=f"random_{datatype}_{random.randint(0, 1000)}")

    schemas = []
    for _ in range(n):
        root = E("output")
        children = []
        num_children = random.randint(1, width)
        for _ in range(num_children):
            children.append(generate_schema(curr_depth=1))
        root.extend(children)

        schemas.append(root)
    return schemas


In [26]:
output = generate_random_schemas(1)[0]

print(tostring(output, pretty_print=True, encoding="unicode"))

In [36]:
from typing import Any, Dict
from dataclasses import dataclass


@dataclass
class Placeholder:
    expected_type: str


def generate_json_skeleton_from_schema(schema: ET._Element) -> Dict[str, Any]:
    def _recurse_schema(schema):
        if schema.tag == "object":
            return {
                child.attrib["name"]: _recurse_schema(child)
                for child in schema
            }
        elif schema.tag == "list":
            return [
                _recurse_schema(schema[0])
            ]
        else:
            return Placeholder(schema.tag)

    return {child.attrib["name"]: _recurse_schema(child) for child in schema}

In [37]:
generate_json_skeleton_from_schema(output)

{'random_list_473': [Placeholder(expected_type='float')],
 'random_list_224': [[[Placeholder(expected_type='integer')]]],
 'random_time_837': Placeholder(expected_type='time')}

In [43]:
from dataclasses import dataclass
from lxml import etree as ET
from typing import Any, Dict


@dataclass
class Placeholder:
    expected_type: str

    @classmethod
    def type_dict(cls):
        return {
            "string": str,
            "integer": int,
            "float": float,
            "bool": bool,
            "time": str,
            "object": dict,
            "list": list,
        }


def generate_json_skeleton_from_schema(schema: ET._Element) -> Dict[str, Any]:
    """Generate a JSON skeleton from an XML schema."""

    def _recurse_schema(schema):
        if schema.tag == "object":
            return {
                child.attrib["name"]: _recurse_schema(child)
                for child in schema
            }
        elif schema.tag == "list":
            return [
                _recurse_schema(schema[0])
            ]
        else:
            return Placeholder(schema.tag)

    return {child.attrib["name"]: _recurse_schema(child) for child in schema}


def verify_schema_against_json(xml_schema: ET._Element, generated_json: Dict[str, Any]):
    """Verify that a JSON schema is valid for a given XML."""

    json_schema = generate_json_skeleton_from_schema(xml_schema)

    def _verify_dict(schema, json):
        if set(schema.keys()) != set(json.keys()):
            return False

        for key in schema.keys():
            if isinstance(schema[key], Placeholder):
                expected_type = Placeholder.type_dict[schema[key].expected_type]
                if not isinstance(json[key], expected_type):
                    return False
            else:
                if isinstance(schema[key], dict):
                    if not isinstance(json[key], dict):
                        return False
                    if not _verify_dict(schema[key], json[key]):
                        return False
                elif isinstance(schema[key], list):
                    if not isinstance(json[key], list):
                        return False
                    if not _verify_list(schema[key][0], json[key][0]):
                        return False
                else:
                    raise ValueError(f"Unknown type {type(schema[key])}")

        return True

    def _verify_list(schema, json):
        assert len(schema) == 1  # Schema for a list should only have one child
        if not isinstance(json, list):
            return False

        if isinstance(schema[0], Placeholder):
            expected_type = Placeholder.type_dict[schema[0].expected_type]

            for item in json:
                if not isinstance(item, expected_type):
                    return False
        else:
            expected_type = type(schema[0])
            for item in json:
                if not isinstance(item, expected_type):
                    return False
                if isinstance(item, dict):
                    if not _verify_dict(schema[0], item):
                        return False
                elif isinstance(item, list):
                    if not _verify_list(schema[0], item):
                        return False
                else:
                    raise ValueError(f"Unknown type {type(item)}")

        return True

    return _verify_dict(json_schema, generated_json)


In [45]:
print(tostring(output, pretty_print=True, encoding="unicode"))

In [46]:
print(generate_json_skeleton_from_schema(output))

In [50]:
generated_output = {
    'random_list_473': [0.5, 0.6, 0.7, 0.8, 0.9],
    'random_list_224': [
        [
            [1, 2, 3, 4, 5],
            [6, 7, 8, 9, 10],
        ],
        [
            [11, 12, 13, 14, 15],
            [16, 17, 18, 19, 20],
        ],
        [
            [21, 22, 23, 24, 25],
            [26, 27, 28, 29, 30],
        ],
    ],
    'random_time_837': "12:00:00",
}

In [54]:
verify_schema_against_json(output, generated_output)

TypeError: object of type 'Placeholder' has no len()