# XSD to CSV Conversion Testing

### Import needed libraries

- `zipfile` – for working with ZIP archives
- `json` – for reading the JSON configuration file
- `xmlschema` – to read and validate XSD files (`pip install xmlschema`)
- `pandas` – for working with tables and dataframes (`pip install pandas`)
- `lxml.etree` – to parse and read XML/XSD structure (`pip install lxml`)
- `pathlib.Path` – for path handling

Make sure to install missing ones using `pip` if needed.

In [1]:
import zipfile
import json
import xmlschema
import pandas as pd
from lxml import etree
from pathlib import Path

### Set paths for input ZIP and output folder

This cell sets:
- `input_zip_path`: path to the input ZIP file with XSDs
- `output_path`: where to extract files

If the folder doesn't exist, it will be created.

In [3]:
# Both locations live under the repository's tests folder, one level up
# from the notebook's working directory
tests_root = Path("..") / "tests"

# ZIP archive with the XSD schemas to convert
input_zip_path = tests_root / "data" / "JVF_DTM_143_XSD.zip"

# Directory the archive is unpacked into; created together with any missing
# parents, and left alone when it already exists
output_path = tests_root / "output" / "JVF_DTM_143_XSD"
output_path.mkdir(parents=True, exist_ok=True)

### Unzip the input file

Extracts the ZIP archive to the output folder and prints the extracted file names.

In [4]:
# Unpack every member of the archive into the output directory
with zipfile.ZipFile(input_zip_path) as archive:
    archive.extractall(path=output_path)

# Report the target directory and the names of its top-level entries,
# sorted alphabetically
print(f"Extracted to: {output_path}")
extracted_names = [entry.name for entry in output_path.iterdir()]
extracted_names.sort()
print(extracted_names)

Extracted to: ..\tests\output\JVF_DTM_143_XSD
['str1.csv', 'str2.csv', 'xsd']


### Load and parse XSD files

Defines a function that recursively finds all `.xsd` files in the folder and its subfolders, parses them into XML trees, and returns a list of (file name, XML root element) pairs. Files with invalid XML syntax are reported and skipped.

In [5]:
# Folder named "xsd" inside the extraction directory; it is passed to
# load_xsd_files(), which searches it recursively for *.xsd files
xsd_dir = output_path / "xsd"

def load_xsd_files(directory):
    """Parse every ``.xsd`` file found under a directory tree.

    The directory is searched recursively. A file whose content is not
    well-formed XML is reported on stdout and skipped; any other failure is
    reported and then propagated to the caller.

    Args:
        directory (Path): Root directory to search for .xsd files.

    Returns:
        list of (filename, XML root element) tuples, one per parsed file.
        Only the bare file name is stored, not the full path.

    Raises:
        Exception: Any error other than ``etree.XMLSyntaxError`` raised while
            reading a file is re-raised after being printed.
    """
    parsed = []

    for xsd_path in directory.rglob("*.xsd"):
        try:
            schema_root = etree.parse(str(xsd_path)).getroot()
        except etree.XMLSyntaxError as e:
            # Malformed XML: note it and carry on with the remaining files
            print(f"[XMLSyntaxError] Skipping {xsd_path.name}: {e}")
        except Exception as e:
            # Anything else is unexpected: report it, then let it propagate
            print(f"[UnexpectedError] Could not process {xsd_path.name}: {e}")
            raise
        else:
            parsed.append((xsd_path.name, schema_root))

    return parsed

# Parse every schema found under xsd_dir; the result is a list of
# (file name, XML root element) tuples, and files with invalid XML syntax
# are skipped
xsd_files = load_xsd_files(xsd_dir)

### Create summary table of XSD files

Builds a simple table showing each XSD file's name, its root tag, and the number of direct children of the root (`len(root)`).

In [6]:
# One row per parsed schema: its file name, the tag of its root element and
# the number of direct children of that root (len(root))
summary = [
    {
        "File Name": xsd_name,
        "Root Tag": schema_root.tag,
        "Number of Elements": len(schema_root),
    }
    for xsd_name, schema_root in xsd_files
]

# Turn the rows into a table and display it as the cell output
df_summary = pd.DataFrame(summary)
df_summary

Unnamed: 0,File Name,Root Tag,Number of Elements
0,atributy.xsd,{http://www.w3.org/2001/XMLSchema}schema,185
1,common.xsd,{http://www.w3.org/2001/XMLSchema}schema,4
2,doprovodne_informace.xsd,{http://www.w3.org/2001/XMLSchema}schema,29
3,extenze.xsd,{http://www.w3.org/2001/XMLSchema}schema,1
4,servis.xsd,{http://www.w3.org/2001/XMLSchema}schema,1
...,...,...,...
442,spatialReferencing.xsd,{http://www.w3.org/2001/XMLSchema}schema,14
443,geometry.xsd,{http://www.w3.org/2001/XMLSchema}schema,19
444,gss.xsd,{http://www.w3.org/2001/XMLSchema}schema,7
445,gts.xsd,{http://www.w3.org/2001/XMLSchema}schema,7


### Table of XSD Element References

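This cell generates the table `str2.csv`, which lists the element references (`ref` attributes) found inside the complex types of each object XSD file. The CSV is written next to the extraction folder (`output_path.parent`).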

Each row includes:
- **filename**: the XSD file name,
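- **nazev**: the reference name; each `atr:` reference gets its own row, while all `gml:` references of one complex type are grouped into a single row formatted as a list (e.g. `['gml:surfaceProperty']`),
- **minOccurs**: optionality (`0` if the element is optional, empty if the attribute is not set).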

In [13]:
# Folder holding the per-object schemas
xsd_objects_path = output_path / "xsd" / "objekty"

records = []
seen_global = set()  # (filename, nazev) pairs already handled, to avoid duplicates

# Only the files directly inside the folder are visited (no recursion)
for file_path in xsd_objects_path.glob("*.xsd"):
    try:
        schema_root = etree.parse(str(file_path)).getroot()

        # Rows are emitted per <complexType>, in the order:
        # plain atr: refs, one grouped gml: row, then atr: refs ending in "KI"
        for complex_type in schema_root.xpath(".//*[local-name()='complexType']"):
            plain_atr_rows = []
            ki_atr_rows = []
            gml_names = []
            gml_min_occurs = []
            optional_ki_seen = False

            for element in complex_type.xpath(".//*[local-name()='element']"):
                ref = element.get("ref")

                # Elements without a ref attribute are not references
                if not ref:
                    continue

                # Each reference is processed once per file
                pair = (file_path.name, ref)
                if pair in seen_global:
                    continue
                seen_global.add(pair)

                min_occurs = element.get("minOccurs")

                # gml: references are only collected here and grouped later
                if ref.startswith("gml:"):
                    gml_names.append(ref)
                    gml_min_occurs.append(min_occurs)
                    continue

                # Any prefix other than atr: is ignored
                if not ref.startswith("atr:"):
                    continue

                row = {
                    "filename": file_path.name,
                    "nazev": ref,
                    "minOccurs": min_occurs  # May be None
                }
                if not ref.endswith("KI"):
                    plain_atr_rows.append(row)
                    continue

                ki_atr_rows.append(row)
                if min_occurs == "0":
                    optional_ki_seen = True

            records.extend(plain_atr_rows)

            if gml_names:
                # The grouped row is optional when any gml: reference has
                # minOccurs="0" or an optional atr:...KI reference was seen
                any_optional = "0" in gml_min_occurs or optional_ki_seen
                records.append({
                    "filename": file_path.name,
                    "nazev": str(gml_names),  # list rendered as text, e.g. ['gml:...']
                    "minOccurs": "0" if any_optional else None
                })

            records.extend(ki_atr_rows)

    except etree.XMLSyntaxError as e:
        print(f"[XMLSyntaxError] {file_path.name}: {e}")
    except Exception as e:
        print(f"[Error] Failed to process {file_path.name}: {e}")
        raise

# Build the table, write it next to the extraction folder and display it
df_str2 = pd.DataFrame(records)
df_str2.to_csv(output_path.parent / "str2.csv", index=False)
df_str2

Unnamed: 0,filename,nazev,minOccurs
0,BP_plynovodni_site-plocha.xsd,atr:SpolecneAtributyVsechObjektu,
1,BP_plynovodni_site-plocha.xsd,atr:SpolecneAtributyObjektuPasemTI,
2,BP_plynovodni_site-plocha.xsd,['gml:surfaceProperty'],0
3,BP_plynovodni_site-plocha.xsd,atr:OblastObjektuKI,0
4,BP_podzemniho_zasobniku_plynu-plocha.xsd,atr:SpolecneAtributyVsechObjektu,
...,...,...,...
1541,zemedelska_plocha-defbod.xsd,['gml:pointProperty'],
1542,zemedelska_plocha-plocha.xsd,atr:SpolecneAtributyVsechObjektu,
1543,zemedelska_plocha-plocha.xsd,atr:SpolecneAtributyObjektuZPS,
1544,zemedelska_plocha-plocha.xsd,atr:TypZemedelskePlochy,


## str1 - version 1 (old)

* manually defined columns


In [7]:
# # Path to the folder with extracted .xsd files
# xsd_objects_path = output_path / "xsd" / "objekty"
# 
# # Initialize an empty list for the result records
# records = []
# 
# # Iterate over all .xsd files in the folder
# for file_path in xsd_objects_path.glob("*.xsd"):
#     try:
#         # Parse the XSD file into an XML tree
#         tree = etree.parse(str(file_path))
#         root = tree.getroot()
# 
#         # Extract the target namespace of the schema (if present)
#         namespace = root.attrib.get("targetNamespace", "")
#         
#         # Initialize default placeholder values
#         element_name = ""
#         element_type = ""
#         code_base_fixed = ""
#         code_base_use = ""
#         code_suffix_fixed = ""
#         code_suffix_use = ""
#         kategorie_objektu = ""
#         skupina_objektu = ""
#         obsahova_cast = ""
#         geom_minOccurs = ""
#         geom = ""
#         oblastObjektuKI = 0
#         ki_minOccurs = ""
# 
#         # Find all complexType definitions
#         complex_types = root.xpath(".//*[local-name()='complexType']")
# 
#         for complex_type in complex_types:
#             # Search for nested element definitions within each complexType
#             for element in complex_type.xpath(".//*[local-name()='element']"):
#                 name = element.get("name")
#                 ref = element.get("ref")
#                 min_occurs = element.get("minOccurs")
# 
#                 # Locate the first element whose type starts with this namespace
#                 first_element = root.xpath(
#                     f".//*[local-name()='element'][starts-with(@type, '{namespace}:')]"
#                 )
#                 if first_element:
#                     element_name = first_element[0].get("name")
#                     element_type = first_element[0].get("type")
# 
#                 # If this element is ObjektovyTypNazev, extract its attributes
#                 if name == "ObjektovyTypNazev":
#                     code_base_fixed_list = element.xpath(
#                         ".//*[local-name()='attribute' and @name='code_base']/@fixed"
#                     )
#                     code_base_use_list = element.xpath(
#                         ".//*[local-name()='attribute' and @name='code_base']/@use"
#                     )
#                     code_suffix_fixed_list = element.xpath(
#                         ".//*[local-name()='attribute' and @name='code_suffix']/@fixed"
#                     )
#                     code_suffix_use_list = element.xpath(
#                         ".//*[local-name()='attribute' and @name='code_suffix']/@use"
#                     )
# 
#                     if code_base_fixed_list:
#                         code_base_fixed = int(code_base_fixed_list[0])
#                     if code_base_use_list:
#                         code_base_use = code_base_use_list[0]
#                     if code_suffix_fixed_list:
#                         code_suffix_fixed = int(code_suffix_fixed_list[0])
#                     if code_suffix_use_list:
#                         code_suffix_use = code_suffix_use_list[0]
# 
#                 # If this element is KategorieObjektu, record its fixed value
#                 elif name == "KategorieObjektu":
#                     kategorie_objektu = element.get("fixed", "")
#                 # If this element is SkupinaObjektu, record its fixed value
#                 elif name == "SkupinaObjektu":
#                     skupina_objektu = element.get("fixed", "")
#                 # If this element is ObsahovaCast, record its fixed value
#                 elif name == "ObsahovaCast":
#                     obsahova_cast = element.get("fixed", "")
# 
#                 # If this element is GeometrieObjektu, record reference and minOccurs
#                 elif name == "GeometrieObjektu":
#                     geom_minOccurs = min_occurs
#                     ref_elem = element.xpath(
#                         ".//*[local-name()='element' and @ref]"
#                     )
#                     geom = ref_elem[0].get("ref") if ref_elem else ""
# 
#                 # If the element references atr:OblastObjektuKI, set the flag
#                 if ref == "atr:OblastObjektuKI":
#                     ki_minOccurs = min_occurs
#                     oblastObjektuKI = 1
# 
#         # Append the collected data for this XSD file to records
#         records.append({
#             "filename": file_path.name,
#             "namespace": namespace,
#             "zaznamy": 'XXX',
#             "geom":geom,
#             "OblastObjektuKI":oblastObjektuKI,
#             "name":element_name,
#             "type":element_type,
#             "code_base_fixed":code_base_fixed,
#             "code_base_use":code_base_use,
#             "code_suffix_fixed":code_suffix_fixed,
#             "code_suffix_use":code_suffix_use,
#             "KategorieObjektu":kategorie_objektu,
#             "SkupinaObjektu":skupina_objektu,
#             "ObsahovaCast":obsahova_cast,
#             "geom_minOccurs":geom_minOccurs,
#             "oblaskKI_minOccurs":ki_minOccurs
#         })
# 
#     except etree.XMLSyntaxError as e:
#         print(f"[XMLSyntaxError] {file_path.name}: {e}")
#     except Exception as e:
#         print(f"[Error] Failed to process {file_path.name}: {e}")
#         raise
# 
# # Create DataFrame from collected records
# df_str1 = pd.DataFrame(records)
# 
# # Save DataFrame to CSV file
# output_csv_path = output_path.parent / "str1.csv"
# output_csv_path.parent.mkdir(parents=True, exist_ok=True)
# df_str1.to_csv(output_csv_path, index=False)
# 
# # Show the DataFrame
# df_str1

Unnamed: 0,filename,namespace,zaznamy,geom,OblastObjektuKI,name,type,code_base_fixed,code_base_use,code_suffix_fixed,code_suffix_use,KategorieObjektu,SkupinaObjektu,ObsahovaCast,geom_minOccurs,oblaskKI_minOccurs
0,BP_plynovodni_site-plocha.xsd,bpplsi,XXX,gml:surfaceProperty,1,BPPlynovodniSite,bpplsi:BPPlynovodniSiteType,100000290,required,3,required,Ochranná a bezpečnostní pásma,Ochranné a bezpečnostní pásmo,TI,0,0
1,BP_podzemniho_zasobniku_plynu-plocha.xsd,bpppol,XXX,gml:surfaceProperty,1,BPPodzemnihoZasobnikuPlynu,bpppol:BPPodzemnihoZasobnikuPlynuType,100000369,required,3,required,Ochranná a bezpečnostní pásma,Ochranné a bezpečnostní pásmo,TI,0,0
2,BP_zarizeni_PKO-plocha.xsd,bpzpko,XXX,gml:surfaceProperty,1,BPZarizeniPKO,bpzpko:BPZarizeniPKOType,100000291,required,3,required,Ochranná a bezpečnostní pásma,Ochranné a bezpečnostní pásmo,TI,0,0
3,budova-defbod.xsd,buddef,XXX,gml:pointProperty,0,BudovaDefinicniBod,buddef:BudovaDefinicniBodType,100000001,required,4,required,Budovy,Objekt budovy,ZPS,,
4,budova-plocha.xsd,budpol,XXX,gml:surfaceProperty,0,BudovaPlocha,budpol:BudovaPlochaType,100000001,required,3,required,Budovy,Objekt budovy,ZPS,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,zed-linie.xsd,zedlin,XXX,gml:curveProperty,0,ZedLinie,zedlin:ZedLinieType,100000168,required,2,required,Součásti a příslušenství staveb,Stavba společná pro více skupin,ZPS,,
354,zed-plocha.xsd,zedpol,XXX,gml:surfaceProperty,0,ZedPlocha,zedpol:ZedPlochaType,100000168,required,3,required,Součásti a příslušenství staveb,Stavba společná pro více skupin,ZPS,,
355,zeleznicni_prejezd-plocha.xsd,zprpol,XXX,gml:surfaceProperty,0,ZeleznicniPrejezd,zprpol:ZeleznicniPrejezdType,100000022,required,3,required,Dopravní stavby,Drážní doprava,DI,,
356,zemedelska_plocha-defbod.xsd,zepdef,XXX,gml:pointProperty,0,ZemedelskaPlochaDefinicniBod,zepdef:ZemedelskaPlochaDefinicniBodType,100000207,required,4,required,"Vodstvo, vegetace a terén",Hospodářská plocha,ZPS,,


## str1 - version with config file

* target fields defined in JSON file - config_str1_test.json
* geometry is handled separately (will fix later)
* target fields are currently duplicated in the JSON: once under `element_types` (what to extract from the XSD) and once under `output_fields` (the names of the CSV columns)

In [16]:
# Load the JSON config that drives the extraction. Two keys are read:
#   "output_fields" - names used to pre-populate every output row
#   "element_types" - rules keyed by an element's name or ref attribute
with open("../tests/data/config_str1_test.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Path to the folder with extracted .xsd files
xsd_objects_path = output_path / "xsd" / "objekty"
records = []
output_fields = config["output_fields"]
element_rules = config["element_types"]

# One output row per schema file directly inside the folder
for file_path in xsd_objects_path.glob("*.xsd"):
    try:
        tree = etree.parse(str(file_path))
        root = tree.getroot()
        namespace = root.attrib.get("targetNamespace", "")

        # Start with every configured column empty, then set the columns
        # that have fixed initial values
        data = {key: "" for key in output_fields}
        data["filename"] = file_path.name
        data["namespace"] = namespace
        data["zaznamy"] = []
        data["OblastObjektuKI"] = 0

        # Top-level element: the first direct child <element> of the schema
        # whose type attribute starts with "<targetNamespace>:"
        top_level_elems = root.xpath(
            f"./*[local-name()='element' and starts-with(@type, '{namespace}:')]"
        )
        if top_level_elems:
            main_elem = top_level_elems[0]
            data["name"] = main_elem.get("name", "")
            data["type"] = main_elem.get("type", "")

        # Find all complex types and iterate over elements inside
        complex_types = root.xpath(".//*[local-name()='complexType']")
        for complex_type in complex_types:
            for element in complex_type.xpath(".//*[local-name()='element']"):
                name = element.get("name")
                ref = element.get("ref")
                min_occurs = element.get("minOccurs")

                # A rule applies when the element's name, or failing that its
                # ref, is a key of the config. `match` records which attribute
                # matched; `column_base` is the prefix-free key used for the
                # minOccurs columns. It is set in BOTH branches so that a
                # name-matched rule never reads a value left over from an
                # earlier ref-matched element.
                if name in element_rules:
                    match = "name"
                    matched_key = name
                    column_base = name
                elif ref in element_rules:
                    match = "ref"
                    matched_key = ref
                    column_base = ref.split(":")[-1]
                else:
                    continue
                etype = element_rules[matched_key]

                # Handle attributes: for each configured attribute, copy the
                # requested properties of the nested <attribute> declaration.
                # The declaration is located by the same attribute (name or
                # ref) that matched the element.
                if "attributes" in etype:
                    for attr_name, props in etype["attributes"].items():
                        for prop in props:
                            val = element.xpath(
                                f".//*[local-name()='attribute' and @{match}='{attr_name}']/@{prop}"
                            )
                            if val:
                                data[f"{attr_name}_{prop}"] = val[0]

                # Rule entries set to true name attributes to read directly
                # from the element itself; "geometry" is handled separately
                true_flags = [
                    key for key, value in etype.items()
                    if value is True and key != "geometry"
                ]

                for flag in true_flags:
                    val = element.get(flag)
                    if val is None:
                        continue
                    if flag == "minOccurs":
                        # Store the occurrence constraint and mark the
                        # element as present
                        data[f"{column_base}_{flag}"] = val
                        data[column_base] = 1
                    else:
                        # Other flags are stored under the matched key
                        # exactly as written in the config (a ref keeps its
                        # prefix)
                        data[matched_key] = val

                # Handle special case for geometry: keep the element's own
                # minOccurs and the refs of all nested <element ref="...">
                if etype.get("geometry") is True:
                    data["geom_minOccurs"] = min_occurs
                    ref_elem = element.xpath(".//*[local-name()='element' and @ref]")
                    data["geometry"] = [geom.get("ref") for geom in ref_elem]

        records.append(data)

    except Exception as e:
        print(f"[Error] {file_path.name}: {e}")
        raise

# Output DataFrame: written next to the extraction folder and displayed
df_str1 = pd.DataFrame(records)
output_csv_path = output_path.parent / "str1.csv"
output_csv_path.parent.mkdir(parents=True, exist_ok=True)
df_str1.to_csv(output_csv_path, index=False)
df_str1


Unnamed: 0,filename,namespace,zaznamy,geometry,OblastObjektuKI,name,type,code_base_fixed,code_base_use,code_suffix_fixed,code_suffix_use,KategorieObjektu,SkupinaObjektu,ObsahovaCast,geom_minOccurs,OblastObjektuKI_minOccurs
0,BP_plynovodni_site-plocha.xsd,bpplsi,[],[gml:surfaceProperty],1,BPPlynovodniSite,bpplsi:BPPlynovodniSiteType,0100000290,required,03,required,Ochranná a bezpečnostní pásma,Ochranné a bezpečnostní pásmo,TI,0,0
1,BP_podzemniho_zasobniku_plynu-plocha.xsd,bpppol,[],[gml:surfaceProperty],1,BPPodzemnihoZasobnikuPlynu,bpppol:BPPodzemnihoZasobnikuPlynuType,0100000369,required,03,required,Ochranná a bezpečnostní pásma,Ochranné a bezpečnostní pásmo,TI,0,0
2,BP_zarizeni_PKO-plocha.xsd,bpzpko,[],[gml:surfaceProperty],1,BPZarizeniPKO,bpzpko:BPZarizeniPKOType,0100000291,required,03,required,Ochranná a bezpečnostní pásma,Ochranné a bezpečnostní pásmo,TI,0,0
3,budova-defbod.xsd,buddef,[],[gml:pointProperty],0,BudovaDefinicniBod,buddef:BudovaDefinicniBodType,0100000001,required,04,required,Budovy,Objekt budovy,ZPS,,
4,budova-plocha.xsd,budpol,[],"[gml:surfaceProperty, gml:multiCurveProperty]",0,BudovaPlocha,budpol:BudovaPlochaType,0100000001,required,03,required,Budovy,Objekt budovy,ZPS,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,zed-linie.xsd,zedlin,[],[gml:curveProperty],0,ZedLinie,zedlin:ZedLinieType,0100000168,required,02,required,Součásti a příslušenství staveb,Stavba společná pro více skupin,ZPS,,
354,zed-plocha.xsd,zedpol,[],"[gml:surfaceProperty, gml:multiCurveProperty]",0,ZedPlocha,zedpol:ZedPlochaType,0100000168,required,03,required,Součásti a příslušenství staveb,Stavba společná pro více skupin,ZPS,,
355,zeleznicni_prejezd-plocha.xsd,zprpol,[],[gml:surfaceProperty],0,ZeleznicniPrejezd,zprpol:ZeleznicniPrejezdType,0100000022,required,03,required,Dopravní stavby,Drážní doprava,DI,,
356,zemedelska_plocha-defbod.xsd,zepdef,[],[gml:pointProperty],0,ZemedelskaPlochaDefinicniBod,zepdef:ZemedelskaPlochaDefinicniBodType,0100000207,required,04,required,"Vodstvo, vegetace a terén",Hospodářská plocha,ZPS,,
