# XSD to CSV Conversion Testing

In [1]:
!pip install xmlschema

import os
import zipfile
import xmlschema
from lxml import etree
import pandas as pd

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Input archive and extraction target, relative to the working directory.
# Built with os.path.join so the separators match the host OS; the previous
# hard-coded backslashes only resolved correctly on Windows.
input_file_path = os.path.join('..', 'tests', 'data', 'JVF_DTM_143_XSD.zip')
output_file_path = os.path.join('..', 'tests', 'output', 'JVF_DTM_143_XSD')

In [3]:
# Make sure the extraction target exists. exist_ok=True replaces the separate
# os.path.exists() check, which could race with another process creating the
# directory between the check and the makedirs call.
os.makedirs(output_file_path, exist_ok=True)

# Unpack the whole archive into the target directory
with zipfile.ZipFile(input_file_path, 'r') as zip_ref:
    zip_ref.extractall(output_file_path)

# Keep the top-level entries of the target directory for later inspection,
# then echo the target directory itself as confirmation
extracted_files = os.listdir(output_file_path)
print(output_file_path)

..\tests\output\JVF_DTM_143_XSD


In [4]:
# Sub-folder of the extraction target that holds the XSD schemas
xsd_dir = os.path.join(output_file_path, "xsd")

def load_xsd_files(directory):
    """
    Recursively collect and parse every ``.xsd`` file below a directory.

    Args:
        directory (str): Root folder to search; sub-folders are included.

    Returns:
        list: ``(file_name, root_element)`` tuples in the order ``os.walk``
        yields them. ``file_name`` is the bare name without its folder.
    """
    parsed = []

    for folder, _subdirs, names in os.walk(directory):
        for name in names:
            # Skip anything that is not a schema file
            if not name.endswith('.xsd'):
                continue

            # Read as bytes so the parser can apply the encoding declared
            # in the document itself
            with open(os.path.join(folder, name), 'rb') as handle:
                raw = handle.read()

            parsed.append((name, etree.XML(raw)))

    return parsed

# Parse every schema found under xsd_dir into (file_name, tree) pairs
xsd_files = load_xsd_files(directory=xsd_dir)

In [5]:
# One summary row per parsed schema: its file name, the tag of its root
# element, and how many direct children that root has
data = [
    {
        "File Name": file_name,
        "Root Tag": xsd_tree.tag,
        "Number of Elements": len(xsd_tree),
    }
    for file_name, xsd_tree in xsd_files
]

# Tabulate the summary rows
df = pd.DataFrame(data)

# Last expression of the cell, so the notebook renders the table
df

Unnamed: 0,File Name,Root Tag,Number of Elements
0,atributy.xsd,{http://www.w3.org/2001/XMLSchema}schema,185
1,common.xsd,{http://www.w3.org/2001/XMLSchema}schema,4
2,doprovodne_informace.xsd,{http://www.w3.org/2001/XMLSchema}schema,29
3,extenze.xsd,{http://www.w3.org/2001/XMLSchema}schema,1
4,servis.xsd,{http://www.w3.org/2001/XMLSchema}schema,1
...,...,...,...
442,zed-linie.xsd,{http://www.w3.org/2001/XMLSchema}schema,5
443,zed-plocha.xsd,{http://www.w3.org/2001/XMLSchema}schema,5
444,zeleznicni_prejezd-plocha.xsd,{http://www.w3.org/2001/XMLSchema}schema,5
445,zemedelska_plocha-defbod.xsd,{http://www.w3.org/2001/XMLSchema}schema,5


In [6]:
# Folder with the per-object .xsd files
xsd_objekty_path = os.path.join(output_file_path, 'xsd', 'objekty')

# One dict per output row: source file, referenced name, and minOccurs
records = []

for filename in os.listdir(xsd_objekty_path):
    if not filename.endswith(".xsd"):
        continue
    full_path = os.path.join(xsd_objekty_path, filename)

    # Parsing is the only step expected to fail, so the try covers just that
    # call and catches only I/O and lxml errors. An unreadable or malformed
    # file is reported and skipped; any other exception is a programming
    # error and is allowed to surface.
    try:
        tree = etree.parse(full_path)
    except (OSError, etree.LxmlError) as e:
        print(f"Error while processing {filename}: {e}")
        continue
    root = tree.getroot()

    # local-name() matches the tag regardless of the namespace prefix used.
    # NOTE(review): both queries use the descendant axis, so an element that
    # sits inside a nested complexType is visited once for every enclosing
    # complexType. Confirm the resulting repeated rows are intended.
    complex_types = root.xpath(".//*[local-name()='complexType']")
    for ct in complex_types:
        geometries = []  # gml: types found under this complexType
        for el in ct.xpath(".//*[local-name()='element']"):
            ref = el.get("ref")
            typ = el.get("type")
            min_occurs = el.get("minOccurs")  # None when the attribute is absent

            if ref and ref.startswith("atr:"):
                # Reference to a shared atr: attribute -> one row each
                records.append({
                    "filename": filename,
                    "nazev": ref,
                    "minOccurs": min_occurs
                })
            elif typ and typ.startswith("gml:"):
                # Geometry types are gathered and written as one row below
                geometries.append(typ)

        # All gml: types of this complexType go into a single combined row
        if geometries:
            records.append({
                "filename": filename,
                "nazev": str(geometries),
                "minOccurs": 0
            })

# Tabulate the collected rows
df_str2 = pd.DataFrame(records)

# Persist the table next to the extracted files, without the index column
output_csv_path = os.path.join(output_file_path, "str2.csv")
df_str2.to_csv(output_csv_path, index=False)

# Last expression of the cell, so the notebook renders the table
df_str2

Unnamed: 0,filename,nazev,minOccurs
0,BP_plynovodni_site-plocha.xsd,atr:SpolecneAtributyVsechObjektu,
1,BP_plynovodni_site-plocha.xsd,atr:SpolecneAtributyObjektuPasemTI,
2,BP_plynovodni_site-plocha.xsd,atr:OblastObjektuKI,0
3,BP_plynovodni_site-plocha.xsd,atr:SpolecneAtributyVsechObjektu,
4,BP_plynovodni_site-plocha.xsd,atr:SpolecneAtributyObjektuPasemTI,
...,...,...,...
4661,zemedelska_plocha-plocha.xsd,atr:SpolecneAtributyObjektuZPS,
4662,zemedelska_plocha-plocha.xsd,atr:TypZemedelskePlochy,
4663,zemedelska_plocha-plocha.xsd,atr:SpolecneAtributyVsechObjektu,
4664,zemedelska_plocha-plocha.xsd,atr:SpolecneAtributyObjektuZPS,


In [7]:
# Folder containing the per-object XSD files. It must point at 'objekty'
# (the directory the schemas are actually read from), not 'objects', and the
# loop below uses this variable so the cell no longer depends on a path
# defined in a different cell.
xsd_objects_path = os.path.join(output_file_path, 'xsd', 'objekty')


def get_target_namespace(file_path):
    """
    Return the ``targetNamespace`` attribute of an XSD file's root element.

    Args:
        file_path (str): Path to the XSD file.

    Returns:
        str: The attribute value, or an empty string when the attribute is
        absent or the file cannot be read or parsed (the error is printed).
    """
    # Only parsing is expected to fail; catch I/O and lxml errors and keep
    # this best-effort, letting unrelated exceptions surface.
    try:
        tree = etree.parse(file_path)
    except (OSError, etree.LxmlError) as e:
        print(f"Error while reading {file_path}: {e}")
        return ""
    root = tree.getroot()
    return root.attrib.get("targetNamespace", "")


# Pair every .xsd file name in the folder with its target namespace
xsd_info = []
for fname in os.listdir(xsd_objects_path):
    if fname.endswith(".xsd"):
        full_path = os.path.join(xsd_objects_path, fname)
        namespace = get_target_namespace(full_path)
        xsd_info.append({"filename": fname, "namespace": namespace})

# Tabulate the file name / namespace pairs
df_xsd_files = pd.DataFrame(xsd_info)

# Last expression of the cell, so the notebook renders the table
df_xsd_files

Unnamed: 0,filename,namespace
0,BP_plynovodni_site-plocha.xsd,bpplsi
1,BP_podzemniho_zasobniku_plynu-plocha.xsd,bpppol
2,BP_zarizeni_PKO-plocha.xsd,bpzpko
3,budova-defbod.xsd,buddef
4,budova-plocha.xsd,budpol
...,...,...
353,zed-linie.xsd,zedlin
354,zed-plocha.xsd,zedpol
355,zeleznicni_prejezd-plocha.xsd,zprpol
356,zemedelska_plocha-defbod.xsd,zepdef
