# CSV Validation: Reference vs. Generated

This cell compares reference and generated `.csv` files (both *detailed* and *summary*):

- Loads and aligns columns (based on reference file).
- Sorts rows to avoid order-related mismatches.
- Checks equality and prints the differences (missing/extra rows).

Note: the reference and generated CSVs for version 1.5.0 are known to differ, because the reference is missing some files.

In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# Define pairs (reference, generated)
file_pairs = [
    # Detailed
    ("../tests/reference/ref_detailed_1.5.0.csv", "../tests/output/detailed_1.5.0.csv"),
    ("../tests/reference/ref_detailed_1.4.3.csv", "../tests/output/detailed_1.4.3.csv"),
    # Summary
    ("../tests/reference/ref_summary_1.4.3.csv", "../tests/output/summary_1.4.3.csv"),
    ("../tests/reference/ref_summary_1.5.0.csv", "../tests/output/summary_1.5.0.csv"),
]

In [2]:
for ref_path_str, gen_path_str in file_pairs:
    reference = Path(ref_path_str)
    generated = Path(gen_path_str)

    if not reference.exists():
        print(f"Reference file not found: {reference}")
        continue
    if not generated.exists():
        print(f"Generated file not found: {generated}")
        continue

    df_ref = pd.read_csv(reference)
    df_gen = pd.read_csv(generated)

    # Sort by reference
    try:
        df_gen = df_gen[df_ref.columns.tolist()]
    except KeyError as e:
        print(f"Column mismatch between files: {reference.name} vs {generated.name}")
        print("Missing columns:", e)
        continue

    # Sort by columns ignore the order of rows
    df_ref_sorted = df_ref.sort_values(by=df_ref.columns.tolist()).reset_index(drop=True)
    df_gen_sorted = df_gen.sort_values(by=df_ref.columns.tolist()).reset_index(drop=True)

    # Compare
    are_equal = df_ref_sorted.equals(df_gen_sorted)
    print(f"\nComparing files:\nReference: {reference.name}\nGenerated: {generated.name}")
    print("Files are identical:", are_equal)

    if not are_equal:
        missing_rows = pd.concat([df_ref_sorted, df_gen_sorted, df_gen_sorted]).drop_duplicates(keep=False)
        extra_rows = pd.concat([df_gen_sorted, df_ref_sorted, df_ref_sorted]).drop_duplicates(keep=False)

        print("\nMissing rows:")
        print(missing_rows)

        print("\nExtra rows:")
        print(extra_rows)



Comparing files:
Reference: ref_detailed_1.5.0.csv
Generated: detailed_1.5.0.csv
Files are identical: False

Missing rows:
Empty DataFrame
Columns: [filename, namespace, ZaznamObjektu_type, GeometrieObjektu_type, OblastObjektuKI, name, type, code_base_fixed, code_base_use, code_suffix_fixed, code_suffix_use, dim_fixed, dim_use, gia_fixed, gia_use, KategorieObjektu_fixed, SkupinaObjektu_fixed, ObsahovaCast_fixed]
Index: []

Extra rows:
                                         filename namespace  \
30   PSPI_jina_technologicka_stavba_TI-plocha.xsd    poljit   
32                       PSPI_kolektor-plocha.xsd    polkol   
173                           kolektor-plocha.xsd    kolpol   
351            trakcni_trolejove_vedeni-linie.xsd    ttvlin   

                        ZaznamObjektu_type  \
30           ['RefN', 'Ins', 'Upd', 'Del']   
32           ['RefN', 'Ins', 'Upd', 'Del']   
173  ['RefV', 'RefN', 'Ins', 'Upd', 'Del']   
351  ['RefV', 'RefN', 'Ins', 'Upd', 'Del']   

             