In [8]:
# === Exact schema inspector (robust to pyarrow versions) ===
# Inputs for the two inspector calls at the bottom of this cell.
CSV_PATH = "Output/gridsearch_results1.csv"  # file handed to inspect_csv
PARQUET_PATH = "Output/gridsearch_segments1.parquet"  # file handed to inspect_parquet
SAMPLE_ROWS = 20  # passed as nrows: how many leading rows each inspector prints

import os, sys, subprocess, pandas as pd

def _hr():
    """Print a horizontal rule made of 88 dashes."""
    rule = "-" * 88
    print(rule)

def _ensure_engine():
    """Return the name of a usable parquet engine, installing one if needed.

    ``pyarrow`` is preferred over ``fastparquet``. If neither imports, each
    is pip-installed into the running interpreter (same order) and the first
    one that imports afterwards is returned.

    Returns:
        ``"pyarrow"`` or ``"fastparquet"``.

    Raises:
        Exception: whatever the final ``fastparquet`` install or import
            raised (e.g. ``subprocess.CalledProcessError``) when no engine
            could be made available.
    """
    import importlib

    candidates = ("pyarrow", "fastparquet")

    # Pass 1: use whatever is already importable.
    for name in candidates:
        try:
            importlib.import_module(name)
        except Exception:
            # Deliberately broad: a broken binary install can raise more
            # than ImportError, and we just want to move on to the next one.
            continue
        return name

    # Pass 2: nothing importable, so try installing each candidate in turn.
    last = len(candidates) - 1
    for pos, name in enumerate(candidates):
        print(f"Installing {name} ...")
        try:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "--quiet", name]
            )
            # The import system caches directory listings; without this a
            # package installed after interpreter start may not be found.
            importlib.invalidate_caches()
            importlib.import_module(name)
        except Exception:
            if pos == last:
                # No further fallback: surface the failure to the caller.
                raise
            continue
        return name

def inspect_csv(path, nrows=20):
    """Print the columns, row count, a sample and inferred dtypes of a CSV.

    Args:
        path: Location of the CSV file.
        nrows: Number of leading rows to read and print as the sample.

    Returns:
        None. All results go to stdout. A missing file or a file with no
        parseable header is reported and the function returns early.
    """
    _hr()
    print(f"[CSV] {path}")
    _hr()
    if not os.path.exists(path):
        print("File not found.")
        return

    # Header-only read: column names without parsing any data rows.
    try:
        cols = pd.read_csv(path, nrows=0).columns.tolist()
    except pd.errors.EmptyDataError:
        # An existing but empty file has no header; report it, don't crash.
        print("File is empty (no header row to parse).")
        return
    print(f"Column count: {len(cols)}")
    for i, c in enumerate(cols):
        print(f"  {i:>3}: {c}")

    # Row count: parse only the first column, in chunks, to bound memory.
    total = sum(
        len(chunk)
        for chunk in pd.read_csv(path, usecols=[cols[0]], chunksize=200_000)
    )
    print(f"Row count: {total:,}")

    dfh = pd.read_csv(path, nrows=nrows)
    _hr()
    print(f"Sample first {len(dfh)} rows:")
    print(dfh.to_string(index=False, max_colwidth=120))

    # Dtypes are inferred from the sample rows only, not the whole file.
    _hr()
    print("Dtypes (pandas inferred):")
    for i, (c, t) in enumerate(dfh.dtypes.items()):
        print(f"  {i:>3}: {c:<40} {t}")

def inspect_parquet(path, nrows=20):
    """Print the schema, row count, a sample and dtypes of a parquet file.

    The engine is chosen by ``_ensure_engine``. With pyarrow the Arrow type
    of every column is printed; with fastparquet only column names are.

    Args:
        path: Location of the parquet file.
        nrows: Number of leading rows to print as the sample.

    Returns:
        None. All results go to stdout. A missing file is reported and the
        function returns early.
    """
    _hr()
    print(f"[PARQUET] {path}")
    _hr()
    if not os.path.exists(path):
        print("File not found.")
        return
    engine = _ensure_engine()
    print(f"Engine: {engine}")

    if engine == "pyarrow":
        import pyarrow.parquet as pq

        pf = pq.ParquetFile(path)
        schema = pf.schema_arrow
        names = schema.names
        print(f"Columns: {len(names)}")
        for i, (name, field) in enumerate(zip(names, schema)):
            print(f"  {i:>3}: {name:<40} {field.type}")
        # Row counts come from file metadata. This stays best-effort, but a
        # failure is now reported instead of silently dropping the line.
        try:
            meta = pf.metadata
            nrows_total = sum(
                meta.row_group(i).num_rows for i in range(meta.num_row_groups)
            )
            print(f"Total rows: {nrows_total:,} | Row groups: {meta.num_row_groups}")
        except Exception as exc:
            print(f"Total rows: unavailable ({type(exc).__name__}: {exc})")
        # NOTE: loads the whole file into memory before taking the head.
        dfh = pd.read_parquet(path, engine="pyarrow")
    else:
        import fastparquet as fp

        pf = fp.ParquetFile(path)
        names = pf.columns
        print(f"Columns: {len(names)}")
        for i, name in enumerate(names):
            print(f"  {i:>3}: {name}")
        try:
            print(f"Total rows: {pf.count():,} | Row groups: {len(pf.row_groups)}")
        except Exception as exc:
            print(f"Total rows: unavailable ({type(exc).__name__}: {exc})")
        # NOTE: also a full load; the sample is cut from it below.
        dfh = pf.to_pandas()

    head = dfh.head(nrows)
    _hr()
    print(f"Sample first {len(head)} rows:")
    print(head.to_string(index=False, max_colwidth=120))
    _hr()
    print("Dtypes (pandas inferred on sample):")
    for i, (c, t) in enumerate(head.dtypes.items()):
        print(f"  {i:>3}: {c:<40} {t}")

# Run both inspectors on the configured files (CSV first, then parquet).
inspect_csv(CSV_PATH, nrows=SAMPLE_ROWS)
inspect_parquet(PARQUET_PATH, nrows=SAMPLE_ROWS)


----------------------------------------------------------------------------------------
[CSV] Output/gridsearch_results1.csv
----------------------------------------------------------------------------------------
Column count: 26
    0: security
    1: config
    2: rank
    3: score
    4: dt
    5: n_regimes
    6: dim_latent
    7: single_subspace
    8: train_window
    9: overlap_window
   10: avg_inferred_regime_length
   11: elbo_start (min all runs)
   12: elbo_end (max all runs)
   13: elbo_delta (max all runs)
   14: cpll (max all runs)
   15: max cpll (proxy bound, paired)
   16: mode_usage
   17: cagr_rel
   18: cagr_strat
   19: cagr_bench
   20: cagr_rel_cusum
   21: cagr_strat_cusum
   22: cagr_bench_cusum
   23: cagr_rel_ex_ante
   24: cagr_strat_ex_ante
   25: cagr_bench_ex_ante
Row count: 852
----------------------------------------------------------------------------------------
Sample first 20 rows:
security config  rank     score    dt  n_regimes  dim_latent  sin