# Parquet vs. Vortex vs. Lance performance

Generate a wide random dataset (~100k rows x 3k columns of `float32`, roughly 1.2 GB in memory) and benchmark write time, read time, and on-disk size for each format. Decrease `N_ROWS`/`N_COLS` if you run into memory pressure.

Requires `numpy`, `pandas`, `pyarrow`, `lancedb`, and `vortex-data` (installed via uv).


**Setup**
- Run `uv run poe lab` (or `uv venv && uv sync && uv run jupyter lab`).
- Artifacts are written under `data/` (git-ignored).


In [None]:
import gc
import shutil
import time
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import lancedb
import vortex

# Render floats with 4 decimal places when pandas displays DataFrames.
pd.set_option("display.precision", 4)


In [None]:
# Directory that holds every benchmark artifact; created on first run.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Dataset shape and element type: N_ROWS x N_COLS values of DTYPE
# (100_000 * 3_000 * 4 bytes is about 1.2 GB of raw float32 data).
N_ROWS = 100_000
N_COLS = 3_000
DTYPE = np.float32
REPEATS = 3  # number of timed write runs and timed read runs per format
SEED = 13  # seed for the NumPy random generator that fills the columns

# Output location for each format under DATA_DIR.
PARQUET_PATH = DATA_DIR / "wide.parquet"
LANCE_PATH = DATA_DIR / "lance_db"
VORTEX_PATH = DATA_DIR / "wide.vortex"

# Echo the dataset settings as the cell output.
N_ROWS, N_COLS, DTYPE


In [None]:
# Seeded generator so every run of the notebook benchmarks the same data.
rng = np.random.default_rng(SEED)

# Zero-padded names (col_0000, col_0001, ...), one per column.
column_names = [f"col_{i:04d}" for i in range(N_COLS)]

# Draw one standard-normal vector of length N_ROWS per column name, in order.
columns = [
    pa.array(rng.standard_normal(N_ROWS, dtype=DTYPE))
    for _ in column_names
]

table = pa.Table.from_arrays(columns, names=column_names)
table


In [None]:
def drop_path(path: Path) -> None:
    """Delete a file, symlink, or directory tree if it exists.

    Symlinks are removed themselves and never followed, so a dangling link or
    a link pointing at a directory is cleaned up as well (the link target is
    left untouched). Removal of a real directory tree stays best-effort:
    errors raised while deleting its contents are ignored.

    Args:
        path: Location to remove. A path that does not exist is a no-op.
    """
    # Test for symlinks first: is_dir()/exists() follow links, which hides
    # dangling links and would hand directory links to rmtree, which refuses
    # to operate on a symlink.
    if path.is_symlink() or path.is_file():
        path.unlink(missing_ok=True)
    elif path.is_dir():
        shutil.rmtree(path, ignore_errors=True)
    elif path.exists():
        # Anything else that exists (e.g. a FIFO or socket).
        path.unlink()


def path_size_bytes(path: Path) -> int:
    """Return the size of ``path`` in bytes, as reported by ``stat().st_size``.

    A regular file yields its own size, a directory yields the sum over every
    regular file found recursively beneath it, and anything else (including a
    missing path) yields 0.
    """
    if path.is_dir():
        total = 0
        for entry in path.rglob("*"):
            if entry.is_file():
                total += entry.stat().st_size
        return total
    return path.stat().st_size if path.is_file() else 0


def make_vortex_writer():
    """Return a callable ``writer(data, path)`` that writes a Vortex file.

    Write entry points are probed in order and the first callable one wins:

    1. ``vortex.write_dataset``
    2. ``vortex.VortexFile.write``
    3. ``vortex.io.write`` (wrapped so the path is passed as ``str``)

    Returns:
        A callable taking the data to write and the destination path.

    Raises:
        RuntimeError: If none of the probed entry points is available.
    """
    import importlib

    writer = getattr(vortex, "write_dataset", None)
    if callable(writer):
        return writer
    vf_write = getattr(getattr(vortex, "VortexFile", None), "write", None)
    if callable(vf_write):
        return vf_write

    # The submodule may not be bound as an attribute of the package until it
    # is imported, so import it explicitly rather than relying on getattr.
    try:
        io_module = importlib.import_module("vortex.io")
    except ImportError:
        io_module = None
    io_write = getattr(io_module, "write", None)
    if callable(io_write):
        # NOTE(review): vortex.io.write is documented as taking a string path;
        # confirm against the installed vortex-data version.
        return lambda data, path: io_write(data, str(path))

    raise RuntimeError("vortex-data does not expose a write API; install a version with vortex.write_dataset or vortex.io.write, or swap in the correct writer here.")


def run_benchmarks(table: pa.Table, configs, repeats: int = 3):
    """Time writing and reading ``table`` for every format in ``configs``.

    Args:
        table: Data handed to each config's write callable.
        configs: Iterable of dicts with the keys ``'name'`` (label),
            ``'path'`` (output location), ``'write'`` (called as
            ``write(table, path)``) and ``'read'`` (called as ``read(path)``).
        repeats: Number of timed write runs and timed read runs per format.

    Returns:
        One dict per config with the keys ``'format'``, ``'write_seconds'``
        and ``'read_seconds'`` (lists of wall-clock seconds, one per run) and
        ``'size_mb'`` (artifact size in units of 1024 * 1024 bytes, measured
        after the last write).
    """
    results = []
    for cfg in configs:
        path = cfg['path']
        write = cfg['write']
        read = cfg['read']

        write_times = []
        for _ in range(repeats):
            # Remove the previous artifact outside the timed region.
            drop_path(path)
            # Collect before timing, matching what the read phase does.
            gc.collect()
            t0 = time.perf_counter()
            write(table, path)
            write_times.append(time.perf_counter() - t0)

        size_bytes = path_size_bytes(path)

        read_times = []
        for _ in range(repeats):
            gc.collect()
            t0 = time.perf_counter()
            loaded = read(path)
            read_times.append(time.perf_counter() - t0)
            # Drop the loaded data right away so the next run does not have to
            # hold two full copies in memory at once.
            del loaded

        results.append(
            {
                'format': cfg['name'],
                'write_seconds': write_times,
                'read_seconds': read_times,
                'size_mb': size_bytes / (1024 * 1024),
            }
        )
    return results


In [None]:
# Resolve the Vortex write callable once; make_vortex_writer() raises
# RuntimeError here if the installed package exposes no known write API.
vortex_write = make_vortex_writer()

def lance_write(tbl, path=LANCE_PATH):
    """Write ``tbl`` as the ``bench`` table of a LanceDB database at ``path``.

    Whatever currently sits at ``path`` is removed first.
    """
    drop_path(path)
    lancedb.connect(path).create_table("bench", tbl, mode="overwrite")


def lance_read(path=LANCE_PATH):
    """Open the ``bench`` table of the LanceDB database at ``path`` and
    return the result of its ``to_arrow()`` call."""
    bench = lancedb.connect(path).open_table("bench")
    return bench.to_arrow()


# One entry per on-disk format. Each entry supplies a display name, the output
# path, a write callable taking (table, path) and a read callable taking (path).
format_configs = [
    # Parquet through pyarrow, zstd-compressed.
    dict(
        name='Parquet (pyarrow, zstd)',
        path=PARQUET_PATH,
        write=lambda tbl, path=PARQUET_PATH: pq.write_table(tbl, path, compression='zstd'),
        read=lambda path=PARQUET_PATH: pq.read_table(path),
    ),
    # Lance through the lancedb helpers defined in this notebook.
    dict(
        name='Lance (lancedb)',
        path=LANCE_PATH,
        write=lance_write,
        read=lance_read,
    ),
    # Vortex through whichever writer was resolved into vortex_write.
    dict(
        name='Vortex',
        path=VORTEX_PATH,
        write=lambda tbl, path=VORTEX_PATH: vortex_write(tbl, path),
        read=lambda path=VORTEX_PATH: vortex.dataset.VortexDataset.from_path(path).to_table(),
    ),
]

format_configs


In [None]:
results = run_benchmarks(table, format_configs, repeats=REPEATS)

# One summary row per format: mean and standard deviation (NumPy's default,
# i.e. population std) of the write and read timings, plus the artifact size.
results_df = pd.DataFrame(
    [
        {
            'format': r['format'],
            'write_avg_s': np.mean(r['write_seconds']),
            'write_std_s': np.std(r['write_seconds']),
            'read_avg_s': np.mean(r['read_seconds']),
            'read_std_s': np.std(r['read_seconds']),
            'size_mb': r['size_mb'],
        }
        for r in results
    ],
    columns=['format', 'write_avg_s', 'write_std_s', 'read_avg_s', 'read_std_s', 'size_mb'],
)
results_df


In [None]:
# Long-form table of every individual run: for each format, all write runs
# first and then all read runs, each tagged with its run index.
timings = [
    {'format': r['format'], 'kind': kind, 'run': idx, 'seconds': t}
    for r in results
    for kind, key in (('write', 'write_seconds'), ('read', 'read_seconds'))
    for idx, t in enumerate(r[key])
]

pd.DataFrame(timings)
