# Parquet vs. Vortex vs. Lance performance

Generate a wide random dataset (100k rows × 3k `float32` columns, roughly 1.2 GB in memory) and benchmark write time, read time, and on-disk size for each format. Decrease `N_ROWS`/`N_COLS` if you run into memory pressure.

Requires `numpy`, `pandas`, `pyarrow`, `lancedb`, and `vortex-data` (installed via uv).


**Setup**
- Run `uv run poe lab` (or `uv venv && uv sync && uv run jupyter lab`).
- Artifacts are written under `data/` (git-ignored).


In [1]:
import gc
import shutil
import time
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import lancedb
import vortex
import vortex.io as vxio

# Render floats with four decimals so sub-second timings stay readable in the result tables.
pd.options.display.precision = 4


In [2]:
# All benchmark artifacts are written below ./data, relative to the working directory.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Shape and element type of the synthetic table.
N_ROWS, N_COLS = 100_000, 3_000
DTYPE = np.float32

# Timed repetitions per format, and the seed that makes the random data reproducible.
REPEATS, SEED = 3, 13

# One output location per format.
PARQUET_PATH = DATA_DIR.joinpath("wide.parquet")
LANCE_PATH = DATA_DIR.joinpath("lance_db")
VORTEX_PATH = DATA_DIR.joinpath("wide.vortex")

# Echo the dataset shape and dtype as the cell output.
N_ROWS, N_COLS, DTYPE


(100000, 3000, numpy.float32)

In [3]:
# Seeded generator: every run of the notebook benchmarks exactly the same values.
rng = np.random.default_rng(SEED)

# Zero-padded names (col_0000, col_0001, ...) keep the columns in lexical order.
column_names = list(map("col_{:04d}".format, range(N_COLS)))

# Draw one column at a time so only a single NumPy buffer is live while converting to Arrow.
columns = [
    pa.array(rng.standard_normal(N_ROWS, dtype=DTYPE))
    for _ in range(N_COLS)
]

table = pa.Table.from_arrays(columns, names=column_names)


In [4]:
def drop_path(path: Path) -> None:
    """Delete a file, symlink, or directory tree if it exists.

    A missing path is a no-op. Symbolic links are removed themselves and
    their target is left untouched. Directory removal is best-effort:
    errors raised while deleting a tree are ignored.

    Args:
        path: Location to remove; a ``str`` is accepted as well as a ``Path``.
    """
    path = Path(path)
    if path.is_symlink():
        # Must be checked first: shutil.rmtree refuses symlinks (an error that
        # ignore_errors=True would hide), and a dangling link fails exists().
        path.unlink()
    elif path.is_dir():
        shutil.rmtree(path, ignore_errors=True)
    elif path.exists():
        path.unlink()


def path_size_bytes(path: Path) -> int:
    """Return the on-disk size of ``path`` in bytes.

    For a directory this is the sum of the sizes of all regular files found
    recursively below it; for a file it is the file's own size. Anything
    else (including a missing path) counts as 0.
    """
    if path.is_dir():
        total = 0
        for entry in path.rglob("*"):
            if entry.is_file():
                total += entry.stat().st_size
        return total
    return path.stat().st_size if path.is_file() else 0


def run_benchmarks(table: pa.Table, configs, repeats: int = 3):
    """Time repeated writes and reads of ``table`` for every format config.

    Args:
        table: Arrow table passed unchanged to each config's ``write`` callable.
        configs: Iterable of dicts with the keys ``'name'`` (label used in the
            results), ``'path'`` (output location), ``'write'`` (called as
            ``write(table, path)``) and ``'read'`` (called as ``read(path)``).
        repeats: Number of timed writes, and of timed reads, per format.

    Returns:
        One dict per config, in input order, with the keys ``'format'``,
        ``'write_seconds'`` and ``'read_seconds'`` (lists of wall-clock
        seconds, one entry per repeat) and ``'size_mb'`` (size of the artifact
        left by the last write, in units of 1024 * 1024 bytes).
    """
    results = []
    for cfg in configs:
        write_times = []
        read_times = []
        for _ in range(repeats):
            # Removing the previous artifact is deliberately outside the timed region.
            drop_path(cfg['path'])
            # Collect beforehand (as for reads) so leftover garbage is not
            # reclaimed in the middle of the measurement.
            gc.collect()
            t0 = time.perf_counter()
            cfg['write'](table, cfg['path'])
            write_times.append(time.perf_counter() - t0)

        # Measured once, on the artifact produced by the final write.
        size_bytes = path_size_bytes(cfg['path'])

        for _ in range(repeats):
            gc.collect()
            t0 = time.perf_counter()
            loaded = cfg['read'](cfg['path'])
            elapsed = time.perf_counter() - t0
            # Drop the result only after the clock has stopped. Rebinding a
            # variable that still held the previous result would free it inside
            # the next timed window and keep two full tables alive at once.
            del loaded
            read_times.append(elapsed)

        results.append(
            {
                'format': cfg['name'],
                'write_seconds': write_times,
                'read_seconds': read_times,
                'size_mb': size_bytes / (1024 * 1024),
            }
        )
    return results


In [5]:
def lance_write(tbl, path=LANCE_PATH):
    """Write ``tbl`` as the ``bench`` table of a LanceDB database at ``path``."""
    # Clear whatever is at the location first so the database is created from scratch.
    drop_path(path)
    lancedb.connect(path).create_table("bench", tbl, mode="overwrite")


def lance_read(path=LANCE_PATH):
    """Open the ``bench`` table of the LanceDB database at ``path`` and return it via ``to_arrow()``."""
    connection = lancedb.connect(path)
    bench_table = connection.open_table("bench")
    return bench_table.to_arrow()


def vortex_write(tbl, path=VORTEX_PATH):
    """Write ``tbl`` to a Vortex file at ``path``, replacing any existing one."""
    drop_path(path)
    # The writer is handed the destination as a plain string rather than a Path.
    destination = str(path)
    vxio.write(tbl, destination)


def vortex_read(path=VORTEX_PATH):
    """Read the Vortex file at ``path`` and return its full contents."""
    vortex_file = vortex.open(str(path))
    # to_arrow() hands back a RecordBatchReader; read_all() drains it into one Table.
    batch_reader = vortex_file.to_arrow()
    return batch_reader.read_all()


# One entry per on-disk format. Each entry supplies the label shown in the
# results, the artifact location, and the write/read callables that
# run_benchmarks invokes as write(table, path) and read(path).
format_configs = [
    # Parquet through pyarrow, with zstd compression on write.
    {
        "name": "Parquet (pyarrow, zstd)",
        "path": PARQUET_PATH,
        "write": lambda tbl, path=PARQUET_PATH: pq.write_table(tbl, path, compression="zstd"),
        "read": lambda path=PARQUET_PATH: pq.read_table(path),
    },
    # Lance through the lancedb client.
    {
        "name": "Lance (lancedb)",
        "path": LANCE_PATH,
        "write": lance_write,
        "read": lance_read,
    },
    # Vortex through its Python bindings.
    {
        "name": "Vortex",
        "path": VORTEX_PATH,
        "write": vortex_write,
        "read": vortex_read,
    },
]

# Display the assembled configuration as the cell output.
format_configs


[{'name': 'Parquet (pyarrow, zstd)',
  'path': PosixPath('data/wide.parquet'),
  'write': <function __main__.<lambda>(tbl, path=PosixPath('data/wide.parquet'))>,
  'read': <function __main__.<lambda>(path=PosixPath('data/wide.parquet'))>},
 {'name': 'Lance (lancedb)',
  'path': PosixPath('data/lance_db'),
  'write': <function __main__.lance_write(tbl, path=PosixPath('data/lance_db'))>,
  'read': <function __main__.lance_read(path=PosixPath('data/lance_db'))>},
 {'name': 'Vortex',
  'path': PosixPath('data/wide.vortex'),
  'write': <function __main__.vortex_write(tbl, path=PosixPath('data/wide.vortex'))>,
  'read': <function __main__.vortex_read(path=PosixPath('data/wide.vortex'))>}]

In [6]:
# Run every format, then collapse the per-run timings into one summary row per format.
results = run_benchmarks(table, format_configs, repeats=REPEATS)

write_runs = [r['write_seconds'] for r in results]
read_runs = [r['read_seconds'] for r in results]

# np.std is used with its default settings, i.e. the population standard
# deviation (ddof=0) over the REPEATS samples.
results_df = pd.DataFrame(
    {
        'format': [r['format'] for r in results],
        'write_avg_s': list(map(np.mean, write_runs)),
        'write_std_s': list(map(np.std, write_runs)),
        'read_avg_s': list(map(np.mean, read_runs)),
        'read_std_s': list(map(np.std, read_runs)),
        'size_mb': [r['size_mb'] for r in results],
    }
)
results_df


[2025-12-02T15:34:36Z WARN  lance::dataset::write::insert] No existing dataset at /Users/buntend/Documents/work/demo-compare-parquet-vortex-lance-perf/notebooks/data/lance_db/bench.lance, it will be created
[2025-12-02T15:34:38Z WARN  lance::dataset::write::insert] No existing dataset at /Users/buntend/Documents/work/demo-compare-parquet-vortex-lance-perf/notebooks/data/lance_db/bench.lance, it will be created
[2025-12-02T15:34:40Z WARN  lance::dataset::write::insert] No existing dataset at /Users/buntend/Documents/work/demo-compare-parquet-vortex-lance-perf/notebooks/data/lance_db/bench.lance, it will be created


Unnamed: 0,format,write_avg_s,write_std_s,read_avg_s,read_std_s,size_mb
0,"Parquet (pyarrow, zstd)",8.662,0.0354,0.413,0.2008,1666.391
1,Lance (lancedb),1.7387,0.1613,2.7244,0.0399,1145.195
2,Vortex,4.3172,0.1027,0.3798,0.0072,1015.947


In [7]:
# Flatten the results into one row per (format, kind, run): all write runs of a
# format first, then its read runs.
timings = []
for r in results:
    for kind, key in (('write', 'write_seconds'), ('read', 'read_seconds')):
        timings.extend(
            {'format': r['format'], 'kind': kind, 'run': run, 'seconds': secs}
            for run, secs in enumerate(r[key])
        )

pd.DataFrame(timings)


Unnamed: 0,format,kind,run,seconds
0,"Parquet (pyarrow, zstd)",write,0,8.7108
1,"Parquet (pyarrow, zstd)",write,1,8.6469
2,"Parquet (pyarrow, zstd)",write,2,8.6282
3,"Parquet (pyarrow, zstd)",read,0,0.6965
4,"Parquet (pyarrow, zstd)",read,1,0.2565
5,"Parquet (pyarrow, zstd)",read,2,0.2861
6,Lance (lancedb),write,0,1.9648
7,Lance (lancedb),write,1,1.6517
8,Lance (lancedb),write,2,1.5995
9,Lance (lancedb),read,0,2.7804
