# Parquet vs. Vortex vs. Lance performance

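Generate a wide random dataset (100k rows x 4k float64 columns + 50 string columns) and benchmark write time, full-table read time, and on-disk size for Parquet (written by pyarrow and by DuckDB), Lance, Vortex, and a native DuckDB database file.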

Requires `numpy`, `pandas`, `pyarrow`, `lancedb`, `vortex-data`, and `duckdb` (all installed via uv).


**Setup**
- Run `uv run poe lab` (or `uv venv && uv sync && uv run jupyter lab`).
- Artifacts are written under `data/` (git-ignored). The directory is deleted and recreated each time the configuration cell runs.


In [1]:
import gc
import shutil
import time
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import lancedb
import duckdb
import vortex
import vortex.io as vxio

pd.set_option("display.precision", 4)


In [2]:
# Benchmark configuration. Every artifact lives under DATA_DIR, which is wiped and
# recreated each time this cell runs, so results from a previous run do not survive.
DATA_DIR = Path("data")
shutil.rmtree(DATA_DIR, ignore_errors=True)
DATA_DIR.mkdir(exist_ok=True)

# Dataset shape: N_ROWS x N_COLS float64 values is 100_000 * 4_000 * 8 bytes = 3.2e9 bytes
# of raw float data, before the string columns are added.
N_ROWS = 100_000
N_COLS = 4_000  # float columns
STR_COLS = 50  # string columns appended after the float columns
DTYPE = np.float64  # dtype requested from the random generator for the float columns
REPEATS = 5  # write/read repetitions per format
SEED = 13  # seed for np.random.default_rng, so the dataset is reproducible

# NOTE(review): RUN_DUCKDB and DUCK_ROWS are not referenced by any cell visible here —
# confirm whether they are still needed.
RUN_DUCKDB = True
DUCK_ROWS = None  # use full table
DUCK_REPEATS = REPEATS  # repetitions for the 'DuckDB (file table)' config

# Output locations, one per benchmarked format.
PARQUET_PATH = DATA_DIR / "wide.parquet"
PARQUET_DUCK_PATH = DATA_DIR / "wide_duck.parquet"
LANCE_PATH = DATA_DIR / "lance_db"
VORTEX_PATH = DATA_DIR / "wide.vortex"
DUCK_PATH = DATA_DIR / "wide.duckdb"
# Table names used inside the LanceDB database and the DuckDB database file.
LANCE_TABLE = "bench"
DUCK_TABLE = "bench"

# Collect library versions so the results table can record what produced each number.
try:
    import importlib.metadata as importlib_metadata
except ImportError:
    # Fall back to the backport package when the stdlib module is unavailable.
    import importlib_metadata

# Prefer the module's own __version__; otherwise ask the installed distribution
# metadata, trying each known distribution name in turn.
vortex_version = getattr(vortex, '__version__', None)
if not vortex_version:
    for dist_name in ('vortex-data', 'vortex'):
        try:
            vortex_version = importlib_metadata.version(dist_name)
            break
        except importlib_metadata.PackageNotFoundError:
            continue
    else:
        # Neither distribution name is installed.
        vortex_version = 'unknown'

VERSIONS = dict(
    pyarrow=pa.__version__,
    lancedb=getattr(lancedb, '__version__', 'unknown'),
    duckdb=duckdb.__version__,
    vortex=vortex_version,
)

# Map each benchmark label to "<package> <version>" for the summary table.
FORMAT_VERSIONS = {
    label: f"{package} {VERSIONS[package]}"
    for label, package in (
        ('Parquet (pyarrow, zstd)', 'pyarrow'),
        ('Parquet (duckdb, zstd)', 'duckdb'),
        ('Lance (lancedb)', 'lancedb'),
        ('Vortex', 'vortex'),
        ('DuckDB (file table)', 'duckdb'),
    )
}

N_ROWS, N_COLS, STR_COLS, DTYPE


(100000, 4000, 50, numpy.float64)

In [3]:
# Build the benchmark table from one seeded generator. All float columns are drawn
# first, then the string columns, so the data is reproducible for a given SEED.
rng = np.random.default_rng(SEED)

float_names = []
float_columns = []
for col_idx in range(N_COLS):
    float_names.append(f"col_{col_idx:04d}")
    float_columns.append(pa.array(rng.standard_normal(N_ROWS, dtype=DTYPE)))

# Each string column holds values of the form 's<integer>' with the integer drawn
# from [0, 1_000_000).
str_names = []
str_columns = []
for col_idx in range(STR_COLS):
    str_names.append(f"str_{col_idx:04d}")
    ints = rng.integers(0, 1_000_000, size=N_ROWS, dtype=np.int32)
    strings = np.char.add('s', ints.astype(str))
    str_columns.append(pa.array(strings))

# Float columns come first in the final schema, followed by the string columns.
column_names = float_names + str_names
columns = float_columns + str_columns
table = pa.Table.from_arrays(columns, names=column_names)
print('table rows:', table.num_rows, 'cols:', table.num_columns)


table rows: 100000 cols: 4050


In [4]:
# Alias, not a copy: the 'DuckDB (file table)' config passes this as its 'table',
# so DuckDB is benchmarked on the same full Arrow table as the other formats.
duck_table = table


In [5]:
def drop_path(path: Path) -> None:
    """Remove ``path`` from disk if it exists; do nothing when it is absent.

    Real directories are removed recursively on a best-effort basis (errors
    during the tree removal are ignored). Files and other non-directory
    entries are unlinked. Symbolic links are always unlinked themselves and
    their targets are left untouched; this also covers dangling links and
    links that point at a directory.

    Args:
        path: File, directory, or symlink to delete.
    """
    # Check for a symlink before recursing: shutil.rmtree refuses to operate on a
    # symlink, and with ignore_errors=True that refusal would be silent, leaving
    # the link in place.
    if path.is_dir() and not path.is_symlink():
        shutil.rmtree(path, ignore_errors=True)
    # exists() follows links and is False for a dangling symlink, so test
    # is_symlink() as well.
    elif path.is_symlink() or path.exists():
        path.unlink()


def path_size_bytes(path: Path) -> int:
    """Return the on-disk size of ``path`` in bytes.

    A regular file reports its own size. A directory reports the summed size of
    every regular file found anywhere beneath it. Anything else, including a
    path that does not exist, reports 0.
    """
    if path.is_file():
        return path.stat().st_size
    if not path.is_dir():
        return 0

    total = 0
    for entry in path.rglob('*'):
        if entry.is_file():
            total += entry.stat().st_size
    return total


def run_benchmarks(table: pa.Table, configs, repeats: int = 3):
    """Time repeated writes and full reads for each format config.

    For every config, all write runs happen first, then the on-disk size of the
    last written artifact is measured, then all read runs happen. Progress is
    printed after each run.

    Args:
        table: Default table handed to each config's writer.
        configs: Iterable of dicts with the keys
            ``'name'`` (label), ``'path'`` (artifact location),
            ``'write'`` (callable ``(table, path)``) and
            ``'read'`` (callable ``(path)``). Optional keys:
            ``'table'`` (overrides ``table``), ``'repeats'`` (overrides
            ``repeats``) and ``'cleanup'`` (default True; when true the
            artifact at ``'path'`` is removed before every write run).
        repeats: Number of write runs and read runs per config.

    Returns:
        One dict per config with ``'format'``, ``'write_seconds'`` (list),
        ``'read_seconds'`` (list) and ``'size_mb'`` (bytes / 1024**2).
    """
    results = []
    for cfg in configs:
        cfg_repeats = cfg.get('repeats', repeats)
        # Resolve the input table once per format, outside the timed region.
        cfg_table = cfg.get('table', table)
        write_times = []
        read_times = []
        print(f"[format start] {cfg['name']}", flush=True)
        for run_idx in range(cfg_repeats):
            # Cleanup is deliberately excluded from the write timing.
            if cfg.get('cleanup', True):
                drop_path(cfg['path'])
            t0 = time.perf_counter()
            cfg['write'](cfg_table, cfg['path'])
            elapsed = time.perf_counter() - t0
            write_times.append(elapsed)
            print(f"[write] {cfg['name']} run {run_idx + 1}/{cfg_repeats}: {elapsed:.2f}s", flush=True)

        size_bytes = path_size_bytes(cfg['path'])

        for run_idx in range(cfg_repeats):
            gc.collect()
            t0 = time.perf_counter()
            loaded = cfg['read'](cfg['path'])
            elapsed = time.perf_counter() - t0
            # Release the result right after timing. Otherwise the previous
            # table stays referenced during the next timed read (and into the
            # next format's writes), adding memory pressure to later measurements.
            del loaded
            read_times.append(elapsed)
            print(f"[read ] {cfg['name']} run {run_idx + 1}/{cfg_repeats}: {elapsed:.2f}s", flush=True)

        results.append({
            'format': cfg['name'],
            'write_seconds': write_times,
            'read_seconds': read_times,
            'size_mb': size_bytes / (1024 * 1024),
        })
        print(f"[format end] {cfg['name']}", flush=True)
    return results


In [6]:
# Start from an empty LanceDB directory, then open the module-level connection
# that lance_write and lance_read both use.
drop_path(LANCE_PATH)
LANCE_DB = lancedb.connect(LANCE_PATH)

def reset_lance_table(db, table_name):
    """Make sure ``table_name`` no longer exists and return a usable connection.

    If ``db`` cannot list tables, or the table is not listed, ``db`` is returned
    unchanged. If the table is listed and ``db`` can drop tables, the table is
    dropped and ``db`` is returned. In every other case — no ``drop_table``
    support, or any exception along the way — the whole LANCE_PATH directory is
    removed and a fresh connection to it is returned.
    """
    try:
        can_list = hasattr(db, 'table_names')
        if not can_list or table_name not in db.table_names():
            return db
        if hasattr(db, 'drop_table'):
            db.drop_table(table_name)
            return db
        # The table exists but cannot be dropped through the API: wipe the directory.
        drop_path(LANCE_PATH)
        return lancedb.connect(LANCE_PATH)
    except Exception:
        # Best-effort recovery: any failure above falls back to a clean directory.
        drop_path(LANCE_PATH)
        return lancedb.connect(LANCE_PATH)


In [7]:
def lance_write(tbl, path=LANCE_PATH, table_name=LANCE_TABLE):
    """Write ``tbl`` as the LanceDB table ``table_name`` through the shared connection.

    Any existing table of that name is cleared first via ``reset_lance_table``,
    and the module-level ``LANCE_DB`` is rebound to whatever connection that
    returns. ``path`` is accepted so the signature matches the other writers,
    but it is not used.
    """
    global LANCE_DB
    connection = reset_lance_table(LANCE_DB, table_name)
    LANCE_DB = connection
    connection.create_table(table_name, tbl, mode="overwrite")


def lance_read(path=LANCE_PATH, table_name=LANCE_TABLE):
    """Open ``table_name`` on the shared ``LANCE_DB`` connection and return its ``to_arrow()`` result.

    ``path`` is accepted so the signature matches the other readers, but it is
    not used.
    """
    lance_table = LANCE_DB.open_table(table_name)
    return lance_table.to_arrow()


def vortex_write(tbl, path=VORTEX_PATH):
    """Remove whatever is at ``path``, then write ``tbl`` there with ``vortex.io.write``."""
    drop_path(path)
    destination = str(path)
    vxio.write(tbl, destination)


def vortex_read(path=VORTEX_PATH):
    """Open the Vortex file at ``path`` and return the result of ``to_arrow().read_all()``.

    Presumably this is the fully materialized Arrow table — confirm against the
    Vortex API.
    """
    vortex_file = vortex.open(str(path))
    arrow_reader = vortex_file.to_arrow()
    return arrow_reader.read_all()


def duck_write(tbl, path=DUCK_PATH, table_name=DUCK_TABLE):
    """Create a DuckDB database file at ``path`` holding ``tbl`` as ``table_name``.

    Anything already at ``path`` is removed first. ``tbl`` is registered on the
    connection as ``tmp_tbl`` and copied into a new table with
    ``CREATE TABLE ... AS SELECT``.

    Args:
        tbl: Table object to register with DuckDB.
        path: Location of the DuckDB database file.
        table_name: Name of the table to create. It is interpolated into the SQL
            as an identifier, so it must come from trusted code.
    """
    drop_path(path)
    # Use the connection as a context manager (as duck_read does) so it is
    # closed even if register/execute raises, instead of leaking an open
    # handle on the database file.
    with duckdb.connect(str(path)) as con:
        con.register('tmp_tbl', tbl)
        con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM tmp_tbl")


def duck_read(path=DUCK_PATH, table_name=DUCK_TABLE):
    """Select every row of ``table_name`` from the DuckDB file at ``path`` and return it via ``fetch_arrow_table()``.

    The connection is opened for this call only and closed on exit.
    """
    query = f"SELECT * FROM {table_name}"
    with duckdb.connect(str(path)) as con:
        cursor = con.execute(query)
        return cursor.fetch_arrow_table()


def parquet_duck_write(tbl, path=PARQUET_DUCK_PATH):
    """Write ``tbl`` to a ZSTD-compressed Parquet file at ``path`` using DuckDB's ``COPY``.

    Anything already at ``path`` is removed first. ``tbl`` is registered as
    ``tmp_tbl`` on a connection opened with no database argument.

    Args:
        tbl: Table object to register with DuckDB.
        path: Destination Parquet file. It is interpolated into the SQL inside
            single quotes, so it must not contain a single quote.
    """
    drop_path(path)
    # Use the connection as a context manager (as the DuckDB readers do) so it
    # is closed even when register/execute raises.
    with duckdb.connect() as con:
        con.register('tmp_tbl', tbl)
        con.execute(f"COPY (SELECT * FROM tmp_tbl) TO '{path}' WITH (FORMAT 'PARQUET', COMPRESSION 'ZSTD')")


def parquet_duck_read(path=PARQUET_DUCK_PATH):
    """Read the Parquet file at ``path`` through DuckDB's ``read_parquet`` and return it via ``fetch_arrow_table()``.

    The connection is opened with no database argument and closed on exit.
    """
    query = f"SELECT * FROM read_parquet('{path}')"
    with duckdb.connect() as con:
        cursor = con.execute(query)
        return cursor.fetch_arrow_table()


def parquet_pyarrow_write(tbl, path=PARQUET_PATH):
    """Write ``tbl`` to ``path`` as zstd-compressed Parquet with pyarrow."""
    return pq.write_table(tbl, path, compression='zstd')


def parquet_pyarrow_read(path=PARQUET_PATH):
    """Read the Parquet file at ``path`` with pyarrow."""
    return pq.read_table(path)


# One entry per benchmarked format, in the order they are run. Each entry gives a
# label, the artifact path, and the writer/reader callables; optional keys
# override the default table, repeat count, or pre-write cleanup.
format_configs = [
    dict(
        name='Parquet (pyarrow, zstd)',
        path=PARQUET_PATH,
        write=parquet_pyarrow_write,
        read=parquet_pyarrow_read,
    ),
    dict(
        name='Parquet (duckdb, zstd)',
        path=PARQUET_DUCK_PATH,
        write=parquet_duck_write,
        read=parquet_duck_read,
    ),
    dict(
        name='Lance (lancedb)',
        path=LANCE_PATH,
        write=lance_write,
        read=lance_read,
        # The Lance writer resets its own table, so the runner must not delete
        # the database directory between runs.
        cleanup=False,
    ),
    dict(
        name='Vortex',
        path=VORTEX_PATH,
        write=vortex_write,
        read=vortex_read,
    ),
    dict(
        name='DuckDB (file table)',
        path=DUCK_PATH,
        write=duck_write,
        read=duck_read,
        table=duck_table,
        repeats=DUCK_REPEATS,
    ),
]

print('Formats:', [cfg['name'] for cfg in format_configs])


Formats: ['Parquet (pyarrow, zstd)', 'Parquet (duckdb, zstd)', 'Lance (lancedb)', 'Vortex', 'DuckDB (file table)']


In [8]:
# Run every format, then reduce the per-run timings to a mean and a population
# standard deviation (np.std default) per format.
results = run_benchmarks(table, format_configs, repeats=REPEATS)

summary_columns = {
    'format': [],
    'write_avg_s': [],
    'write_std_s': [],
    'read_avg_s': [],
    'read_std_s': [],
    'size_mb': [],
    'version': [],
}
for res in results:
    write_runs = res['write_seconds']
    read_runs = res['read_seconds']
    summary_columns['format'].append(res['format'])
    summary_columns['write_avg_s'].append(np.mean(write_runs))
    summary_columns['write_std_s'].append(np.std(write_runs))
    summary_columns['read_avg_s'].append(np.mean(read_runs))
    summary_columns['read_std_s'].append(np.std(read_runs))
    summary_columns['size_mb'].append(res['size_mb'])
    # Formats without a recorded version get an empty string.
    summary_columns['version'].append(FORMAT_VERSIONS.get(res['format'], ''))

results_df = pd.DataFrame(summary_columns)
results_df


[format start] Parquet (pyarrow, zstd)
[write] Parquet (pyarrow, zstd) run 1/5: 13.82s
[write] Parquet (pyarrow, zstd) run 2/5: 13.79s
[write] Parquet (pyarrow, zstd) run 3/5: 13.77s
[write] Parquet (pyarrow, zstd) run 4/5: 13.75s
[write] Parquet (pyarrow, zstd) run 5/5: 13.71s
[read ] Parquet (pyarrow, zstd) run 1/5: 0.61s
[read ] Parquet (pyarrow, zstd) run 2/5: 0.66s
[read ] Parquet (pyarrow, zstd) run 3/5: 0.56s
[read ] Parquet (pyarrow, zstd) run 4/5: 0.59s
[read ] Parquet (pyarrow, zstd) run 5/5: 0.54s
[format end] Parquet (pyarrow, zstd)
[format start] Parquet (duckdb, zstd)
[write] Parquet (duckdb, zstd) run 1/5: 10.49s
[write] Parquet (duckdb, zstd) run 2/5: 10.08s
[write] Parquet (duckdb, zstd) run 3/5: 10.03s
[write] Parquet (duckdb, zstd) run 4/5: 10.11s
[write] Parquet (duckdb, zstd) run 5/5: 10.02s
[read ] Parquet (duckdb, zstd) run 1/5: 3.99s
[read ] Parquet (duckdb, zstd) run 2/5: 4.53s
[read ] Parquet (duckdb, zstd) run 3/5: 4.04s
[read ] Parquet (duckdb, zstd) run 4/5

[90m[[0m2025-12-03T17:54:12Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /Users/buntend/Documents/work/demo-compare-parquet-vortex-lance-perf/notebooks/data/lance_db/bench.lance, it will be created


[write] Lance (lancedb) run 1/5: 4.18s


[90m[[0m2025-12-03T17:54:16Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /Users/buntend/Documents/work/demo-compare-parquet-vortex-lance-perf/notebooks/data/lance_db/bench.lance, it will be created


[write] Lance (lancedb) run 2/5: 4.22s


[90m[[0m2025-12-03T17:54:20Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /Users/buntend/Documents/work/demo-compare-parquet-vortex-lance-perf/notebooks/data/lance_db/bench.lance, it will be created


[write] Lance (lancedb) run 3/5: 4.03s


[90m[[0m2025-12-03T17:54:24Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /Users/buntend/Documents/work/demo-compare-parquet-vortex-lance-perf/notebooks/data/lance_db/bench.lance, it will be created


[write] Lance (lancedb) run 4/5: 4.42s


[90m[[0m2025-12-03T17:54:28Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /Users/buntend/Documents/work/demo-compare-parquet-vortex-lance-perf/notebooks/data/lance_db/bench.lance, it will be created


[write] Lance (lancedb) run 5/5: 3.91s
[read ] Lance (lancedb) run 1/5: 5.04s
[read ] Lance (lancedb) run 2/5: 5.10s
[read ] Lance (lancedb) run 3/5: 4.49s
[read ] Lance (lancedb) run 4/5: 4.45s
[read ] Lance (lancedb) run 5/5: 5.11s
[format end] Lance (lancedb)
[format start] Vortex
[write] Vortex run 1/5: 7.29s
[write] Vortex run 2/5: 7.18s
[write] Vortex run 3/5: 7.70s
[write] Vortex run 4/5: 7.28s
[write] Vortex run 5/5: 7.25s
[read ] Vortex run 1/5: 0.87s
[read ] Vortex run 2/5: 0.57s
[read ] Vortex run 3/5: 0.59s
[read ] Vortex run 4/5: 0.58s
[read ] Vortex run 5/5: 0.61s
[format end] Vortex
[format start] DuckDB (file table)
[write] DuckDB (file table) run 1/5: 15.90s
[write] DuckDB (file table) run 2/5: 15.39s
[write] DuckDB (file table) run 3/5: 15.42s
[write] DuckDB (file table) run 4/5: 15.36s
[write] DuckDB (file table) run 5/5: 15.39s
[read ] DuckDB (file table) run 1/5: 6.68s
[read ] DuckDB (file table) run 2/5: 2.80s
[read ] DuckDB (file table) run 3/5: 2.59s
[read ] Duc

Unnamed: 0,format,write_avg_s,write_std_s,read_avg_s,read_std_s,size_mb,version
0,"Parquet (pyarrow, zstd)",13.7673,0.0372,0.5917,0.0409,3742.9114,pyarrow 22.0.0
1,"Parquet (duckdb, zstd)",10.146,0.1736,4.2278,0.2099,2943.4074,duckdb 1.4.2
2,Lance (lancedb),4.1525,0.1731,4.8372,0.3023,3123.8285,lancedb 0.25.3
3,Vortex,7.3386,0.1839,0.643,0.1141,2769.6237,vortex 0.56.0
4,DuckDB (file table),15.4919,0.2037,3.4556,1.6165,3026.0117,duckdb 1.4.2


In [9]:
# Flatten the per-format timing lists into one row per individual run, keeping
# all write runs of a format ahead of its read runs.
timings = []
for res in results:
    for kind in ('write', 'read'):
        timings.extend(
            {'format': res['format'], 'kind': kind, 'run': run_no, 'seconds': secs}
            for run_no, secs in enumerate(res[f'{kind}_seconds'])
        )

pd.DataFrame(timings)


Unnamed: 0,format,kind,run,seconds
0,"Parquet (pyarrow, zstd)",write,0,13.816
1,"Parquet (pyarrow, zstd)",write,1,13.7934
2,"Parquet (pyarrow, zstd)",write,2,13.7662
3,"Parquet (pyarrow, zstd)",write,3,13.7541
4,"Parquet (pyarrow, zstd)",write,4,13.7066
5,"Parquet (pyarrow, zstd)",read,0,0.6102
6,"Parquet (pyarrow, zstd)",read,1,0.6592
7,"Parquet (pyarrow, zstd)",read,2,0.5576
8,"Parquet (pyarrow, zstd)",read,3,0.5879
9,"Parquet (pyarrow, zstd)",read,4,0.5437
