# Why Parquet?

This notebook explores the benefits and drawbacks of using the [Parquet](https://parquet.apache.org/docs/) file format relative to other formats such as CSV, SQLite, and AnnData (H5AD or Zarr).

In [1]:
import os
import pathlib
import shutil
from typing import Literal

import anndata as ad
import duckdb
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from IPython.display import Image
from utilities import get_system_info, timer

In [2]:
# Show the system information for the machine executing this notebook so the
# timings below can be read in context.
# The return value is bound to `_`, presumably only to keep the notebook from
# echoing it in addition to the printed report — confirm against `utilities`.
_ = get_system_info(show_output=True)


System Information:
Operating System: Linux
Machine Type: x86_64
Processor: x86_64
CPU Cores (Logical): 16
CPU Cores (Physical): 8
Total RAM (GB): 62.68
Python Version: 3.12.5


In [3]:
# target file or table names
image_dir = "images"
csv_name = "example.csv.gz"
parquet_name = "example.parquet"
sqlite_name = "example.sqlite"
sqlite_tbl_name = "tbl_example"
anndata_h5_name = "adata.h5ad"
anndata_zarr_name = "adata.zarr"

# chart images exported by the plotting cells
file_write_time_image = f"{image_dir}/parquet-comparisons-file-write-time.png"
file_storage_size_image = f"{image_dir}/parquet-comparisons-file-storage-size.png"
file_read_time_all_image = (
    f"{image_dir}/parquet-comparisons-file-read-time-all-columns.png"
)
file_read_time_one_image = (
    f"{image_dir}/parquet-comparisons-file-read-time-one-column.png"
)

# make sure the chart output directory exists up front; exporting a figure
# into a missing directory would otherwise only fail after the (slow)
# benchmarks have already run
pathlib.Path(image_dir).mkdir(parents=True, exist_ok=True)

# start from a clean slate: remove artifacts left behind by earlier runs.
# Each target is handled as either a plain file or a directory tree (the zarr
# store is a directory), whichever is actually found on disk.
for stale_path in map(
    pathlib.Path,
    (csv_name, parquet_name, sqlite_name, anndata_h5_name, anndata_zarr_name),
):
    if stale_path.is_dir():
        shutil.rmtree(stale_path)
    else:
        stale_path.unlink(missing_ok=True)

In [4]:
def write_anndata(
    df: pd.DataFrame,
    write_to: Literal["h5ad", "zarr"],
    dest_path: str,
) -> str:
    """
    Serialize a DataFrame to AnnData (h5ad or zarr).

    Numeric columns are stored in ``X`` (observations × variables). All
    remaining columns are stored in ``.obs``. Variable (feature) names are taken
    from the numeric column labels, and observation names from the DataFrame
    index; both are converted to strings.

    Args:
        df:
            Input table with rows as observations and columns as features.
        write_to:
            Output format. Either ``"h5ad"`` or ``"zarr"``.
        dest_path:
            Destination file (``.h5ad``) or directory (zarr store)
            to write to. Parent directories are created if missing.

    Returns:
        The path written to as a string.

    Raises:
        ValueError:
            If ``write_to`` is not ``"h5ad"`` or ``"zarr"``, or if ``df`` has
            no numeric columns. Both checks run before anything is created on
            disk.
    """
    # Validate the arguments first so a bad call fails before any directory
    # is created or any AnnData object is built.
    if write_to not in ("h5ad", "zarr"):
        raise ValueError('write_to must be "h5ad" or "zarr".')

    numeric = df.select_dtypes(include=["number"])
    if numeric.shape[1] == 0:
        raise ValueError("No numeric columns found to place in AnnData.X.")

    dest = pathlib.Path(dest_path)
    dest.parent.mkdir(parents=True, exist_ok=True)

    # Give the non-numeric metadata the same string index that names the
    # observations, so the rows of ``obs`` stay aligned with the rows of ``X``.
    obs = df.select_dtypes(exclude=["number"]).copy()
    obs.index = df.index.astype(str)

    # Passing obs/var to the constructor sets the observation and variable
    # names in one step, instead of assigning them on the object afterwards.
    adata = ad.AnnData(
        X=numeric.to_numpy(),
        obs=obs,
        var=pd.DataFrame(index=numeric.columns.astype(str)),
    )

    if write_to == "h5ad":
        adata.write_h5ad(str(dest))
    else:
        # For zarr, the destination is a directory-like store
        adata.write_zarr(str(dest))

    return str(dest)


def read_anndata(
    path: str,
    read_from: Literal["h5ad", "zarr"],
) -> pd.DataFrame:
    """
    Read an AnnData store (h5ad or zarr) back into one pandas DataFrame.

    The ``.obs`` metadata is left-joined with ``X`` (as produced by
    ``adata.to_df()``), so the metadata columns come first and the variable
    columns follow.

    Args:
        path:
            Str path to the AnnData object: a file path for h5ad, a
            directory-like store for zarr.
        read_from:
            Input format. Either ``"h5ad"`` or ``"zarr"``.

    Returns:
        A pandas DataFrame holding the ``.obs`` columns followed by the
        columns of ``X``, with the index reset to 0..n-1.

    Raises:
        ValueError:
            If ``read_from`` is neither ``"h5ad"`` nor ``"zarr"``.
    """
    # Guard clause: reject anything other than the two supported formats.
    if read_from != "h5ad" and read_from != "zarr":
        raise ValueError('read_from must be "h5ad" or "zarr".')

    loaded = ad.read_h5ad(path) if read_from == "h5ad" else ad.read_zarr(path)

    metadata = loaded.obs
    measurements = loaded.to_df()
    combined = metadata.join(measurements, how="left")
    return combined.reset_index(drop=True)

In [5]:
# avoid a "cold start" for tested packages by using them before benchmarks
df = pd.DataFrame(np.random.rand(2, 2), columns=[f"col_{num}" for num in range(0, 2)])
# export and read using every method that is benchmarked later on
df.to_csv(path_or_buf=csv_name, compression="gzip")
pd.read_csv(filepath_or_buffer=csv_name, compression="gzip")
df.to_sql(name=sqlite_tbl_name, con=f"sqlite:///{sqlite_name}")
pd.read_sql(sql=f"SELECT * FROM {sqlite_tbl_name}", con=f"sqlite:///{sqlite_name}")
# the anndata writers/readers are benchmarked too, so warm them up as well
for warm_format, warm_target in (
    ("h5ad", anndata_h5_name),
    ("zarr", anndata_zarr_name),
):
    write_anndata(df=df, write_to=warm_format, dest_path=warm_target)
    read_anndata(path=warm_target, read_from=warm_format)
    # remove the warm-up artifact right away so it cannot leak into the
    # benchmarks (the zarr store is a directory, the h5ad output a file)
    warm_path = pathlib.Path(warm_target)
    if warm_path.is_dir():
        shutil.rmtree(warm_path)
    else:
        warm_path.unlink(missing_ok=True)
df.to_parquet(path=parquet_name, compression="gzip")
# keep the parquet read as the last expression so the cell still displays it
pd.read_parquet(path=parquet_name)

Unnamed: 0,col_0,col_1
0,0.17554,0.987834
1,0.07825,0.681681


In [6]:
# clear out the csv, parquet and sqlite files produced by earlier cells
for leftover in (csv_name, parquet_name, sqlite_name):
    leftover_path = pathlib.Path(leftover)
    leftover_path.unlink(missing_ok=True)

In [7]:
# starting rowcount and col count (both are doubled before the first pass)
nrows = 320
ncols = 160

# result list for storing data: one dict per benchmark repetition
results = []

# every artifact a benchmark pass can leave on disk
benchmark_artifacts = [
    csv_name,
    parquet_name,
    sqlite_name,
    anndata_h5_name,
    anndata_zarr_name,
]

# NOTE(review): `timer` comes from `utilities`; it is assumed to call the given
# function with the given keyword arguments and return the elapsed seconds —
# confirm against that module.

# loop for iterating over increasingly large dataframes
# and gathering data about operations on them (three sizes)
for _ in range(1, 4):
    # increase the size of the dataframe
    nrows *= 2
    ncols *= 2

    # form a dataframe using randomized data
    df = pd.DataFrame(
        np.random.rand(nrows, ncols), columns=[f"col_{num}" for num in range(0, ncols)]
    )

    # run multiple times for error and average (four repetitions per size)
    for _ in range(1, 5):
        # remove any existing files in preparation for next steps;
        # whichever artifact is a directory is removed as a tree, by its own
        # name, and everything else is unlinked as a plain file
        for filename in benchmark_artifacts:
            if pathlib.Path(filename).is_dir():
                shutil.rmtree(filename)
            else:
                pathlib.Path(filename).unlink(missing_ok=True)
        # append data to the result list.
        # Dict entries are evaluated top to bottom, so every format is
        # written first, then measured on disk, then read back.
        results.append(
            {
                # general information about the dataframe
                "dataframe_shape (rows, cols)": str(df.shape),
                # information about CSV
                "csv_write_time (secs)": timer(
                    df.to_csv, path_or_buf=csv_name, compression="gzip"
                ),
                "csv_size (bytes)": os.stat(csv_name).st_size,
                "csv_read_time_all (secs)": timer(
                    pd.read_csv, filepath_or_buffer=csv_name, compression="gzip"
                ),
                "csv_read_time_one (secs)": timer(
                    pd.read_csv,
                    filepath_or_buffer=csv_name,
                    compression="gzip",
                    usecols=["col_2"],
                ),
                # information about SQLite
                "sqlite_write_time (secs)": timer(
                    df.to_sql,
                    name=sqlite_tbl_name,
                    con=f"sqlite:///{sqlite_name}",
                ),
                "sqlite_size (bytes)": os.stat(sqlite_name).st_size,
                "sqlite_read_time_all (secs)": timer(
                    pd.read_sql,
                    sql=f"SELECT * FROM {sqlite_tbl_name}",
                    con=f"sqlite:///{sqlite_name}",
                ),
                "sqlite_read_time_one (secs)": timer(
                    pd.read_sql,
                    sql=f"SELECT col_2 FROM {sqlite_tbl_name}",
                    con=f"sqlite:///{sqlite_name}",
                ),
                # information about anndata h5ad
                "anndata_h5ad_write_time (secs)": timer(
                    write_anndata,
                    df=df,
                    write_to="h5ad",
                    dest_path=anndata_h5_name,
                ),
                "anndata_h5ad_size (bytes)": os.stat(anndata_h5_name).st_size,
                "anndata_h5ad_read_time_all (secs)": timer(
                    read_anndata,
                    path=anndata_h5_name,
                    read_from="h5ad",
                ),
                # information about anndata zarr
                "anndata_zarr_write_time (secs)": timer(
                    write_anndata,
                    df=df,
                    write_to="zarr",
                    dest_path=anndata_zarr_name,
                ),
                # note: the zarr store is a directory, so its size is the sum
                # over every file found by recursing through it (rglob
                # already recurses; no extra "**/" prefix is needed).
                "anndata_zarr_size (bytes)": sum(
                    f.stat().st_size
                    for f in pathlib.Path(anndata_zarr_name).rglob("*")
                    if f.is_file()
                ),
                "anndata_zarr_read_time_all (secs)": timer(
                    read_anndata,
                    path=anndata_zarr_name,
                    read_from="zarr",
                ),
                # information about Parquet
                "parquet_write_time (secs)": timer(
                    df.to_parquet, path=parquet_name, compression="gzip"
                ),
                "parquet_size (bytes)": os.stat(parquet_name).st_size,
                "parquet_read_time_all (secs)": timer(
                    pd.read_parquet, path=parquet_name
                ),
                "parquet_read_time_one (secs)": timer(
                    pd.read_parquet, path=parquet_name, columns=["col_2"]
                ),
            }
        )


# one row per repetition, one column per measurement
df_results = pd.DataFrame(results)
df_results

  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)


Unnamed: 0,"dataframe_shape (rows, cols)",csv_write_time (secs),csv_size (bytes),csv_read_time_all (secs),csv_read_time_one (secs),sqlite_write_time (secs),sqlite_size (bytes),sqlite_read_time_all (secs),sqlite_read_time_one (secs),anndata_h5ad_write_time (secs),anndata_h5ad_size (bytes),anndata_h5ad_read_time_all (secs),anndata_zarr_write_time (secs),anndata_zarr_size (bytes),anndata_zarr_read_time_all (secs),parquet_write_time (secs),parquet_size (bytes),parquet_read_time_all (secs),parquet_read_time_one (secs)
0,"(640, 320)",0.842291,1796776,0.051321,0.033486,0.292911,2654208,0.081895,0.004012,0.015043,1689464,0.009666,0.200134,1444205,0.024827,0.122654,1996960,0.040673,0.009489
1,"(640, 320)",0.806049,1796776,0.049559,0.037302,0.281395,2654208,0.158389,0.004061,0.013186,1689464,0.008883,0.269879,1444205,0.025537,0.121052,1996960,0.040905,0.010552
2,"(640, 320)",0.811007,1796776,0.050351,0.037784,0.289146,2654208,0.080007,0.004056,0.013334,1689464,0.009278,0.224012,1444205,0.023597,0.118297,1996960,0.047455,0.00889
3,"(640, 320)",2.815953,1796776,0.050481,0.037987,0.287124,2654208,0.078846,0.004096,0.013307,1689464,0.008762,0.20373,1444205,0.018752,0.118394,1996960,0.046874,0.010313
4,"(1280, 640)",3.218594,7177156,0.187479,0.135454,0.975879,7909376,0.321715,0.005191,0.023874,6648696,0.013905,0.211492,5752196,0.032643,0.310097,7686116,0.0828,0.021087
5,"(1280, 640)",3.231007,7177156,0.171248,0.11712,1.004868,7909376,0.387579,0.005156,0.024271,6648696,0.01961,0.219477,5752196,0.036041,0.321031,7686116,0.073183,0.016686
6,"(1280, 640)",3.278998,7177156,0.185984,0.137208,0.992581,7909376,0.315881,0.005491,0.02493,6648696,0.014585,0.232451,5752196,0.044017,0.423086,7686116,0.075099,0.021984
7,"(1280, 640)",3.257791,7177156,0.183664,0.134662,0.929014,7909376,0.317495,0.005261,0.02394,6648696,0.014066,0.263685,5752196,0.039005,0.326676,7686116,0.08099,0.025437
8,"(2560, 1280)",12.86796,28692830,0.687113,0.422292,5.484956,31543296,1.218488,0.010993,0.073147,26389368,0.036874,0.314813,22974236,0.110739,1.24274,30261412,0.175512,0.029622
9,"(2560, 1280)",13.161694,28692830,2.615864,0.437876,3.852527,31543296,1.231781,0.011333,0.074586,26389368,0.037749,0.331743,22974236,0.085215,1.134063,30261412,0.166437,0.029078


In [8]:
# Summarise the repeated runs per dataframe shape: mean, min and max of every
# measurement, each ordered by CSV size (i.e. smallest dataframe first).
key = "dataframe_shape (rows, cols)"
size_order = "csv_size (bytes)"
runs_by_shape = df_results.groupby(key)

average = runs_by_shape.mean().reset_index().sort_values(by=size_order)
minimums = runs_by_shape.min().reset_index().sort_values(by=size_order)
maximums = runs_by_shape.max().reset_index().sort_values(by=size_order)

# Combine the three summaries side by side; the suffix records which
# statistic a column holds (e.g. "csv_write_time (secs) mean").
result = average.set_index(key).add_suffix(" mean")
for suffix, extremes in ((" min", minimums), (" max", maximums)):
    result = result.join(extremes.set_index(key).add_suffix(suffix))
result = result.reset_index()

result

Unnamed: 0,"dataframe_shape (rows, cols)",csv_write_time (secs) mean,csv_size (bytes) mean,csv_read_time_all (secs) mean,csv_read_time_one (secs) mean,sqlite_write_time (secs) mean,sqlite_size (bytes) mean,sqlite_read_time_all (secs) mean,sqlite_read_time_one (secs) mean,anndata_h5ad_write_time (secs) mean,...,anndata_h5ad_write_time (secs) max,anndata_h5ad_size (bytes) max,anndata_h5ad_read_time_all (secs) max,anndata_zarr_write_time (secs) max,anndata_zarr_size (bytes) max,anndata_zarr_read_time_all (secs) max,parquet_write_time (secs) max,parquet_size (bytes) max,parquet_read_time_all (secs) max,parquet_read_time_one (secs) max
0,"(640, 320)",1.318825,1796776.0,0.050428,0.03664,0.287644,2654208.0,0.099784,0.004056,0.013718,...,0.015043,1689464,0.009666,0.269879,1444205,0.025537,0.122654,1996960,0.047455,0.010552
1,"(1280, 640)",3.246597,7177156.0,0.182094,0.131111,0.975585,7909376.0,0.335667,0.005275,0.024254,...,0.02493,6648696,0.01961,0.263685,5752196,0.044017,0.423086,7686116,0.0828,0.025437
2,"(2560, 1280)",13.462006,28692830.0,1.15721,0.426322,4.13781,31543296.0,1.258269,0.011642,0.298224,...,0.971748,26389368,2.071073,0.331743,22974236,1.205168,1.256516,30261412,0.175512,0.040086


In [9]:
# write time barchart: mean duration per format, with min/max error bars
key = "dataframe_shape (rows, cols)"

# legend label -> (mean, min, max) columns of `result`
cols = {
    "CSV": (
        "csv_write_time (secs) mean",
        "csv_write_time (secs) min",
        "csv_write_time (secs) max",
    ),
    "SQLite": (
        "sqlite_write_time (secs) mean",
        "sqlite_write_time (secs) min",
        "sqlite_write_time (secs) max",
    ),
    "AnnData (H5AD)": (
        "anndata_h5ad_write_time (secs) mean",
        "anndata_h5ad_write_time (secs) min",
        "anndata_h5ad_write_time (secs) max",
    ),
    "AnnData (Zarr)": (
        "anndata_zarr_write_time (secs) mean",
        "anndata_zarr_write_time (secs) min",
        "anndata_zarr_write_time (secs) max",
    ),
    "Parquet": (
        "parquet_write_time (secs) mean",
        "parquet_write_time (secs) min",
        "parquet_write_time (secs) max",
    ),
}

# one long-form frame per format: the mean plus the distances from the mean
# up to the max and down to the min, which plotly draws as error bars
parts = [
    pd.DataFrame(
        {
            key: result[key],
            "format": fmt,
            "mean": result[mcol],
            "err_plus": result[maxcol] - result[mcol],
            "err_minus": result[mcol] - result[mincol],
        }
    )
    for fmt, (mcol, mincol, maxcol) in cols.items()
]

stats = pd.concat(parts, ignore_index=True)

# category order for the y axis: the shapes of `result`, reversed
y_order = result[key].tolist()[::-1]

fig = px.bar(
    stats.sort_values(by="format"),
    x="mean",
    y=key,
    color="format",
    error_x="err_plus",
    error_x_minus="err_minus",
    orientation="h",
    barmode="group",
    category_orders={key: y_order},
    labels={key: "Data Shape", "mean": "Seconds"},
    width=1300,
)
fig.update_layout(
    legend_title_text="File Write Duration",
    legend={"x": 0.68, "y": 0.02, "bgcolor": "rgba(255,255,255,0.8)"},
    font={"size": 18},
)

# export the figure, then display the exported image in the notebook
pio.write_image(fig, file_write_time_image)
Image(url=file_write_time_image)

In [10]:
# file size barchart: mean on-disk size per format and dataframe shape
key = "dataframe_shape (rows, cols)"

# benchmark column -> legend label
size_cols = {
    "csv_size (bytes)": "CSV",
    "sqlite_size (bytes)": "SQLite",
    "anndata_h5ad_size (bytes)": "AnnData (H5AD)",
    "anndata_zarr_size (bytes)": "AnnData (Zarr)",
    "parquet_size (bytes)": "Parquet",
}

# Long-form: one row per (run, format); repeated runs per shape are averaged
long = df_results.melt(
    id_vars=[key],
    value_vars=list(size_cols.keys()),
    var_name="col",
    value_name="bytes",
)
long["format"] = long["col"].map(size_cols)

stats = long.groupby([key, "format"], as_index=False)["bytes"].mean()

# Descending y-axis by total size across formats (largest first).
# Derived from this cell's own data so the chart does not depend on a summary
# table built in another cell.
y_order = (
    stats.groupby(key, as_index=False)["bytes"]
    .sum()
    .sort_values("bytes", ascending=False)[key]
    .tolist()
)

fig = px.bar(
    stats.sort_values(by="format"),
    x="bytes",
    y=key,
    color="format",
    orientation="h",
    barmode="group",
    category_orders={key: y_order},
    labels={key: "Data Shape", "bytes": "Bytes"},
    width=1300,
)

fig.update_layout(
    legend_title_text="File Size",
    legend=dict(x=0.72, y=0.02, bgcolor="rgba(255,255,255,0.8)"),
    font=dict(size=18),
)

# export the figure, then display the exported image in the notebook
pio.write_image(fig, file_storage_size_image)
Image(url=file_storage_size_image)

In [11]:
# read time barchart (all columns)
key = "dataframe_shape (rows, cols)"

cols = {
    "CSV": (
        "csv_read_time_all (secs) mean",
        "csv_read_time_all (secs) min",
        "csv_read_time_all (secs) max",
    ),
    "SQLite": (
        "sqlite_read_time_all (secs) mean",
        "sqlite_read_time_all (secs) min",
        "sqlite_read_time_all (secs) max",
    ),
    "AnnData (H5AD)": (
        "anndata_h5ad_read_time_all (secs) mean",
        "anndata_h5ad_read_time_all (secs) min",
        "anndata_h5ad_read_time_all (secs) max",
    ),
    "AnnData (Zarr)": (
        "anndata_zarr_read_time_all (secs) mean",
        "anndata_zarr_read_time_all (secs) min",
        "anndata_zarr_read_time_all (secs) max",
    ),
    "Parquet": (
        "parquet_read_time_all (secs) mean",
        "parquet_read_time_all (secs) min",
        "parquet_read_time_all (secs) max",
    ),
}

parts = []
for fmt, (mcol, mincol, maxcol) in cols.items():
    tmp = result[[key, mcol, mincol, maxcol]].copy()
    tmp["format"] = fmt
    tmp.rename(columns={mcol: "mean", mincol: "min", maxcol: "max"}, inplace=True)
    tmp["err_plus"] = tmp["max"] - tmp["mean"]
    tmp["err_minus"] = tmp["mean"] - tmp["min"]
    parts.append(tmp[[key, "format", "mean", "err_plus", "err_minus"]])


stats = pd.concat(parts, ignore_index=True)

y_order = result[key].iloc[::-1].tolist()

fig = px.bar(
    stats.sort_values(by="format"),
    x="mean",
    y=key,
    color="format",
    error_x="err_plus",
    error_x_minus="err_minus",
    orientation="h",
    barmode="group",
    category_orders={key: y_order},
    labels={key: "Data Shape", "mean": "Seconds"},
    width=1300,
)
fig.update_layout(
    legend_title_text="File Read Duration",
    legend=dict(x=0.68, y=0.02, bgcolor="rgba(255,255,255,0.8)"),
    font=dict(size=18),
)

pio.write_image(fig, file_read_time_all_image)
Image(url=file_read_time_all_image)

In [12]:
# read time line chart (one column): every individual run of the formats
# that were timed reading a single column
shape_column = "dataframe_shape (rows, cols)"
one_column_timings = [
    "csv_read_time_one (secs)",
    "sqlite_read_time_one (secs)",
    "parquet_read_time_one (secs)",
]

fig = px.line(
    df_results,
    x=shape_column,
    y=one_column_timings,
    labels={shape_column: "Data Shape", "value": "Seconds"},
    width=1300,
    color_discrete_sequence=px.colors.qualitative.D3,
)
fig.update_layout(
    legend_title_text="File Read Duration (one column)",
    legend={"x": 0.01, "y": 0.98, "bgcolor": "rgba(255,255,255,0.8)"},
    # global font size
    font={"size": 20},
)
# fixed x-axis range, kept exactly as tuned for this figure
fig.update_xaxes(range=[0, 2.13])
fig.update_traces(mode="lines+markers")

# export the figure, then display the exported image in the notebook
pio.write_image(fig, file_read_time_one_image)
Image(url=file_read_time_one_image)