# Why Parquet?

This notebook explores the benefits or drawbacks of using the [parquet](https://parquet.apache.org/docs/) file format relative to other formats such as CSV or SQLite.

In [1]:
import os
import pathlib
import shutil
import string
import warnings
from typing import Literal

import anndata as ad
import duckdb
import hdf5plugin
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from anndata import ImplicitModificationWarning
from IPython.display import Image
from utilities import get_system_info, timer

# ignore anndata warnings about index conversion
warnings.filterwarnings("ignore", category=ImplicitModificationWarning)

In [2]:
# show the system information
_ = get_system_info(show_output=True)


System Information:
Operating System: Darwin
Machine Type: arm64
Processor: arm
CPU Cores (Logical): 12
CPU Cores (Physical): 12
Total RAM (GB): 48.0
Python Version: 3.12.2


In [3]:
# target file or table names
image_dir = "images"
csv_name = "example.csv.gz"
parquet_noc_name = "example.parquet"
parquet_snappy_name = "example.snappy.parquet"
parquet_gzip_name = "example.gzip.parquet"
parquet_lz4_name = "example.lz4.parquet"
parquet_zstd_name = "example.zstd.parquet"
sqlite_name = "example.sqlite"
sqlite_tbl_name = "tbl_example"
anndata_h5_noc_name = "adata.noc.h5ad"
anndata_h5_gzip_name = "adata.gzip.h5ad"
anndata_h5_lz4_name = "adata.lz4.h5ad"
anndata_h5_zstd_name = "adata.zstd.h5ad"
anndata_zarr_name = "adata.zarr"
file_write_time_image = f"{image_dir}/parquet-comparisons-file-write-time.png"
file_storage_size_image = f"{image_dir}/parquet-comparisons-file-storage-size.png"
file_read_time_all_image = (
    f"{image_dir}/parquet-comparisons-file-read-time-all-columns.png"
)
file_read_time_one_image = (
    f"{image_dir}/parquet-comparisons-file-read-time-one-column.png"
)


def remove_files():
    """
    Utility function to remove files as needed.
    """
    for name in [
        csv_name,
        parquet_noc_name,
        parquet_snappy_name,
        parquet_gzip_name,
        parquet_lz4_name,
        parquet_zstd_name,
        sqlite_name,
        anndata_h5_noc_name,
        anndata_h5_gzip_name,
        anndata_h5_lz4_name,
        anndata_h5_zstd_name,
    ]:
        pathlib.Path(name).unlink(missing_ok=True)

    if pathlib.Path(anndata_zarr_name).is_dir():
        shutil.rmtree(anndata_zarr_name)


# remove all files just in case
remove_files()

In [4]:
def write_anndata(
    df: pd.DataFrame,
    write_to: Literal["h5ad", "zarr"],
    compression: Literal["gzip", "lz4", "zstd", "none"],
    dest_path: str,
) -> str:
    """
    Serialize a DataFrame to AnnData (h5ad or zarr).

    Numeric columns are stored in ``X`` (observations × variables). All
    remaining columns are stored in ``.obs``. Variable (feature) names are taken
    from the numeric column labels, and observation names from the DataFrame
    index.

    Args:
        df:
            Input table with rows as observations and columns as features.
        write_to:
            Output format. Either ``"h5ad"`` or ``"zarr"``.
        compression:
            The type of compression to use with
        dest_path:
            Destination file (``.h5ad``) or directory (zarr store)
            to write to. Parent directories are created if missing.

    Returns:
        The path written to as a string.
    """
    dest = pathlib.Path(dest_path)
    dest.parent.mkdir(parents=True, exist_ok=True)

    numeric = df.select_dtypes(include=["number"])
    if numeric.shape[1] == 0:
        raise ValueError("No numeric columns found to place in AnnData.X.")

    non_numeric = df.select_dtypes(exclude=["number"])

    adata = ad.AnnData(X=numeric)
    adata.obs_names = df.index.astype(str)
    adata.var_names = numeric.columns.astype(str)
    # Align non-numeric obs metadata to the same index
    adata.obs = non_numeric

    if write_to == "h5ad":
        # we default to use None for compression
        # meaning no compression.
        comp_arg = None
        if compression == "gzip":
            comp_arg = "gzip"
        elif compression == "zstd":
            comp_arg = hdf5plugin.FILTERS["zstd"]
        elif compression == "lz4":
            comp_arg = hdf5plugin.FILTERS["lz4"]

        adata.write_h5ad(filename=str(dest), compression=comp_arg)
    elif write_to == "zarr":
        # For zarr, the destination is a directory-like store
        adata.write_zarr(str(dest))
    else:
        raise ValueError('write_to must be "h5ad" or "zarr".')

    return str(dest)


def read_anndata(
    path: str,
    read_from: Literal["h5ad", "zarr"],
    read_one: bool = False,
) -> pd.DataFrame:
    """
    Load an AnnData file (h5ad or zarr) as a single pandas DataFrame.

    The returned DataFrame concatenates ``.obs`` (non-numeric metadata) with
    ``X`` converted to a DataFrame using the variable names.

    Args:
        path:
            Str path to the AnnData object. For zarr, this is a directory-like
            store; for h5ad, a file path.
        read_from:
            Input format. Either ``"h5ad"`` or ``"zarr"``.
        read_one:
            Whether to read just one column.

    Returns:
        A pandas DataFrame with ``.obs`` columns followed by the numeric
        columns from ``X`` (``adata.to_df()``), indexed from 0..n-1.
    """

    if read_from == "h5ad":
        adata = ad.read_h5ad(path)
    elif read_from == "zarr":
        adata = ad.read_zarr(path)
    else:
        raise ValueError('read_from must be "h5ad" or "zarr".')

    if read_one:
        return adata.to_df()["col_2"]

    return adata.obs.join(adata.to_df(), how="left").reset_index(drop=True)

In [5]:
# avoid a "cold start" for tested packages by using them before benchmarks
df = pd.DataFrame(np.random.rand(2, 2), columns=[f"col_{num}" for num in range(0, 2)])
# export and read using various methods
df.to_csv(path_or_buf=csv_name, compression="gzip")
pd.read_csv(filepath_or_buffer=csv_name, compression="gzip")
df.to_sql(name=sqlite_tbl_name, con=f"sqlite:///{sqlite_name}")
pd.read_sql(sql=f"SELECT * FROM {sqlite_tbl_name}", con=f"sqlite:///{sqlite_name}")
df.to_parquet(path=parquet_gzip_name, compression="gzip")
pd.read_parquet(path=parquet_gzip_name)

Unnamed: 0,col_0,col_1
0,0.154316,0.796868
1,0.737232,0.859234


In [6]:
# remove any existing prior work
for filename in [csv_name, parquet_gzip_name, sqlite_name]:
    pathlib.Path(filename).unlink(missing_ok=True)

In [7]:
# starting rowcount and col count
nrows = 320
ncols = 124

# result list for storing data
results = []

# loop for iterating over increasingly large dataframes
# and gathering data about operations on them
for _ in range(1, 6):
    # increase the size of the dataframe
    nrows *= 2
    ncols *= 2

    # form a dataframe using randomized data
    df = pd.DataFrame(
        np.random.rand(nrows, ncols), columns=[f"col_{num}" for num in range(0, ncols)]
    )

    # add some string data
    alphabet = np.array(list(string.ascii_lowercase + string.digits))
    df = df.assign(
        **{
            f"str_{i+1}": [
                "".join(np.random.default_rng(10).choice(alphabet, 10))
                for _ in range(len(df))
            ]
            for i in range(10)
        }
    )

    print(df.shape)

    # run multiple times for error and average
    for _ in range(1, 5):
        # remove any existing files in preparation for next steps
        remove_files()
        # append data to the result list
        results.append(
            {
                # general information about the dataframe
                "dataframe_shape (rows, cols)": str(df.shape),
                # information about CSV (uncompressed)
                "csv_write_time (secs)": timer(
                    df.to_csv, path_or_buf=csv_name, compression="gzip"
                ),
                "csv_size (bytes)": os.stat(csv_name).st_size,
                "csv_read_time_all (secs)": timer(
                    pd.read_csv, filepath_or_buffer=csv_name, compression="gzip"
                ),
                "csv_read_time_one (secs)": timer(
                    pd.read_csv,
                    filepath_or_buffer=csv_name,
                    compression="gzip",
                    usecols=["col_2"],
                ),
                # information about SQLite
                "sqlite_write_time (secs)": (
                    timer(
                        df.to_sql,
                        name=sqlite_tbl_name,
                        con=f"sqlite:///{sqlite_name}",
                    )
                    if ncols < 2000
                    else None
                ),
                "sqlite_size (bytes)": (
                    os.stat(sqlite_name).st_size if ncols < 2000 else None
                ),
                "sqlite_read_time_all (secs)": (
                    timer(
                        pd.read_sql,
                        sql=f"SELECT * FROM {sqlite_tbl_name}",
                        con=f"sqlite:///{sqlite_name}",
                    )
                    if ncols < 2000
                    else None
                ),
                "sqlite_read_time_one (secs)": (
                    timer(
                        pd.read_sql,
                        sql=f"SELECT col_2 FROM {sqlite_tbl_name}",
                        con=f"sqlite:///{sqlite_name}",
                    )
                    if ncols < 2000
                    else None
                ),
                # information about anndata h5ad (no compression)
                "anndata_h5ad_noc_write_time (secs)": timer(
                    write_anndata,
                    df=df,
                    write_to="h5ad",
                    compression="none",
                    dest_path=anndata_h5_noc_name,
                ),
                "anndata_h5ad_noc_size (bytes)": os.stat(anndata_h5_noc_name).st_size,
                "anndata_h5ad_noc_read_time_all (secs)": timer(
                    read_anndata,
                    path=anndata_h5_noc_name,
                    read_from="h5ad",
                    read_one=False,
                ),
                "anndata_h5ad_noc_read_time_one (secs)": timer(
                    read_anndata,
                    path=anndata_h5_noc_name,
                    read_from="h5ad",
                    read_one=True,
                ),
                # information about anndata h5ad (gzip)
                "anndata_h5ad_gzip_write_time (secs)": timer(
                    write_anndata,
                    df=df,
                    write_to="h5ad",
                    compression="gzip",
                    dest_path=anndata_h5_gzip_name,
                ),
                "anndata_h5ad_gzip_size (bytes)": os.stat(anndata_h5_gzip_name).st_size,
                "anndata_h5ad_gzip_read_time_all (secs)": timer(
                    read_anndata,
                    path=anndata_h5_gzip_name,
                    read_from="h5ad",
                    read_one=False,
                ),
                "anndata_h5ad_gzip_read_time_one (secs)": timer(
                    read_anndata,
                    path=anndata_h5_gzip_name,
                    read_from="h5ad",
                    read_one=True,
                ),
                # information about anndata h5ad (lz4)
                "anndata_h5ad_lz4_write_time (secs)": timer(
                    write_anndata,
                    df=df,
                    write_to="h5ad",
                    compression="lz4",
                    dest_path=anndata_h5_lz4_name,
                ),
                "anndata_h5ad_lz4_size (bytes)": os.stat(anndata_h5_lz4_name).st_size,
                "anndata_h5ad_lz4_read_time_all (secs)": timer(
                    read_anndata,
                    path=anndata_h5_lz4_name,
                    read_from="h5ad",
                    read_one=False,
                ),
                "anndata_h5ad_lz4_read_time_one (secs)": timer(
                    read_anndata,
                    path=anndata_h5_lz4_name,
                    read_from="h5ad",
                    read_one=True,
                ),
                # information about anndata h5ad (zstd)
                "anndata_h5ad_zstd_write_time (secs)": timer(
                    write_anndata,
                    df=df,
                    write_to="h5ad",
                    compression="zstd",
                    dest_path=anndata_h5_zstd_name,
                ),
                "anndata_h5ad_zstd_size (bytes)": os.stat(anndata_h5_zstd_name).st_size,
                "anndata_h5ad_zstd_read_time_all (secs)": timer(
                    read_anndata,
                    path=anndata_h5_zstd_name,
                    read_from="h5ad",
                    read_one=False,
                ),
                "anndata_h5ad_zstd_read_time_one (secs)": timer(
                    read_anndata,
                    path=anndata_h5_zstd_name,
                    read_from="h5ad",
                    read_one=True,
                ),
                # information about anndata zarr
                "anndata_zarr_write_time (secs)": timer(
                    write_anndata,
                    df=df,
                    write_to="zarr",
                    compression="none",
                    dest_path=anndata_zarr_name,
                ),
                # note: we use a comprehension below to recurse through
                # the zarr directory for a true estimate of size.
                "anndata_zarr_size (bytes)": sum(
                    f.stat().st_size
                    for f in pathlib.Path(anndata_zarr_name).rglob("**/*")
                    if f.is_file()
                ),
                "anndata_zarr_read_time_all (secs)": timer(
                    read_anndata,
                    path=anndata_zarr_name,
                    read_from="zarr",
                    read_one=False,
                ),
                "anndata_zarr_read_time_one (secs)": timer(
                    read_anndata,
                    path=anndata_zarr_name,
                    read_from="zarr",
                    read_one=True,
                ),
                # information about Parquet with no compression
                "parquet_noc_write_time (secs)": timer(
                    df.to_parquet, path=parquet_noc_name, compression=None
                ),
                "parquet_noc_size (bytes)": os.stat(parquet_noc_name).st_size,
                "parquet_noc_read_time_all (secs)": timer(
                    pd.read_parquet, path=parquet_noc_name
                ),
                "parquet_noc_read_time_one (secs)": timer(
                    pd.read_parquet, path=parquet_noc_name, columns=["col_2"]
                ),
                # information about Parquet with snappy compression
                "parquet_snappy_write_time (secs)": timer(
                    df.to_parquet, path=parquet_snappy_name, compression="snappy"
                ),
                "parquet_snappy_size (bytes)": os.stat(parquet_snappy_name).st_size,
                "parquet_snappy_read_time_all (secs)": timer(
                    pd.read_parquet, path=parquet_snappy_name
                ),
                "parquet_snappy_read_time_one (secs)": timer(
                    pd.read_parquet, path=parquet_snappy_name, columns=["col_2"]
                ),
                # information about Parquet with gzip compression
                "parquet_gzip_write_time (secs)": timer(
                    df.to_parquet, path=parquet_gzip_name, compression="gzip"
                ),
                "parquet_gzip_size (bytes)": os.stat(parquet_gzip_name).st_size,
                "parquet_gzip_read_time_all (secs)": timer(
                    pd.read_parquet, path=parquet_gzip_name
                ),
                "parquet_gzip_read_time_one (secs)": timer(
                    pd.read_parquet, path=parquet_gzip_name, columns=["col_2"]
                ),
                # information about Parquet with zstd compression
                "parquet_zstd_write_time (secs)": timer(
                    df.to_parquet, path=parquet_zstd_name, compression="zstd"
                ),
                "parquet_zstd_size (bytes)": os.stat(parquet_zstd_name).st_size,
                "parquet_zstd_read_time_all (secs)": timer(
                    pd.read_parquet, path=parquet_zstd_name
                ),
                "parquet_zstd_read_time_one (secs)": timer(
                    pd.read_parquet, path=parquet_zstd_name, columns=["col_2"]
                ),
                # information about Parquet with lz4 compression
                "parquet_lz4_write_time (secs)": timer(
                    df.to_parquet, path=parquet_lz4_name, compression="lz4"
                ),
                "parquet_lz4_size (bytes)": os.stat(parquet_lz4_name).st_size,
                "parquet_lz4_read_time_all (secs)": timer(
                    pd.read_parquet, path=parquet_lz4_name
                ),
                "parquet_lz4_read_time_one (secs)": timer(
                    pd.read_parquet, path=parquet_lz4_name, columns=["col_2"]
                ),
            }
        )


df_results = pd.DataFrame(results)
df_results

(640, 258)
(1280, 506)


Unnamed: 0,"dataframe_shape (rows, cols)",csv_write_time (secs),csv_size (bytes),csv_read_time_all (secs),csv_read_time_one (secs),sqlite_write_time (secs),sqlite_size (bytes),sqlite_read_time_all (secs),sqlite_read_time_one (secs),anndata_h5ad_noc_write_time (secs),...,parquet_gzip_read_time_all (secs),parquet_gzip_read_time_one (secs),parquet_zstd_write_time (secs),parquet_zstd_size (bytes),parquet_zstd_read_time_all (secs),parquet_zstd_read_time_one (secs),parquet_lz4_write_time (secs),parquet_lz4_size (bytes),parquet_lz4_read_time_all (secs),parquet_lz4_read_time_one (secs)
0,"(640, 258)",0.355772,1396284,0.023583,0.012523,0.060731,2654208,0.018013,0.001717,0.009688,...,0.028125,0.002779,0.028946,1544136,0.010563,0.002736,0.014473,1614683,0.011104,0.002812
1,"(640, 258)",0.373457,1396284,0.021033,0.013035,0.089561,2654208,0.019476,0.001774,0.009084,...,0.012556,0.002649,0.017807,1544136,0.010388,0.002704,0.013382,1614683,0.009761,0.002479
2,"(640, 258)",0.415735,1396284,0.022612,0.016394,0.064994,2654208,0.025093,0.002818,0.013014,...,0.018105,0.003096,0.017724,1544136,0.013265,0.002813,0.01531,1614683,0.019062,0.002693
3,"(640, 258)",0.368796,1396284,0.022502,0.014335,0.062763,2654208,0.02304,0.002444,0.012158,...,0.021093,0.003093,0.017088,1544136,0.016057,0.002801,0.014475,1614683,0.016393,0.002633
4,"(1280, 506)",1.524452,5570451,0.076728,0.053763,0.252394,5935104,0.390468,0.004775,0.016109,...,0.022171,0.005242,0.054853,5940887,0.022449,0.004629,0.034645,6254022,0.028063,0.005024
5,"(1280, 506)",1.440919,5570451,0.069477,0.047721,0.213947,5935104,0.32242,0.007824,0.010158,...,0.051263,0.005441,0.040757,5940887,0.02391,0.004431,0.034287,6254022,0.018256,0.004432
6,"(1280, 506)",1.671773,5570451,0.076862,0.056179,0.229379,5935104,0.362107,0.003811,0.012154,...,0.025916,0.004868,0.049272,5940887,0.021582,0.004972,0.036969,6254022,0.021805,0.005103
7,"(1280, 506)",1.707333,5570451,0.072484,0.05259,0.267593,5935104,0.344293,0.003363,0.010884,...,0.021979,0.004538,0.040599,5940887,0.019143,0.004661,0.034367,6254022,0.021142,0.004879


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)


  return dispatch(args[0].__class__)(*args, **kw)


Unnamed: 0,"dataframe_shape (rows, cols)",csv_write_time (secs),csv_size (bytes),csv_read_time_all (secs),csv_read_time_one (secs),sqlite_write_time (secs),sqlite_size (bytes),sqlite_read_time_all (secs),sqlite_read_time_one (secs),anndata_h5ad_write_time (secs),anndata_h5ad_size (bytes),anndata_h5ad_read_time_all (secs),anndata_zarr_write_time (secs),anndata_zarr_size (bytes),anndata_zarr_read_time_all (secs),parquet_write_time (secs),parquet_size (bytes),parquet_read_time_all (secs),parquet_read_time_one (secs)
0,"(640, 320)",0.842291,1796776,0.051321,0.033486,0.292911,2654208,0.081895,0.004012,0.015043,1689464,0.009666,0.200134,1444205,0.024827,0.122654,1996960,0.040673,0.009489
1,"(640, 320)",0.806049,1796776,0.049559,0.037302,0.281395,2654208,0.158389,0.004061,0.013186,1689464,0.008883,0.269879,1444205,0.025537,0.121052,1996960,0.040905,0.010552
2,"(640, 320)",0.811007,1796776,0.050351,0.037784,0.289146,2654208,0.080007,0.004056,0.013334,1689464,0.009278,0.224012,1444205,0.023597,0.118297,1996960,0.047455,0.00889
3,"(640, 320)",2.815953,1796776,0.050481,0.037987,0.287124,2654208,0.078846,0.004096,0.013307,1689464,0.008762,0.20373,1444205,0.018752,0.118394,1996960,0.046874,0.010313
4,"(1280, 640)",3.218594,7177156,0.187479,0.135454,0.975879,7909376,0.321715,0.005191,0.023874,6648696,0.013905,0.211492,5752196,0.032643,0.310097,7686116,0.0828,0.021087
5,"(1280, 640)",3.231007,7177156,0.171248,0.11712,1.004868,7909376,0.387579,0.005156,0.024271,6648696,0.01961,0.219477,5752196,0.036041,0.321031,7686116,0.073183,0.016686
6,"(1280, 640)",3.278998,7177156,0.185984,0.137208,0.992581,7909376,0.315881,0.005491,0.02493,6648696,0.014585,0.232451,5752196,0.044017,0.423086,7686116,0.075099,0.021984
7,"(1280, 640)",3.257791,7177156,0.183664,0.134662,0.929014,7909376,0.317495,0.005261,0.02394,6648696,0.014066,0.263685,5752196,0.039005,0.326676,7686116,0.08099,0.025437
8,"(2560, 1280)",12.86796,28692830,0.687113,0.422292,5.484956,31543296,1.218488,0.010993,0.073147,26389368,0.036874,0.314813,22974236,0.110739,1.24274,30261412,0.175512,0.029622
9,"(2560, 1280)",13.161694,28692830,2.615864,0.437876,3.852527,31543296,1.231781,0.011333,0.074586,26389368,0.037749,0.331743,22974236,0.085215,1.134063,30261412,0.166437,0.029078


In [8]:
average = (
    df_results.groupby("dataframe_shape (rows, cols)")
    .mean()
    .reset_index()
    .sort_values(by="csv_size (bytes)")
)
minimums = (
    df_results.groupby("dataframe_shape (rows, cols)")
    .min()
    .reset_index()
    .sort_values(by="csv_size (bytes)")
)
maximums = (
    df_results.groupby("dataframe_shape (rows, cols)")
    .max()
    .reset_index()
    .sort_values(by="csv_size (bytes)")
)

key = "dataframe_shape (rows, cols)"

result = (
    average.set_index(key)
    .add_suffix(" mean")
    .join(minimums.set_index(key).add_suffix(" min"))
    .join(maximums.set_index(key).add_suffix(" max"))
    .reset_index()
)

result

Unnamed: 0,"dataframe_shape (rows, cols)",csv_write_time (secs) mean,csv_size (bytes) mean,csv_read_time_all (secs) mean,csv_read_time_one (secs) mean,sqlite_write_time (secs) mean,sqlite_size (bytes) mean,sqlite_read_time_all (secs) mean,sqlite_read_time_one (secs) mean,anndata_h5ad_noc_write_time (secs) mean,...,parquet_gzip_read_time_all (secs) max,parquet_gzip_read_time_one (secs) max,parquet_zstd_write_time (secs) max,parquet_zstd_size (bytes) max,parquet_zstd_read_time_all (secs) max,parquet_zstd_read_time_one (secs) max,parquet_lz4_write_time (secs) max,parquet_lz4_size (bytes) max,parquet_lz4_read_time_all (secs) max,parquet_lz4_read_time_one (secs) max
0,"(640, 258)",0.37844,1396284.0,0.022433,0.014072,0.069512,2654208.0,0.021406,0.002188,0.010986,...,0.028125,0.003096,0.028946,1544136,0.016057,0.002813,0.01531,1614683,0.019062,0.002812
1,"(1280, 506)",1.586119,5570451.0,0.073888,0.052563,0.240828,5935104.0,0.354822,0.004943,0.012326,...,0.051263,0.005441,0.054853,5940887,0.02391,0.004972,0.036969,6254022,0.028063,0.005103


In [9]:
key = "dataframe_shape (rows, cols)"

cols = {
    "CSV (GZIP)": (
        "csv_write_time (secs) mean",
        "csv_write_time (secs) min",
        "csv_write_time (secs) max",
    ),
    "SQLite": (
        "sqlite_write_time (secs) mean",
        "sqlite_write_time (secs) min",
        "sqlite_write_time (secs) max",
    ),
    "AnnData (H5AD - uncompressed)": (
        "anndata_h5ad_noc_write_time (secs) mean",
        "anndata_h5ad_noc_write_time (secs) min",
        "anndata_h5ad_noc_write_time (secs) max",
    ),
    "AnnData (H5AD - GZIP)": (
        "anndata_h5ad_gzip_write_time (secs) mean",
        "anndata_h5ad_gzip_write_time (secs) min",
        "anndata_h5ad_gzip_write_time (secs) max",
    ),
    "AnnData (H5AD - ZSTD)": (
        "anndata_h5ad_zstd_write_time (secs) mean",
        "anndata_h5ad_zstd_write_time (secs) min",
        "anndata_h5ad_zstd_write_time (secs) max",
    ),
    "AnnData (H5AD - LZ4) (": (
        "anndata_h5ad_lz4_write_time (secs) mean",
        "anndata_h5ad_lz4_write_time (secs) min",
        "anndata_h5ad_lz4_write_time (secs) max",
    ),
    "AnnData (Zarr)": (
        "anndata_zarr_write_time (secs) mean",
        "anndata_zarr_write_time (secs) min",
        "anndata_zarr_write_time (secs) max",
    ),
    "Parquet (uncompressed)": (
        "parquet_noc_write_time (secs) mean",
        "parquet_noc_write_time (secs) min",
        "parquet_noc_write_time (secs) max",
    ),
    "Parquet (Snappy)": (
        "parquet_snappy_write_time (secs) mean",
        "parquet_snappy_write_time (secs) min",
        "parquet_snappy_write_time (secs) max",
    ),
    "Parquet (GZIP)": (
        "parquet_gzip_write_time (secs) mean",
        "parquet_gzip_write_time (secs) min",
        "parquet_gzip_write_time (secs) max",
    ),
    "Parquet (ZSTD)": (
        "parquet_zstd_write_time (secs) mean",
        "parquet_zstd_write_time (secs) min",
        "parquet_zstd_write_time (secs) max",
    ),
    "Parquet (LZ4)": (
        "parquet_lz4_write_time (secs) mean",
        "parquet_lz4_write_time (secs) min",
        "parquet_lz4_write_time (secs) max",
    ),
}

parts = []
for fmt, (mcol, mincol, maxcol) in cols.items():
    tmp = result[[key, mcol, mincol, maxcol]].copy()
    tmp["format"] = fmt
    tmp.rename(columns={mcol: "mean", mincol: "min", maxcol: "max"}, inplace=True)
    tmp["err_plus"] = tmp["max"] - tmp["mean"]
    tmp["err_minus"] = tmp["mean"] - tmp["min"]
    parts.append(tmp[[key, "format", "mean", "err_plus", "err_minus"]])


stats = pd.concat(parts, ignore_index=True)

x_order = result[key].tolist()  # not reversed; use iloc[::-1] to reverse
pos = {k: i for i, k in enumerate(x_order)}  # category → position index

# 2) give each row its x position and sort per-trace
stats = stats.assign(xpos=stats[key].map(pos)).sort_values(["format", "xpos"])

fig = px.line(
    stats,  # already trace-sorted by xpos
    x=key,
    y="mean",
    color="format",
    error_y="err_plus",
    error_y_minus="err_minus",
    markers=True,
    category_orders={key: x_order},  # sets axis order & legend hover categories
    labels={key: "Data Shape", "mean": "Seconds (log)"},
    width=1300,
    log_y=True,
    title="File format write time duration (seconds)"
)
fig.update_traces(mode="lines+markers")
fig.update_traces(marker_color=None, line_color=None).update_layout(
    colorway=px.colors.qualitative.Dark24
)
fig.update_layout(legend_title_text="Format")


pio.write_image(fig, file_write_time_image)
Image(url=file_write_time_image)

In [10]:
key = "dataframe_shape (rows, cols)"

size_cols = {
    "csv_size (bytes)": "CSV (GZIP)",
    "sqlite_size (bytes)": "SQLite",
    "anndata_h5ad_noc_size (bytes)": "AnnData (H5AD - uncompressed)",
    "anndata_h5ad_gzip_size (bytes)": "AnnData (H5AD - GZIP)",
    "anndata_h5ad_lz4_size (bytes)": "AnnData (H5AD - LZ4)",
    "anndata_h5ad_zstd_size (bytes)": "AnnData (H5AD - ZSTD)",
    "anndata_zarr_size (bytes)": "AnnData (Zarr)",
    "parquet_noc_size (bytes)": "Parquet (uncompressed)",
    "parquet_snappy_size (bytes)": "Parquet (Snappy)",
    "parquet_gzip_size (bytes)": "Parquet (GZIP)",
    "parquet_zstd_size (bytes)": "Parquet (ZSTD)",
    "parquet_lz4_size (bytes)": "Parquet (LZ4)",
}

# Long-form + average across repeats
long = df_results.melt(
    id_vars=[key],
    value_vars=list(size_cols.keys()),
    var_name="col",
    value_name="bytes",
).dropna(subset=["bytes"])
long["format"] = long["col"].map(size_cols)

stats = long.groupby([key, "format"], as_index=False)["bytes"].mean()

# Choose x-axis category order (keep your current result order, reversed here).
x_order = result[key].iloc[::-1].tolist()

# Ensure each trace's points follow that order (pre-sort rows)
pos = {cat: i for i, cat in enumerate(x_order)}
stats_sorted = stats.assign(xpos=stats[key].map(pos)).sort_values(["format", "xpos"])

fig = px.line(
    stats_sorted,
    x=key,
    y="bytes",
    color="format",
    markers=True,
    category_orders={key: x_order},
    labels={key: "Data Shape", "bytes": "Bytes"},
    width=1300,
    title="File format size (bytes)"
)

fig.update_traces(mode="lines+markers")
fig.update_xaxes(autorange="reversed")
fig.update_traces(marker_color=None, line_color=None).update_layout(
    colorway=px.colors.qualitative.Dark24
)
fig.update_layout(
    legend=dict(
        x=1.02,
        y=1,  # just outside the plotting area
        xanchor="left",
        yanchor="top",
        bgcolor="rgba(255,255,255,0.8)",
    ),
    margin=dict(r=220),  # add right margin so legend fits
    font=dict(size=18),
)

pio.write_image(fig, file_storage_size_image)
Image(url=file_storage_size_image)

In [11]:
# read time barchart (all columns)
key = "dataframe_shape (rows, cols)"

cols = {
    "CSV (GZIP)": (
        "csv_read_time_all (secs) mean",
        "csv_read_time_all (secs) min",
        "csv_read_time_all (secs) max",
    ),
    "SQLite": (
        "sqlite_read_time_all (secs) mean",
        "sqlite_read_time_all (secs) min",
        "sqlite_read_time_all (secs) max",
    ),
    "AnnData (H5AD - uncompressed)": (
        "anndata_h5ad_noc_read_time_all (secs) mean",
        "anndata_h5ad_noc_read_time_all (secs) min",
        "anndata_h5ad_noc_read_time_all (secs) max",
    ),
    "AnnData (H5AD - GZIP)": (
        "anndata_h5ad_gzip_read_time_all (secs) mean",
        "anndata_h5ad_gzip_read_time_all (secs) min",
        "anndata_h5ad_gzip_read_time_all (secs) max",
    ),
    "AnnData (H5AD - ZSTD)": (
        "anndata_h5ad_zstd_read_time_all (secs) mean",
        "anndata_h5ad_zstd_read_time_all (secs) min",
        "anndata_h5ad_zstd_read_time_all (secs) max",
    ),
    "AnnData (H5AD - LZ4) (": (
        "anndata_h5ad_lz4_read_time_all (secs) mean",
        "anndata_h5ad_lz4_read_time_all (secs) min",
        "anndata_h5ad_lz4_read_time_all (secs) max",
    ),
    "AnnData (Zarr)": (
        "anndata_zarr_read_time_all (secs) mean",
        "anndata_zarr_read_time_all (secs) min",
        "anndata_zarr_read_time_all (secs) max",
    ),
    "Parquet (uncompressed)": (
        "parquet_noc_read_time_all (secs) mean",
        "parquet_noc_read_time_all (secs) min",
        "parquet_noc_read_time_all (secs) max",
    ),
    "Parquet (Snappy)": (
        "parquet_snappy_read_time_all (secs) mean",
        "parquet_snappy_read_time_all (secs) min",
        "parquet_snappy_read_time_all (secs) max",
    ),
    "Parquet (GZIP)": (
        "parquet_gzip_read_time_all (secs) mean",
        "parquet_gzip_read_time_all (secs) min",
        "parquet_gzip_read_time_all (secs) max",
    ),
    "Parquet (ZSTD)": (
        "parquet_zstd_read_time_all (secs) mean",
        "parquet_zstd_read_time_all (secs) min",
        "parquet_zstd_read_time_all (secs) max",
    ),
    "Parquet (LZ4)": (
        "parquet_lz4_read_time_all (secs) mean",
        "parquet_lz4_read_time_all (secs) min",
        "parquet_lz4_read_time_all (secs) max",
    ),
}

parts = []
for fmt, (mcol, mincol, maxcol) in cols.items():
    tmp = result[[key, mcol, mincol, maxcol]].copy()
    tmp["format"] = fmt
    tmp.rename(columns={mcol: "mean", mincol: "min", maxcol: "max"}, inplace=True)
    tmp["err_plus"] = tmp["max"] - tmp["mean"]
    tmp["err_minus"] = tmp["mean"] - tmp["min"]
    parts.append(tmp[[key, "format", "mean", "err_plus", "err_minus"]])


stats = pd.concat(parts, ignore_index=True)

x_order = result[key].tolist()  # not reversed; use iloc[::-1] to reverse
pos = {k: i for i, k in enumerate(x_order)}  # category → position index

# 2) give each row its x position and sort per-trace
stats = stats.assign(xpos=stats[key].map(pos)).sort_values(["format", "xpos"])

fig = px.line(
    stats,  # already trace-sorted by xpos
    x=key,
    y="mean",
    color="format",
    error_y="err_plus",
    error_y_minus="err_minus",
    markers=True,
    category_orders={key: x_order},  # sets axis order & legend hover categories
    labels={key: "Data Shape", "mean": "Seconds"},
    width=1300,
    log_y=True,
    title="File format read time duration (full dataset) (seconds)"
)
fig.update_traces(mode="lines+markers")
fig.update_traces(marker_color=None, line_color=None).update_layout(
    colorway=px.colors.qualitative.Dark24
)
fig.update_layout(legend_title_text="Format")


pio.write_image(fig, file_read_time_all_image)
Image(url=file_read_time_all_image)

In [12]:
# read time barchart (one column)

key = "dataframe_shape (rows, cols)"

cols = {
    "CSV (GZIP)": (
        "csv_read_time_one (secs) mean",
        "csv_read_time_one (secs) min",
        "csv_read_time_one (secs) max",
    ),
    "SQLite": (
        "sqlite_read_time_one (secs) mean",
        "sqlite_read_time_one (secs) min",
        "sqlite_read_time_one (secs) max",
    ),
    "AnnData (H5AD - uncompressed)": (
        "anndata_h5ad_noc_read_time_one (secs) mean",
        "anndata_h5ad_noc_read_time_one (secs) min",
        "anndata_h5ad_noc_read_time_one (secs) max",
    ),
    "AnnData (H5AD - GZIP)": (
        "anndata_h5ad_gzip_read_time_one (secs) mean",
        "anndata_h5ad_gzip_read_time_one (secs) min",
        "anndata_h5ad_gzip_read_time_one (secs) max",
    ),
    "AnnData (H5AD - ZSTD)": (
        "anndata_h5ad_zstd_read_time_one (secs) mean",
        "anndata_h5ad_zstd_read_time_one (secs) min",
        "anndata_h5ad_zstd_read_time_one (secs) max",
    ),
    "AnnData (H5AD - LZ4) (": (
        "anndata_h5ad_lz4_read_time_one (secs) mean",
        "anndata_h5ad_lz4_read_time_one (secs) min",
        "anndata_h5ad_lz4_read_time_one (secs) max",
    ),
    "AnnData (Zarr)": (
        "anndata_zarr_read_time_one (secs) mean",
        "anndata_zarr_read_time_one (secs) min",
        "anndata_zarr_read_time_one (secs) max",
    ),
    "Parquet (uncompressed)": (
        "parquet_noc_read_time_one (secs) mean",
        "parquet_noc_read_time_one (secs) min",
        "parquet_noc_read_time_one (secs) max",
    ),
    "Parquet (Snappy)": (
        "parquet_snappy_read_time_one (secs) mean",
        "parquet_snappy_read_time_one (secs) min",
        "parquet_snappy_read_time_one (secs) max",
    ),
    "Parquet (GZIP)": (
        "parquet_gzip_read_time_one (secs) mean",
        "parquet_gzip_read_time_one (secs) min",
        "parquet_gzip_read_time_one (secs) max",
    ),
    "Parquet (ZSTD)": (
        "parquet_zstd_read_time_one (secs) mean",
        "parquet_zstd_read_time_one (secs) min",
        "parquet_zstd_read_time_one (secs) max",
    ),
    "Parquet (LZ4)": (
        "parquet_lz4_read_time_one (secs) mean",
        "parquet_lz4_read_time_one (secs) min",
        "parquet_lz4_read_time_one (secs) max",
    ),
}


parts = []
for fmt, (mcol, mincol, maxcol) in cols.items():
    tmp = result[[key, mcol, mincol, maxcol]].copy()
    tmp["format"] = fmt
    tmp.rename(columns={mcol: "mean", mincol: "min", maxcol: "max"}, inplace=True)
    tmp["err_plus"] = tmp["max"] - tmp["mean"]
    tmp["err_minus"] = tmp["mean"] - tmp["min"]
    parts.append(tmp[[key, "format", "mean", "err_plus", "err_minus"]])


stats = pd.concat(parts, ignore_index=True)

x_order = result[key].tolist()  # not reversed; use iloc[::-1] to reverse
pos = {k: i for i, k in enumerate(x_order)}  # category → position index

# 2) give each row its x position and sort per-trace
stats = stats.assign(xpos=stats[key].map(pos)).sort_values(["format", "xpos"])

fig = px.line(
    stats,  # already trace-sorted by xpos
    x=key,
    y="mean",
    color="format",
    error_y="err_plus",
    error_y_minus="err_minus",
    markers=True,
    category_orders={key: x_order},  # sets axis order & legend hover categories
    labels={key: "Data Shape", "mean": "Seconds (log)"},
    width=1300,
    log_y=True,
    title="File format read time duration (one column) (seconds)"
)
fig.update_traces(mode="lines+markers")
fig.update_traces(marker_color=None, line_color=None).update_layout(
    colorway=px.colors.qualitative.Dark24
)
fig.update_layout(legend_title_text="Format")


pio.write_image(fig, file_read_time_one_image)
Image(url=file_read_time_one_image)