# End-to-end benchmarks
This notebook can run on master as well as on #370.

See `plans_benchmarks.ipynb` for performance measurements of the matching plans in isolation.

In [1]:
import csv
import sys
import tempfile
import time

import h5py
import numpy as np
from versioned_hdf5 import VersionedHDF5File

# Report which implementation of these two submodules was loaded;
# sys.modules.get() yields None for a module that has not been imported.
for submodule in ("staged_changes", "subchunk_map"):
    print(sys.modules.get(f"versioned_hdf5.{submodule}"))

# Scratch directory holding the HDF5 file used by every cell below.
tmpdir = tempfile.TemporaryDirectory()
path = f"{tmpdir.name}/data.h5"
print(path)

<module 'versioned_hdf5.staged_changes' from '/home/crusaderky/github/versioned-hdf5/versioned_hdf5/staged_changes.cpython-312-x86_64-linux-gnu.so'>
<module 'versioned_hdf5.subchunk_map' from '/home/crusaderky/github/versioned-hdf5/versioned_hdf5/subchunk_map.cpython-312-x86_64-linux-gnu.so'>
/tmp/tmpga9eoz62/data.h5


In [2]:
# Square 2D dataset; ~3 GiB assuming 8-byte integers.
shape = (20_000,) * 2
# 200 x 200 = 40k chunks of ~78 KiB each (again assuming 8-byte integers).
chunks = (100,) * 2
# Alternative layout: 10 x 10 = 100 chunks of ~30 MiB each.
# chunks = (2_000,) * 2

In [3]:
class Benchmark:
    """Lap timer that prints each measurement and appends it to a CSV file.

    Usage: call ``start(label)`` to begin a named benchmark, then ``clock(sublabel)``
    after each step. Every lap is measured from the previous ``start``/``clock``
    call, so consecutive laps are contiguous and add up to the total elapsed time.

    Each CSV row is ``[label, sublabel, seconds]``.

    Parameters
    ----------
    path : str, optional
        Destination CSV file, truncated on creation. Defaults to
        ``"benchmark.csv"`` in the current working directory.
    """

    def __init__(self, path="benchmark.csv"):
        # newline="" is what the csv module requires of files it writes to;
        # without it, platforms that translate "\n" emit an extra "\r".
        self.fh = open(path, "w", newline="")
        self.writer = csv.writer(self.fh)
        print(f"Writing to {path}")
        self.t0 = None
        self.label = None

    def start(self, label):
        """Begin a new benchmark called ``label`` and reset the lap timer."""
        # perf_counter is monotonic and high-resolution; time.time() can jump
        # when the system clock is adjusted, which would corrupt a lap.
        self.t0 = time.perf_counter()
        self.label = label
        print(f"**** {label} ****")

    def clock(self, sublabel):
        """Record the time elapsed since the previous ``start``/``clock`` call.

        Raises
        ------
        RuntimeError
            If ``start`` has not been called yet.
        """
        if self.t0 is None:
            raise RuntimeError("Benchmark.start() must be called before clock()")
        t1 = time.perf_counter()
        self.print(t1 - self.t0, sublabel)
        # The next lap starts where this one ended.
        self.t0 = t1

    def print(self, delta_t, sublabel):
        """Print one measurement and append it to the CSV file."""
        print(f"{delta_t:-10.6f}s {sublabel}")
        self.writer.writerow([self.label, sublabel, delta_t])
        # Flush after every row so partial results survive a crashed run.
        self.fh.flush()

    def close(self):
        """Close the CSV file. Safe to call more than once."""
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


# Single timer instance used by the benchmark cells below.
bench = Benchmark()

Writing to benchmark.csv


In [4]:
# Version counter: later benchmark cells increment it so that each one stages
# a new version name.
i = 0
with h5py.File(path, "w") as f, VersionedHDF5File(f).stage_version(f"r{i}") as sv:
    # Initial version: a resizable 2D dataset filled with 0, 1, 2, ...
    sv.create_dataset(
        "value",
        data=np.arange(np.prod(shape)).reshape(shape),
        chunks=chunks,
        maxshape=(None, None),
    )

In [5]:
i += 1
# The timer starts before the file is opened, so the first lap covers opening
# the file and entering stage_version().
bench.start("stage_version without activity")

with h5py.File(path, "r+") as f, VersionedHDF5File(f).stage_version(f"r{i}") as sv:
    bench.clock("open file")
    dset = sv["value"]
    bench.clock("open dataset")
    # On master the attribute does not exist; the default makes that a no-op.
    getattr(dset, "staged_changes", None)
    bench.clock("build staged_changes")
    dset.data_dict
    bench.clock("build data_dict")

# Measured after both context managers have exited.
bench.clock("commit")

**** stage_version without activity ****
  0.733366s open file
  0.000337s open dataset
  0.172615s build staged_changes
  0.503502s build data_dict
  0.972063s commit


In [6]:
# Same layout as the initial data but every value shifted by one; the first
# chunks[0] rows are dropped so the result fits dset[chunks[0]:].
n_items = np.prod(shape)
shifted = np.arange(1, n_items + 1).reshape(shape)
new_data = shifted[chunks[0] :]

In [7]:
i += 1
label = "__setitem__ of whole chunks not in memory"

with h5py.File(path, "r+") as f, VersionedHDF5File(f).stage_version(f"r{i}") as sv:
    dset = sv["value"]
    bench.start(label)
    # The first chunks[0] rows are deliberately left out: writing to the whole
    # dataset would cause the InMemoryDataset to be dropped and replaced with
    # an InMemoryArrayDataset.
    dset[chunks[0] :] = new_data
    bench.clock("dset[c:] = v")
# Measured after both context managers have exited.
bench.clock("commit")

**** __setitem__ of whole chunks not in memory ****
  0.780353s dset[c:] = v
  9.186669s commit


In [8]:
i += 1
label = "__setitem__ of parts of chunks (target all chunks)"

with h5py.File(path, "r+") as f, VersionedHDF5File(f).stage_version(f"r{i}") as sv:
    dset = sv["value"]
    bench.start(label)
    # Stepping by the chunk size on both axes writes one element per chunk.
    dset[:: chunks[0], :: chunks[1]] = 12
    bench.clock("dset[::c, ::c] = v")
# Measured after both context managers have exited.
bench.clock("commit")

**** __setitem__ of parts of chunks (target all chunks) ****
  0.982122s dset[::c, ::c] = v
  9.406580s commit


In [9]:
i += 1
label = "setitem of parts of chunks (target every other chunk on outer axis)"

with h5py.File(path, "r+") as f, VersionedHDF5File(f).stage_version(f"r{i}") as sv:
    dset = sv["value"]
    bench.start(label)
    # Twice the chunk size on axis 0 skips every other row of chunks.
    dset[:: chunks[0] * 2, :: chunks[1]] = 34
    bench.clock("dset[::c*2, ::c] = v")
# Measured after both context managers have exited.
bench.clock("commit")

**** setitem of parts of chunks (target every other chunk on outer axis) ****
  0.634756s dset[::c*2, ::c] = v
  6.457476s commit


In [10]:
i += 1
label = "setitem of parts of chunks (target every other chunk on inner axis)"

with h5py.File(path, "r+") as f, VersionedHDF5File(f).stage_version(f"r{i}") as sv:
    dset = sv["value"]
    bench.start(label)
    # Twice the chunk size on axis 1 skips every other column of chunks.
    dset[:: chunks[0], :: chunks[1] * 2] = 56
    bench.clock("dset[::c, ::c*2] = v")
# Measured after both context managers have exited.
bench.clock("commit")

**** setitem of parts of chunks (target every other chunk on inner axis) ****
  0.608711s dset[::c, ::c*2] = v
  6.214005s commit


In [11]:
i += 1
label = "setitem of parts of chunks (checkers pattern)"

with h5py.File(path, "r+") as f, VersionedHDF5File(f).stage_version(f"r{i}") as sv:
    dset = sv["value"]
    bench.start(label)
    # Twice the chunk size on both axes: one element in every other chunk
    # along each axis.
    dset[:: chunks[0] * 2, :: chunks[1] * 2] = 78
    bench.clock("dset[::c*2, ::c*2] = v")
# Measured after both context managers have exited.
bench.clock("commit")

**** setitem of parts of chunks (checkers pattern) ****
  0.384542s dset[::c*2, ::c*2] = v
  4.323836s commit


In [12]:
i += 1
label = "getitem with no uncommitted changes, step=1"

with h5py.File(path, "r+") as f, VersionedHDF5File(f).stage_version(f"r{i}") as sv:
    dset = sv["value"]
    bench.start(label)
    # Read the entire dataset; the result is discarded.
    dset[()]
    bench.clock("dset[()]")
# Measured after both context managers have exited.
bench.clock("commit")

**** getitem with no uncommitted changes, step=1 ****
  1.292948s dset[()]
  0.786354s commit


In [13]:
i += 1
label = "getitem after a single chunk has been changed, step=1"

with h5py.File(path, "r+") as f, VersionedHDF5File(f).stage_version(f"r{i}") as sv:
    dset = sv["value"]
    bench.start(label)
    # Modify a single element first, so the read below happens with one
    # uncommitted change.
    dset[0, 0] = 90
    bench.clock("dset[0, 0] = v")
    # Read the entire dataset; the result is discarded.
    dset[()]
    bench.clock("dset[()]")
# Measured after both context managers have exited.
bench.clock("commit")

**** getitem after a single chunk has been changed, step=1 ****
  0.185331s dset[0, 0] = v
  1.120037s dset[()]
  2.604008s commit


In [14]:
i += 1
label = "getitem with no uncommitted changes, step=2 rows"

with h5py.File(path, "r+") as f, VersionedHDF5File(f).stage_version(f"r{i}") as sv:
    dset = sv["value"]
    bench.start(label)
    # Read every other row; the result is discarded.
    dset[::2]
    bench.clock("dset[::2]")
# Measured after both context managers have exited.
bench.clock("commit")

**** getitem with no uncommitted changes, step=2 rows ****
 10.178604s dset[::2]
  0.716204s commit


In [15]:
i += 1
label = "getitem with no uncommitted changes, step=2 columns"

with h5py.File(path, "r+") as f, VersionedHDF5File(f).stage_version(f"r{i}") as sv:
    dset = sv["value"]
    bench.start(label)
    # Read every other column; the result is discarded.
    dset[:, ::2]
    bench.clock("dset[:, ::2]")
# Measured after both context managers have exited.
bench.clock("commit")

**** getitem with no uncommitted changes, step=2 columns ****
 11.087460s dset[:, ::2]
  0.821244s commit


In [16]:
i += 1
label = "getitem after a single chunk has been changed, step=2 rows"

with h5py.File(path, "r+") as f, VersionedHDF5File(f).stage_version(f"r{i}") as sv:
    dset = sv["value"]
    bench.start(label)
    # Modify a single element first, so the read below happens with one
    # uncommitted change.
    dset[0, 0] = 90
    bench.clock("dset[0, 0] = v")
    # Read every other row; the result is discarded.
    dset[::2]
    bench.clock("dset[::2]")
# Measured after both context managers have exited.
bench.clock("commit")

**** getitem after a single chunk has been changed, step=2 rows ****
  0.176433s dset[0, 0] = v
  9.712827s dset[::2]
  2.572764s commit


In [17]:
i += 1
label = "getitem after a single chunk has been changed, step=2 columns"

with h5py.File(path, "r+") as f, VersionedHDF5File(f).stage_version(f"r{i}") as sv:
    dset = sv["value"]
    bench.start(label)
    # Modify a single element first, so the read below happens with one
    # uncommitted change.
    dset[0, 0] = 90
    bench.clock("dset[0, 0] = v")
    # Read every other column; the result is discarded.
    dset[:, ::2]
    bench.clock("dset[:, ::2]")
# Measured after both context managers have exited.
bench.clock("commit")

**** getitem after a single chunk has been changed, step=2 columns ****
  0.172729s dset[0, 0] = v
 10.832580s dset[:, ::2]
  2.566417s commit


In [18]:
label = "resize to add 1 chunk"
i += 1

with h5py.File(path, "r+") as f:
    vf = VersionedHDF5File(f)
    with vf.stage_version(f"r{i}") as sv:
        dset = sv["value"]
        bench.start(label)
        # Grow the outer axis by a single row and leave the inner axis at its
        # own length. The inner axis must come from shape[1]: using shape[0]
        # only works while the configured shape is square.
        # With the configured shape, axis 0 is an exact multiple of chunks[0],
        # so the extra row cannot fit in the existing chunks.
        dset.resize((shape[0] + 1, shape[1]))
        bench.clock("dset.resize((d + 1, d))")
# Measured after both context managers have exited.
bench.clock("commit")

**** resize to add 1 chunk ****
  0.172323s dset.resize((d + 1, d))
  2.532207s commit
