# End-to-end benchmarks
This notebook can run on master as well as on #370.

See plans_benchmarks.ipynb for performance measurements of the matching plans in isolation.

In [1]:
import csv
import sys
import tempfile
import time

import h5py
import numpy as np
from versioned_hdf5 import VersionedHDF5File

# Report which build of each helper module is currently loaded;
# sys.modules.get() yields None for a module that has not been imported.
for module_name in (
    "versioned_hdf5.hyperspace",
    "versioned_hdf5.staged_changes",
    "versioned_hdf5.subchunk_map",
):
    print(sys.modules.get(module_name))

# Scratch directory holding the benchmark file. Keep `tmpdir` referenced:
# a TemporaryDirectory removes its directory once the object is finalized.
tmpdir = tempfile.TemporaryDirectory()
path = f'{tmpdir.name}/data.h5'
print(path)

<module 'versioned_hdf5.hyperspace' from '/home/crusaderky/github/versioned-hdf5/versioned_hdf5/hyperspace.cpython-312-x86_64-linux-gnu.so'>
<module 'versioned_hdf5.staged_changes' from '/home/crusaderky/github/versioned-hdf5/versioned_hdf5/staged_changes.cpython-312-x86_64-linux-gnu.so'>
<module 'versioned_hdf5.subchunk_map' from '/home/crusaderky/github/versioned-hdf5/versioned_hdf5/subchunk_map.cpython-312-x86_64-linux-gnu.so'>
/tmp/tmp3d1hwn82/data.h5


In [2]:
# Very important flag that completely changes the behaviour of
# StagedChangesArray; read the note in versioned_hdf5.wrappers before
# toggling it. It is set once here, before any benchmark cell runs.
import versioned_hdf5.wrappers
versioned_hdf5.wrappers.USE_VIRTUAL_GETITEM = True

# Geometry of the benchmark dataset. The sizes quoted below assume 8-byte
# elements (the data is generated with np.arange - TODO confirm the dtype
# on your platform).
shape = (20_000, 20_000)  # ~3 GiB
# Alternative layout with many small chunks:
# chunks=(100, 100)  # 40k x 80 kB chunks
chunks=(4000, 4000)  # 25 x 128 MB chunks

In [3]:
class Benchmark:
    """Stopwatch that prints lap times and records them in ``benchmark.csv``.

    Usage: call :meth:`start` to open a labelled section, then :meth:`clock`
    after each step. Every lap is printed and appended to the CSV file as a
    ``[label, sublabel, seconds]`` row. Call :meth:`close` (or use the
    instance as a context manager) when done to release the file handle.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # newline="" is what the csv module requires of the files it writes
        # to; without it, platforms that translate "\n" emit "\r\r\n" rows.
        # Mode "w" truncates any benchmark.csv from a previous run.
        self.fh = open("benchmark.csv", "w", newline="")
        self.writer = csv.writer(self.fh)
        print("Writing to benchmark.csv")
        # Timestamp of the previous lap boundary; None until start() is called.
        self.t0 = None
        # Label of the section currently being timed; None until start().
        self.label = None

    def start(self, label):
        """Open a new section called ``label`` and reset the lap timer."""
        # perf_counter() is monotonic and high-resolution, unlike time.time()
        # which can jump when the system clock is adjusted.
        self.t0 = time.perf_counter()
        self.label = label
        print(f"**** {label} ****")

    def clock(self, sublabel):
        """Record the time elapsed since the previous ``start``/``clock`` call.

        ``start`` must have been called first, otherwise there is no
        reference timestamp to subtract from.
        """
        t1 = time.perf_counter()
        self.print(t1 - self.t0, sublabel)
        # The next lap starts where this one ended.
        self.t0 = t1

    def print(self, delta_t, sublabel):
        """Print one lap of ``delta_t`` seconds and append it to the CSV file."""
        print(f"{delta_t:-10.6f}s {sublabel}")
        self.writer.writerow([self.label, sublabel, delta_t])
        # Flush after every row so partial results survive a crashed run.
        self.fh.flush()

    def close(self):
        """Close the CSV file. Safe to call more than once."""
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Returning False lets any exception from the with-body propagate.
        return False

# Single recorder shared by every benchmark cell below. Instantiating it
# (re)creates benchmark.csv in the current working directory.
bench = Benchmark()

Writing to benchmark.csv


In [4]:
# Version counter: each later cell increments it to obtain a fresh version
# name (r1, r2, ...).
i = 0
with h5py.File(path, 'w') as h5_file:
    versioned_file = VersionedHDF5File(h5_file)
    with versioned_file.stage_version(f'r{i}') as staged_group:
        # Seed version r0 with consecutive integers laid out in `shape`;
        # maxshape=(None, None) keeps both axes resizable.
        staged_group.create_dataset(
            'value',
            data=np.arange(np.prod(shape)).reshape(shape),
            chunks=chunks,
            maxshape=(None, None),
        )

In [5]:
i += 1
bench.start("stage_version without activity")

with h5py.File(path, 'r+') as h5_file:
    versioned_file = VersionedHDF5File(h5_file)
    with versioned_file.stage_version(f'r{i}') as staged_group:
        bench.clock("open file")
        value_dset = staged_group["value"]
        bench.clock("open dataset")
        # Access whichever attribute this branch provides so that the access
        # (presumably a lazy build) falls inside this lap. hasattr() swallows
        # only AttributeError and `or` stops at the first attribute found:
        #   data_dict       -> master
        #   base_data_dict  -> #370+raw_data
        #   neither         -> #370
        hasattr(value_dset, "data_dict") or hasattr(value_dset, "base_data_dict")
        bench.clock("build data_dict")
        try:
            value_dset.staged_changes.chunk_states  # #370
        except AttributeError:
            pass  # master
        bench.clock("build staged_changes")

# This lap covers leaving both with-blocks, i.e. finishing the staged
# version and closing the file.
bench.clock("commit")

**** stage_version without activity ****
  0.002685s open file
  0.000074s open dataset
  0.000023s build data_dict
  0.000036s build staged_changes
  0.001709s commit


In [6]:
# Replacement values for every row from index chunks[0] onwards: the
# sequence starts at 1 instead of 0, so each element is the initial dataset
# value plus one. Note that the full-size array is built first and then sliced.
new_data = np.arange(1, np.prod(shape) + 1).reshape(shape)[chunks[0]:]

In [7]:
i += 1

with h5py.File(path, 'r+') as h5_file:
    versioned_file = VersionedHDF5File(h5_file)
    with versioned_file.stage_version(f'r{i}') as staged_group:
        value_dset = staged_group["value"]
        # The timer starts only once the dataset is open, so the first lap
        # covers the assignment alone.
        bench.start("__setitem__ of whole chunks not in memory")
        # Note: writing to the whole dataset would cause the InMemoryDataset
        # to be dropped and replaced with a InMemoryArrayDataset
        value_dset[chunks[0]:] = new_data
        bench.clock("dset[c:] = v")
# Lap for leaving both with-blocks (finishing the version, closing the file).
bench.clock("commit")

**** __setitem__ of whole chunks not in memory ****
  0.000757s dset[c:] = v
  0.001839s commit


In [8]:
i += 1

with h5py.File(path, 'r+') as h5_file:
    versioned_file = VersionedHDF5File(h5_file)
    with versioned_file.stage_version(f'r{i}') as staged_group:
        value_dset = staged_group["value"]
        bench.start("__setitem__ of parts of chunks (target all chunks)")
        # Stepping by the chunk size on both axes writes the first element
        # of every chunk.
        value_dset[::chunks[0], ::chunks[1]] = 12
        bench.clock("dset[::c, ::c] = v")
# Lap for leaving both with-blocks (finishing the version, closing the file).
bench.clock("commit")

**** __setitem__ of parts of chunks (target all chunks) ****
  1.667870s dset[::c, ::c] = v
 10.050791s commit


In [9]:
i += 1

with h5py.File(path, 'r+') as h5_file:
    versioned_file = VersionedHDF5File(h5_file)
    with versioned_file.stage_version(f'r{i}') as staged_group:
        value_dset = staged_group["value"]
        bench.start("setitem of parts of chunks (target every other chunk on outer axis)")
        # Twice the chunk size on axis 0 skips every other row of chunks;
        # axis 1 still hits every chunk.
        value_dset[::chunks[0] * 2, ::chunks[1]] = 34
        bench.clock("dset[::c*2, ::c] = v")
# Lap for leaving both with-blocks (finishing the version, closing the file).
bench.clock("commit")

**** setitem of parts of chunks (target every other chunk on outer axis) ****
  0.996491s dset[::c*2, ::c] = v
  6.026230s commit


In [10]:
i += 1

with h5py.File(path, 'r+') as h5_file:
    versioned_file = VersionedHDF5File(h5_file)
    with versioned_file.stage_version(f'r{i}') as staged_group:
        value_dset = staged_group["value"]
        bench.start("setitem of parts of chunks (target every other chunk on inner axis)")
        # Twice the chunk size on axis 1 skips every other column of chunks;
        # axis 0 still hits every chunk.
        value_dset[::chunks[0], ::chunks[1] * 2] = 56
        bench.clock("dset[::c, ::c*2] = v")
# Lap for leaving both with-blocks (finishing the version, closing the file).
bench.clock("commit")

**** setitem of parts of chunks (target every other chunk on inner axis) ****
  0.893804s dset[::c, ::c*2] = v
  6.306693s commit


In [11]:
i += 1

with h5py.File(path, 'r+') as h5_file:
    versioned_file = VersionedHDF5File(h5_file)
    with versioned_file.stage_version(f'r{i}') as staged_group:
        value_dset = staged_group["value"]
        bench.start("setitem of parts of chunks (checkers pattern)")
        # Twice the chunk size on both axes: only every other chunk along
        # each axis receives a write.
        value_dset[::chunks[0] * 2, ::chunks[1] * 2] = 78
        bench.clock("dset[::c*2, ::c*2] = v")
# Lap for leaving both with-blocks (finishing the version, closing the file).
bench.clock("commit")

**** setitem of parts of chunks (checkers pattern) ****
  0.542705s dset[::c*2, ::c*2] = v
  3.450725s commit


In [12]:
i += 1

with h5py.File(path, 'r+') as h5_file:
    versioned_file = VersionedHDF5File(h5_file)
    with versioned_file.stage_version(f'r{i}') as staged_group:
        value_dset = staged_group["value"]
        bench.start("getitem with no uncommitted changes")
        # Read the entire dataset; the result is discarded, only the time
        # taken matters.
        value_dset[()]
        bench.clock("dset[()]")
# Lap for leaving both with-blocks (finishing the version, closing the file).
bench.clock("commit")

**** getitem with no uncommitted changes ****
  1.888866s dset[()]
  0.002287s commit


In [13]:
i += 1

with h5py.File(path, 'r+') as h5_file:
    versioned_file = VersionedHDF5File(h5_file)
    with versioned_file.stage_version(f'r{i}') as staged_group:
        value_dset = staged_group["value"]
        bench.start("getitem after a single chunk within the selection has been changed")
        # Modify one element in the middle of the dataset ...
        value_dset[shape[0] // 2, shape[1] // 2] = 90
        bench.clock("dset[i, j] = v")
        # ... then read a selection (rows chunks[0] onwards) that contains it.
        value_dset[chunks[0]:, :]
        bench.clock("dset[c:,:]")
# Lap for leaving both with-blocks (finishing the version, closing the file).
bench.clock("commit")

**** getitem after a single chunk within the selection has been changed ****
  0.062790s dset[i, j] = v
  1.847945s dset[c:,:]
  0.401704s commit


In [14]:
i += 1

with h5py.File(path, 'r+') as h5_file:
    versioned_file = VersionedHDF5File(h5_file)
    with versioned_file.stage_version(f'r{i}') as staged_group:
        value_dset = staged_group["value"]
        bench.start("getitem after a single chunk outside of the selection has been changed")
        # Modify the very first element, which lies in row 0 ...
        value_dset[0, 0] = 4
        bench.clock("dset[0, 0] = v")
        # ... then read rows chunks[0] onwards, which do not include row 0.
        value_dset[chunks[0]:, :]
        bench.clock("dset[c:,:]")
# Lap for leaving both with-blocks (finishing the version, closing the file).
bench.clock("commit")

**** getitem after a single chunk outside of the selection has been changed ****
  0.065654s dset[0, 0] = v
  1.532984s dset[c:,:]
  0.490781s commit


In [15]:
label = "resize to add 1 chunk"
i += 1

with h5py.File(path, 'r+') as f:
    vf = VersionedHDF5File(f)
    with vf.stage_version(f'r{i}') as sv:
        dset = sv["value"]
        bench.start(label)
        # Grow axis 0 by a single row and leave axis 1 untouched. The second
        # entry must be shape[1]; using shape[0] there only worked because
        # `shape` happens to be square.
        dset.resize((shape[0] + 1, shape[1]))
        bench.clock("dset.resize((d + 1, d))")
# Lap for leaving both with-blocks (finishing the version, closing the file).
bench.clock("commit")

**** resize to add 1 chunk ****
  0.000237s dset.resize((d + 1, d))
  0.006112s commit
