# I/O Performance for fixed array sizes

```python
testnames = ["test_large_fraction_changes_sparse", 
             "test_mostly_appends_sparse", 
             "test_small_fraction_changes_sparse", 
             "test_mostly_appends_dense"]
```

In [1]:
path = "/home/melissa/projects/versioned-hdf5"

In [2]:
%matplotlib inline
import os
import sys
sys.path.append('..')
import time
import numpy as np
import matplotlib.pyplot as plt
import h5py
import datetime
from versioned_hdf5 import VersionedHDF5File
from generate_data_deterministic import TestVersionedDatasetPerformance

# Test 1: Large Fraction Changes (Sparse)

## Creating files

In [27]:
testname = "test_large_fraction_changes_sparse"
num_transactions = [50, 100, 500, 1000]
num_rows_initial = 5000

Create new file with `num_transactions` versions:

In [25]:
def time_create(n):
    t0 = time.time()
    TestVersionedDatasetPerformance().test_large_fraction_changes_sparse(n)
    t = time.time()-t0
    return t

In [26]:
for n in num_transactions:
    print(time_create(n))

1.3908498287200928
2.432126522064209
13.553502798080444
38.50507092475891


In [41]:
tests = []
for n in num_transactions:
    filename = f"{testname}_{n}.h5"
    h5pyfile = h5py.File(filename, 'r+')
    vfile = VersionedHDF5File(h5pyfile)
    tests.append(dict(num_transactions=n, filename=filename, h5pyfile=h5pyfile, vfile=vfile))

## Adding new version to existing set

In [34]:
def add_version(vfile):
    tt = datetime.datetime.utcnow()
    with vfile.stage_version(str(tt)) as group:
        key0 = group.create_dataset(testname + '/key0', data=np.random.rand(num_rows_initial), dtype=(np.dtype('int64')))
        key1 = group.create_dataset(testname + '/key1', data=np.random.rand(num_rows_initial), dtype=(np.dtype('int64')))
        val = group.create_dataset(testname + '/val', data=np.random.rand(num_rows_initial), dtype=(np.dtype('float64')))

In [42]:
for test in tests:
    t0 = time.time()
    add_version(test['vfile'])
    t = time.time()-t0
    print(t)

0.15346932411193848
0.042232513427734375
0.034436702728271484
0.04387378692626953


In [43]:
for test in tests:
    test['h5pyfile'].close()

## Reading in sequential mode

In [45]:
for test in tests:
    test['h5pyfile'] = h5py.File(filename, 'r')
    test['vfile'] = VersionedHDF5File(test['h5pyfile'])

In [46]:
def read_data(vfile):
    for vname in vfile._versions:
        if vname != '__first_version__':
            version = vfile[vname]
            group_key = list(version.keys())[0]
            val = version[group_key]['val']

In [48]:
for test in tests:
    t0 = time.time()
    read_data(test['vfile'])
    t = time.time()-t0
    print(t)

1.7396488189697266
1.846527338027954
1.6409974098205566
1.8354008197784424


## Reading specific version

In [49]:
def read_version(vfile):
    N = len(vfile._versions.keys())
    index = np.random.randint(0, N)
    vname = list(vfile._versions.keys())[index]
    return vname

In [54]:
for test in tests:
    vname = read_version(test['vfile'])
    if vname != '__first_version__':
        t0 = time.time()
        version = test['vfile'][vname]
        group_key = list(version.keys())[0]
        val = version[group_key]['val']
        t = time.time()-t0
        print(t)

0.0024514198303222656
0.0020580291748046875
0.008698225021362305
0.0025420188903808594


## Finishing up

In [55]:
for test in tests:
    test['h5pyfile'].close()

# Test 2: Mostly appends (Sparse)

## Creating files

In [11]:
testname = "test_mostly_appends_sparse"
num_transactions = [50, 100, 200]
num_rows_initial = 1000

Create new file with `num_transactions` versions:

In [12]:
def time_create(n):
    t0 = time.time()
    TestVersionedDatasetPerformance().test_mostly_appends_sparse(n)
    t = time.time()-t0
    return t

In [13]:
for n in num_transactions:
    print(time_create(n))

2.606062889099121
8.52845549583435
29.528016805648804


In [14]:
tests = []
for n in num_transactions:
    filename = f"{testname}_{n}.h5"
    h5pyfile = h5py.File(filename, 'r+')
    vfile = VersionedHDF5File(h5pyfile)
    tests.append(dict(num_transactions=n, filename=filename, h5pyfile=h5pyfile, vfile=vfile))

## Adding new version to existing set

In [15]:
def add_version(vfile):
    tt = datetime.datetime.utcnow()
    with vfile.stage_version(str(tt)) as group:
        key0 = group.create_dataset(testname + '/key0', data=np.random.rand(num_rows_initial), dtype=(np.dtype('int64')))
        key1 = group.create_dataset(testname + '/key1', data=np.random.rand(num_rows_initial), dtype=(np.dtype('int64')))
        val = group.create_dataset(testname + '/val', data=np.random.rand(num_rows_initial), dtype=(np.dtype('float64')))

In [16]:
for test in tests:
    t0 = time.time()
    add_version(test['vfile'])
    t = time.time()-t0
    print(t)

0.16510009765625
0.07856416702270508
0.09362387657165527


In [17]:
for test in tests:
    test['h5pyfile'].close()

## Reading in sequential mode

In [18]:
for test in tests:
    test['h5pyfile'] = h5py.File(filename, 'r')
    test['vfile'] = VersionedHDF5File(test['h5pyfile'])

In [19]:
def read_data(vfile):
    for vname in vfile._versions:
        if vname != '__first_version__':
            version = vfile[vname]
            group_key = list(version.keys())[0]
            val = version[group_key]['val']

In [20]:
for test in tests:
    t0 = time.time()
    read_data(test['vfile'])
    t = time.time()-t0
    print(t)

0.8390614986419678
0.9234764575958252
0.6934041976928711


## Reading specific version

In [21]:
def read_version(vfile):
    N = len(vfile._versions.keys())
    index = np.random.randint(0, N)
    vname = list(vfile._versions.keys())[index]
    return vname

In [22]:
for test in tests:
    vname = read_version(test['vfile'])
    if vname != '__first_version__':
        t0 = time.time()
        version = test['vfile'][vname]
        group_key = list(version.keys())[0]
        val = version[group_key]['val']
        t = time.time()-t0
        print(t)

0.0016345977783203125
0.0026993751525878906
0.005071878433227539


## Finishing up

In [23]:
for test in tests:
    test['h5pyfile'].close()

# Test 3: Small Fraction Changes (Sparse)

## Creating files

In [24]:
testname = "test_small_fraction_changes_sparse"
num_transactions = [50, 100, 500, 1000]
num_rows_initial = 5000

Create new file with `num_transactions` versions:

In [25]:
def time_create(n):
    t0 = time.time()
    TestVersionedDatasetPerformance().test_small_fraction_changes_sparse(n)
    t = time.time()-t0
    return t

In [26]:
for n in num_transactions:
    print(time_create(n))

1.0959360599517822
2.4691689014434814
15.07020616531372
33.29482340812683


In [27]:
tests = []
for n in num_transactions:
    filename = f"{testname}_{n}.h5"
    h5pyfile = h5py.File(filename, 'r+')
    vfile = VersionedHDF5File(h5pyfile)
    tests.append(dict(num_transactions=n, filename=filename, h5pyfile=h5pyfile, vfile=vfile))

## Adding new version to existing set

In [28]:
def add_version(vfile):
    tt = datetime.datetime.utcnow()
    with vfile.stage_version(str(tt)) as group:
        key0 = group.create_dataset(testname + '/key0', data=np.random.rand(num_rows_initial), dtype=(np.dtype('int64')))
        key1 = group.create_dataset(testname + '/key1', data=np.random.rand(num_rows_initial), dtype=(np.dtype('int64')))
        val = group.create_dataset(testname + '/val', data=np.random.rand(num_rows_initial), dtype=(np.dtype('float64')))

In [29]:
for test in tests:
    t0 = time.time()
    add_version(test['vfile'])
    t = time.time()-t0
    print(t)

0.0404512882232666
0.04590344429016113
0.04733729362487793
0.11342334747314453


In [30]:
for test in tests:
    test['h5pyfile'].close()

## Reading in sequential mode

In [31]:
for test in tests:
    test['h5pyfile'] = h5py.File(filename, 'r')
    test['vfile'] = VersionedHDF5File(test['h5pyfile'])

In [32]:
def read_data(vfile):
    for vname in vfile._versions:
        if vname != '__first_version__':
            version = vfile[vname]
            group_key = list(version.keys())[0]
            val = version[group_key]['val']

In [33]:
for test in tests:
    t0 = time.time()
    read_data(test['vfile'])
    t = time.time()-t0
    print(t)

1.8932957649230957
1.8067715167999268
1.5372166633605957
1.9597845077514648


## Reading specific version

In [34]:
def read_version(vfile):
    N = len(vfile._versions.keys())
    index = np.random.randint(0, N)
    vname = list(vfile._versions.keys())[index]
    return vname

In [35]:
for test in tests:
    vname = read_version(test['vfile'])
    if vname != '__first_version__':
        t0 = time.time()
        version = test['vfile'][vname]
        group_key = list(version.keys())[0]
        val = version[group_key]['val']
        t = time.time()-t0
        print(t)

0.001514434814453125
0.0012001991271972656
0.0013587474822998047
0.0014355182647705078


## Finishing up

In [36]:
for test in tests:
    test['h5pyfile'].close()

# Test 4: Mostly appends (Dense)

## Creating files

In [3]:
testname = "test_mostly_appends_dense"
num_transactions = [50, 100, 200]
num_rows_initial = 30

Create new file with `num_transactions` versions:

In [4]:
def time_create(n):
    t0 = time.time()
    TestVersionedDatasetPerformance().test_mostly_appends_dense(n)
    t = time.time()-t0
    return t

In [5]:
for n in num_transactions:
    print(time_create(n))

0.7160508632659912
2.5659000873565674
9.726256608963013


In [6]:
tests = []
for n in num_transactions:
    filename = f"{testname}_{n}.h5"
    h5pyfile = h5py.File(filename, 'r+')
    vfile = VersionedHDF5File(h5pyfile)
    tests.append(dict(num_transactions=n, filename=filename, h5pyfile=h5pyfile, vfile=vfile))

## Adding new version to existing set

In [7]:
def add_version(vfile):
    tt = datetime.datetime.utcnow()
    with vfile.stage_version(str(tt)) as group:
        key0 = group.create_dataset(testname + '/key0', data=np.random.rand(num_rows_initial), dtype=(np.dtype('int64')))
        key1 = group.create_dataset(testname + '/key1', data=np.random.rand(num_rows_initial), dtype=(np.dtype('int64')))
        val = group.create_dataset(testname + '/val', data=np.random.rand(num_rows_initial), dtype=(np.dtype('float64')))

In [8]:
for test in tests:
    t0 = time.time()
    add_version(test['vfile'])
    t = time.time()-t0
    print(t)

0.040671586990356445
0.049040794372558594
0.07277536392211914


In [9]:
for test in tests:
    test['h5pyfile'].close()

## Reading in sequential mode

In [10]:
for test in tests:
    test['h5pyfile'] = h5py.File(filename, 'r')
    test['vfile'] = VersionedHDF5File(test['h5pyfile'])

In [11]:
def read_data(vfile):
    for vname in vfile._versions:
        if vname != '__first_version__':
            version = vfile[vname]
            group_key = list(version.keys())[0]
            val = version[group_key]['val']

In [12]:
for test in tests:
    t0 = time.time()
    read_data(test['vfile'])
    t = time.time()-t0
    print(t)

0.6777133941650391
0.6043143272399902
0.6347177028656006


## Reading specific version

In [13]:
def read_version(vfile):
    N = len(vfile._versions.keys())
    index = np.random.randint(0, N)
    vname = list(vfile._versions.keys())[index]
    return vname

In [14]:
for test in tests:
    vname = read_version(test['vfile'])
    if vname != '__first_version__':
        t0 = time.time()
        version = test['vfile'][vname]
        group_key = list(version.keys())[0]
        val = version[group_key]['val']
        t = time.time()-t0
        print(t)

0.0023703575134277344
0.0009565353393554688
0.003654956817626953


## Finishing up

In [15]:
for test in tests:
    test['h5pyfile'].close()