In [1]:
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc
import h5py
from src.benchmarkers import *
from src.benchmarkersV2 import *
import time
from matplotlib import pyplot as plt
# Seed NumPy's legacy global RNG so np.random draws repeat on a fresh kernel run.
# NOTE(review): this only matters if the star-imported helpers draw from np.random — confirm.
np.random.seed(0)

In [2]:
def is_leaf(name, node):
    """Tell whether ``node`` is an ``h5py.Dataset`` instance.

    Args:
        name: Name of the visited node. Accepted so the function fits a
            ``(name, node)`` callback signature; it is not inspected.
        node: Object to test.

    Returns:
        bool: True when ``node`` is an ``h5py.Dataset``, False otherwise.
    """
    node_is_dataset = isinstance(node, h5py.Dataset)
    return node_is_dataset

def find_leaf_datasets(hdf5_file):
    """Collect the names of all datasets found while walking ``hdf5_file``.

    Args:
        hdf5_file: Object exposing ``visititems(callback)``, where the
            callback is invoked as ``callback(name, node)``.

    Returns:
        list: The ``name`` of every visited node accepted by ``is_leaf``,
        in the order the nodes were visited.
    """
    dataset_names = []

    def _collect(name, node):
        # Falls through without a return value (None) on every path,
        # exactly like the lambda this replaces.
        if is_leaf(name, node):
            dataset_names.append(name)

    hdf5_file.visititems(_collect)
    return dataset_names


In [2]:
# Dataset sizes: each entry is passed to create_dataset, and the whole list to
# every benchmark_* call.
# NOTE(review): the unit of a "size" is decided inside Generator — confirm.
# For a quick trial run, temporarily shrink this to a single entry such as [10].
N = [10, 100, 200, 300, 500]
# Passed to every benchmark_* call as its iteration count.
iterations = 100

In [6]:
# Generator is not defined in this notebook; presumably it comes from one of the
# `from src.benchmarkers... import *` lines — TODO confirm and import it by name.
generator = Generator()

In [7]:
# For every size in N, call create_dataset, then create_arrow_file and
# create_arrow_stream, all with the same per-size path.
for item in N:
    ds_path = f"outputs/v2/column_wise/ds_{item}"
    generator.create_dataset(item, ds_path)
    generator.create_arrow_file(ds_path)
    generator.create_arrow_stream(ds_path)
    

In [3]:
# Run every column-wise benchmark over all sizes in N, using a fresh
# ClockColumnWise instance per run.
# NOTE(review): the path below has no size suffix, while the generation loop
# wrote ds_<size>; presumably each benchmark_* call appends the size — confirm.
DS_STEM = "outputs/v2/column_wise/ds"

# Arrow: memory flag on, file then stream.
arrow_file_memory = ClockColumnWise().benchmark_arrow(
    DS_STEM, N, iterations, memory=True, stream=False
)
arrow_stream_memory = ClockColumnWise().benchmark_arrow(
    DS_STEM, N, iterations, memory=True, stream=True
)

# Arrow: memory flag off, file then stream.
arrow_file_no_memory = ClockColumnWise().benchmark_arrow(
    DS_STEM, N, iterations, memory=False, stream=False
)
arrow_stream_no_memory = ClockColumnWise().benchmark_arrow(
    DS_STEM, N, iterations, memory=False, stream=True
)

# Parquet.
arrow_parquet = ClockColumnWise().benchmark_parquet(DS_STEM, N, iterations)

# HDF5 with the "core" and "sec2" drivers.
hdf5_core = ClockColumnWise().benchmark_hdf5(
    DS_STEM, N, iterations, hdf5_driver="core"
)
hdf5_sec2 = ClockColumnWise().benchmark_hdf5(
    DS_STEM, N, iterations, hdf5_driver="sec2"
)

100%|██████████| 100/100 [00:01<00:00, 61.31it/s]
100%|██████████| 100/100 [00:16<00:00,  6.20it/s]
100%|██████████| 100/100 [00:37<00:00,  2.67it/s]
100%|██████████| 100/100 [01:01<00:00,  1.64it/s]
100%|██████████| 100/100 [01:39<00:00,  1.01it/s]
100%|██████████| 100/100 [00:01<00:00, 53.38it/s]
100%|██████████| 100/100 [00:19<00:00,  5.17it/s]
100%|██████████| 100/100 [00:41<00:00,  2.42it/s]
100%|██████████| 100/100 [01:00<00:00,  1.64it/s]
100%|██████████| 100/100 [01:29<00:00,  1.12it/s]
100%|██████████| 100/100 [00:02<00:00, 48.70it/s]
100%|██████████| 100/100 [00:25<00:00,  3.96it/s]
100%|██████████| 100/100 [00:51<00:00,  1.96it/s]
100%|██████████| 100/100 [01:15<00:00,  1.32it/s]
100%|██████████| 100/100 [01:58<00:00,  1.19s/it]
100%|██████████| 100/100 [00:02<00:00, 45.07it/s]
100%|██████████| 100/100 [00:25<00:00,  3.99it/s]
100%|██████████| 100/100 [00:55<00:00,  1.80it/s]
100%|██████████| 100/100 [01:24<00:00,  1.18it/s]
100%|██████████| 100/100 [02:28<00:00,  1.48s/it]


In [6]:
# Copy the t_load and t_manipulate attributes of every benchmark result into
# flat names, grouped per benchmark result.
# NOTE(review): the contents of t_load / t_manipulate (shape, units) are
# defined by ClockColumnWise — confirm before interpreting them.

# Arrow file, memory=True
t_load_arrow_file_memory = arrow_file_memory.t_load
t_manipulate_arrow_file_memory = arrow_file_memory.t_manipulate

# Arrow stream, memory=True
t_load_arrow_stream_memory = arrow_stream_memory.t_load
t_manipulate_arrow_stream_memory = arrow_stream_memory.t_manipulate

# Arrow file, memory=False
t_load_arrow_file_no_memory = arrow_file_no_memory.t_load
t_manipulate_arrow_file_no_memory = arrow_file_no_memory.t_manipulate

# Arrow stream, memory=False
t_load_arrow_stream_no_memory = arrow_stream_no_memory.t_load
t_manipulate_arrow_stream_no_memory = arrow_stream_no_memory.t_manipulate

# Parquet
t_load_arrow_parquet = arrow_parquet.t_load
t_manipulate_arrow_parquet = arrow_parquet.t_manipulate

# HDF5, "core" driver
t_load_hdf5_core = hdf5_core.t_load
t_manipulate_hdf5_core = hdf5_core.t_manipulate

# HDF5, "sec2" driver
t_load_hdf5_sec2 = hdf5_sec2.t_load
t_manipulate_hdf5_sec2 = hdf5_sec2.t_manipulate

In [7]:
# Write every timing result to its own file, named after its variable.
# The names are relative, so the files land in the current working directory;
# np.save adds the ".npy" extension because the names do not carry one.
timings_to_save = (
    ("t_load_arrow_file_memory", t_load_arrow_file_memory),
    ("t_load_arrow_stream_memory", t_load_arrow_stream_memory),
    ("t_load_arrow_file_no_memory", t_load_arrow_file_no_memory),
    ("t_load_arrow_stream_no_memory", t_load_arrow_stream_no_memory),
    ("t_load_arrow_parquet", t_load_arrow_parquet),
    ("t_load_hdf5_core", t_load_hdf5_core),
    ("t_load_hdf5_sec2", t_load_hdf5_sec2),
    ("t_manipulate_arrow_file_memory", t_manipulate_arrow_file_memory),
    ("t_manipulate_arrow_stream_memory", t_manipulate_arrow_stream_memory),
    ("t_manipulate_arrow_file_no_memory", t_manipulate_arrow_file_no_memory),
    ("t_manipulate_arrow_stream_no_memory", t_manipulate_arrow_stream_no_memory),
    ("t_manipulate_arrow_parquet", t_manipulate_arrow_parquet),
    ("t_manipulate_hdf5_core", t_manipulate_hdf5_core),
    ("t_manipulate_hdf5_sec2", t_manipulate_hdf5_sec2),
)
for file_stem, timing in timings_to_save:
    np.save(file_stem, timing)