In [None]:
# Cell 1: Header and Setup
import sys
# Put the parent directory first on the module search path. Presumably this is
# where the shared `utils` package imported below lives -- verify against the
# repository layout if the imports fail.
sys.path.insert(0, '..')

from utils.notebook_utils import display_header, display_toc, check_dependency, conclusion_box, info_box
from utils.system_info import display_system_info
from utils.benchmark import Benchmark, BenchmarkResult, ComparisonTable
from utils.charts import setup_style, bar_comparison, throughput_comparison, memory_comparison, COLORS

# Show the notebook header; the two arguments look like a title and a subtitle.
display_header('ML Data Loading Comparison', 'SynaDB vs HDF5 vs TFRecord')

In [None]:
# Cell 2: Table of Contents
# Section title -> anchor id, in reading order. Every anchor matches the
# <a id="..."> tag on the corresponding markdown heading in this notebook.
toc_anchor_by_title = {
    'Introduction': 'introduction',
    'Setup': 'setup',
    'Benchmark: Sequential Load': 'benchmark-sequential',
    'Benchmark: Random Access': 'benchmark-random',
    'Benchmark: PyTorch DataLoader': 'benchmark-dataloader',
    'Tensor Extraction Comparison': 'tensor-extraction',
    'Schema Flexibility Demo': 'schema-flexibility',
    'Results Summary': 'results',
    'Conclusions': 'conclusions',
}
# display_toc is handed a list of (title, anchor) tuples.
sections = list(toc_anchor_by_title.items())
display_toc(sections)

## 📌 Introduction <a id="introduction"></a>

This notebook compares **SynaDB's data loading capabilities** against two popular ML data formats:

| System | Type | Key Features |
|--------|------|-------------|
| **SynaDB** | Embedded DB | Single-file, schema-free, native tensor extraction |
| **HDF5** | File Format | Hierarchical, chunked, widely used in scientific computing |
| **TFRecord** | File Format | TensorFlow native, sequential access optimized |

### What We'll Measure

- **Sequential load time** (full dataset read)
- **Random access latency** (single sample retrieval)
- **PyTorch DataLoader throughput** (training iteration speed)
- **Tensor extraction** (direct numpy array access)
- **Schema flexibility** (adding new data types)

### Test Configuration

- **Dataset**: MNIST-like synthetic data (60,000 images)
- **Image size**: 28x28 grayscale
- **Labels**: 10 classes
- **Batch size**: 64 for DataLoader tests

In [None]:
# Cell 4: System Info
# Report the machine the benchmarks run on so the timings below can be read in
# context (what exactly is shown is decided by utils.system_info).
display_system_info()

## 🔧 Setup <a id="setup"></a>

Let's set up our test environment with MNIST-like synthetic data in all three formats.

In [None]:
# Cell 6: Check Dependencies and Imports
import numpy as np
import time
import os
import shutil
import tempfile
from pathlib import Path
import matplotlib.pyplot as plt

# Probe the optional backends in a fixed order: SynaDB, HDF5 (via h5py),
# TensorFlow (needed for TFRecord) and PyTorch. Each HAS_* flag gates the
# matching benchmark cells further down.
HAS_SYNADB, HAS_H5PY, HAS_TF, HAS_TORCH = (
    check_dependency(package, f'pip install {package}')
    for package in ('synadb', 'h5py', 'tensorflow', 'torch')
)

# Apply consistent styling
setup_style()

In [None]:
# Cell 7: Generate MNIST-like Synthetic Data
NUM_SAMPLES = 60_000
IMAGE_HEIGHT = IMAGE_WIDTH = 28
NUM_CLASSES = 10
SEED = 42

print(f'Generating {NUM_SAMPLES:,} synthetic MNIST-like samples...')
np.random.seed(SEED)

# Images are drawn before labels so the seeded random stream is consumed in a
# fixed order. Pixels are uniform uint8 values in [0, 255]; labels are int64
# class ids in [0, NUM_CLASSES).
image_stack_shape = (NUM_SAMPLES, IMAGE_HEIGHT, IMAGE_WIDTH)
images = np.random.randint(0, 256, size=image_stack_shape, dtype=np.uint8)
labels = np.random.randint(0, NUM_CLASSES, size=NUM_SAMPLES, dtype=np.int64)

for report_line in (
    f'‚úì Generated {NUM_SAMPLES:,} images',
    f'‚úì Image shape: {images.shape}',
    f'‚úì Labels shape: {labels.shape}',
    f'‚úì Memory usage: {(images.nbytes + labels.nbytes) / 1024 / 1024:.1f} MB',
):
    print(report_line)

In [None]:
# Cell 8: Create Temp Directory for Data Files
temp_dir = tempfile.mkdtemp(prefix='synadb_dataload_')
print(f'Using temp directory: {temp_dir}')

# One data file per storage format, all inside the scratch directory that the
# cleanup cell removes at the end of the notebook.
synadb_path, hdf5_path, tfrecord_path = (
    os.path.join(temp_dir, file_name)
    for file_name in ('mnist_synadb.db', 'mnist.h5', 'mnist.tfrecord')
)

In [None]:
# Cell 9: Save Data to SynaDB
synadb_write_time = None

if not HAS_SYNADB:
    print('‚ö†Ô∏è SynaDB not available, skipping...')
else:
    from synadb import SynaDB
    print('Saving data to SynaDB...')
    write_started = time.perf_counter()
    with SynaDB(synadb_path) as db:
        for sample_idx in range(NUM_SAMPLES):
            # Two keys per sample: raw pixel bytes under image/<i> and the
            # class id under label/<i>. The read benchmarks use the same keys.
            db.put_bytes(f'image/{sample_idx}', images[sample_idx].tobytes())
            db.put_int(f'label/{sample_idx}', int(labels[sample_idx]))
            saved_so_far = sample_idx + 1
            if saved_so_far % 10000 == 0:
                print(f'  Saved {saved_so_far:,} samples...')
    synadb_write_time = time.perf_counter() - write_started
    print(f'‚úì SynaDB: {NUM_SAMPLES:,} samples in {synadb_write_time:.2f}s')

In [None]:
# Cell 10: Save Data to HDF5
hdf5_write_time = None

if HAS_H5PY:
    import h5py
    print('Saving data to HDF5...')
    # Chunks hold up to 1000 samples. h5py rejects a chunk shape that is larger
    # than the dataset along any axis, so cap the chunk length at the number of
    # samples actually being written. This keeps the cell working when
    # NUM_SAMPLES is lowered below 1000 for a quick run.
    chunk_rows = min(1000, len(images))
    start = time.perf_counter()
    with h5py.File(hdf5_path, 'w') as f:
        f.create_dataset('images', data=images, chunks=(chunk_rows, IMAGE_HEIGHT, IMAGE_WIDTH))
        f.create_dataset('labels', data=labels, chunks=(chunk_rows,))
    hdf5_write_time = time.perf_counter() - start
    print(f'‚úì HDF5: {NUM_SAMPLES:,} samples in {hdf5_write_time:.2f}s')
else:
    print('‚ö†Ô∏è h5py not available, skipping...')

In [None]:
# Cell 11: Save Data to TFRecord
tfrecord_write_time = None

if not HAS_TF:
    print('‚ö†Ô∏è TensorFlow not available, skipping...')
else:
    import tensorflow as tf
    print('Saving data to TFRecord...')
    start = time.perf_counter()

    def _bytes_feature(value):
        """Wrap one bytes value in a tf.train.Feature."""
        wrapped = tf.train.BytesList(value=[value])
        return tf.train.Feature(bytes_list=wrapped)

    def _int64_feature(value):
        """Wrap one integer value in a tf.train.Feature."""
        wrapped = tf.train.Int64List(value=[value])
        return tf.train.Feature(int64_list=wrapped)

    with tf.io.TFRecordWriter(tfrecord_path) as writer:
        for sample_idx in range(NUM_SAMPLES):
            # One serialized Example per sample, carrying the raw pixel bytes
            # and the integer class id.
            record_fields = {
                'image': _bytes_feature(images[sample_idx].tobytes()),
                'label': _int64_feature(int(labels[sample_idx])),
            }
            record = tf.train.Example(features=tf.train.Features(feature=record_fields))
            writer.write(record.SerializeToString())
            saved_so_far = sample_idx + 1
            if saved_so_far % 10000 == 0:
                print(f'  Saved {saved_so_far:,} samples...')
    tfrecord_write_time = time.perf_counter() - start
    print(f'‚úì TFRecord: {NUM_SAMPLES:,} samples in {tfrecord_write_time:.2f}s')

In [None]:
# Cell 12: Write Time Comparison
# Keep only the formats that were actually written; a skipped backend leaves
# its timing at None and is filtered out by the truthiness test.
write_times = {
    system: seconds
    for system, seconds in (
        ('SynaDB', synadb_write_time),
        ('HDF5', hdf5_write_time),
        ('TFRecord', tfrecord_write_time),
    )
    if seconds
}

if not write_times:
    print('No write time results to display.')
else:
    fig = bar_comparison(
        write_times,
        title=f'Write Time ({NUM_SAMPLES:,} samples)',
        ylabel='Time (seconds)',
        lower_is_better=True,
    )
    plt.show()

## ⚡ Benchmark: Sequential Load <a id="benchmark-sequential"></a>

Let's measure how fast each format can load the entire dataset sequentially.

In [None]:
# Cell 14: SynaDB Sequential Load Benchmark
synadb_seq_time = None

if not HAS_SYNADB:
    print('‚ö†Ô∏è SynaDB not available, skipping...')
else:
    from synadb import SynaDB
    print('Benchmarking SynaDB sequential load...')
    # Warm up
    with SynaDB(synadb_path) as db:
        for warm_idx in range(100):
            db.get_bytes(f'image/{warm_idx}')
    # Benchmark: the timed region covers opening the database, reading every
    # image/label pair in index order and decoding the pixels to 2-D arrays.
    start = time.perf_counter()
    loaded_images = []
    loaded_labels = []
    with SynaDB(synadb_path) as db:
        for sample_idx in range(NUM_SAMPLES):
            raw_pixels = db.get_bytes(f'image/{sample_idx}')
            class_id = db.get_int(f'label/{sample_idx}')
            decoded = np.frombuffer(raw_pixels, dtype=np.uint8).reshape(IMAGE_HEIGHT, IMAGE_WIDTH)
            loaded_images.append(decoded)
            loaded_labels.append(class_id)
    synadb_seq_time = time.perf_counter() - start
    print(f'‚úì SynaDB: {NUM_SAMPLES:,} samples in {synadb_seq_time:.2f}s')
    print(f'  Throughput: {NUM_SAMPLES / synadb_seq_time:,.0f} samples/sec')

In [None]:
# Cell 15: HDF5 Sequential Load Benchmark
hdf5_seq_time = None

if not HAS_H5PY:
    print('‚ö†Ô∏è h5py not available, skipping...')
else:
    import h5py
    print('Benchmarking HDF5 sequential load...')
    # Warm up
    with h5py.File(hdf5_path, 'r') as warm_file:
        warm_file['images'][:100]
    # Benchmark: the timed region covers opening the file and reading both
    # datasets in full with a single slice each.
    start = time.perf_counter()
    with h5py.File(hdf5_path, 'r') as h5_file:
        loaded_images_h5 = h5_file['images'][:]
        loaded_labels_h5 = h5_file['labels'][:]
    hdf5_seq_time = time.perf_counter() - start
    print(f'‚úì HDF5: {NUM_SAMPLES:,} samples in {hdf5_seq_time:.2f}s')
    print(f'  Throughput: {NUM_SAMPLES / hdf5_seq_time:,.0f} samples/sec')

In [None]:
# Cell 16: TFRecord Sequential Load Benchmark
tfrecord_seq_time = None

if not HAS_TF:
    print('‚ö†Ô∏è TensorFlow not available, skipping...')
else:
    import tensorflow as tf
    print('Benchmarking TFRecord sequential load...')

    def parse_tfrecord(example_proto):
        """Decode one serialized Example into an (image, label) pair.

        The image bytes are decoded as uint8 and reshaped to
        [IMAGE_HEIGHT, IMAGE_WIDTH]; the label is returned as parsed.
        """
        schema = {
            'image': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
        }
        fields = tf.io.parse_single_example(example_proto, schema)
        pixels = tf.io.decode_raw(fields['image'], tf.uint8)
        return tf.reshape(pixels, [IMAGE_HEIGHT, IMAGE_WIDTH]), fields['label']

    # Warm up
    dataset = tf.data.TFRecordDataset(tfrecord_path)
    for _warm_record in dataset.take(100):
        pass
    # Benchmark: pipeline construction, parsing and conversion to numpy are
    # all inside the timed region.
    start = time.perf_counter()
    dataset = tf.data.TFRecordDataset(tfrecord_path).map(parse_tfrecord)
    loaded_images_tf = []
    loaded_labels_tf = []
    for image_tensor, label_tensor in dataset:
        loaded_images_tf.append(image_tensor.numpy())
        loaded_labels_tf.append(label_tensor.numpy())
    tfrecord_seq_time = time.perf_counter() - start
    print(f'‚úì TFRecord: {NUM_SAMPLES:,} samples in {tfrecord_seq_time:.2f}s')
    print(f'  Throughput: {NUM_SAMPLES / tfrecord_seq_time:,.0f} samples/sec')

In [None]:
# Cell 17: Sequential Load Results Visualization
# Convert each recorded wall time into samples/second. Backends that were
# skipped (time left at None) are omitted from the chart.
seq_throughput = {
    system: NUM_SAMPLES / elapsed
    for system, elapsed in (
        ('SynaDB', synadb_seq_time),
        ('HDF5', hdf5_seq_time),
        ('TFRecord', tfrecord_seq_time),
    )
    if elapsed
}

if not seq_throughput:
    print('No sequential load results to display.')
else:
    fig = throughput_comparison(
        seq_throughput,
        title='Sequential Load Throughput',
        ylabel='Samples/second',
    )
    plt.show()

## 🎯 Benchmark: Random Access <a id="benchmark-random"></a>

Random access is critical for shuffled training. Let's measure single-sample retrieval latency.

In [None]:
# Cell 19: Random Access Benchmark Setup
NUM_RANDOM_ACCESSES = 1000
# Re-seed right before sampling so both backends are probed with the same
# index sequence on every run of this cell.
np.random.seed(SEED)
random_indices = np.random.randint(low=0, high=NUM_SAMPLES, size=NUM_RANDOM_ACCESSES)
print(f'Testing {NUM_RANDOM_ACCESSES:,} random accesses...')

In [None]:
# Cell 20: SynaDB Random Access Benchmark
synadb_random_times = []

if not HAS_SYNADB:
    print('‚ö†Ô∏è SynaDB not available, skipping...')
else:
    from synadb import SynaDB
    print('Benchmarking SynaDB random access...')
    with SynaDB(synadb_path) as db:
        # Warm up
        warm_key = f'image/{random_indices[0]}'
        for _ in range(10):
            db.get_bytes(warm_key)
        # Benchmark: each sample is timed on its own (image read, label read,
        # decode) and recorded in milliseconds.
        for idx in random_indices:
            tick = time.perf_counter()
            img_bytes = db.get_bytes(f'image/{idx}')
            label = db.get_int(f'label/{idx}')
            img = np.frombuffer(img_bytes, dtype=np.uint8).reshape(IMAGE_HEIGHT, IMAGE_WIDTH)
            synadb_random_times.append((time.perf_counter() - tick) * 1000)
    print(f'‚úì SynaDB: Mean latency {np.mean(synadb_random_times):.3f}ms')
    print(f'  P95 latency: {np.percentile(synadb_random_times, 95):.3f}ms')

In [None]:
# Cell 21: HDF5 Random Access Benchmark
hdf5_random_times = []

if not HAS_H5PY:
    print('‚ö†Ô∏è h5py not available, skipping...')
else:
    import h5py
    print('Benchmarking HDF5 random access...')
    with h5py.File(hdf5_path, 'r') as f:
        # Warm up
        warm_index = random_indices[0]
        for _ in range(10):
            f['images'][warm_index]
        # Benchmark: each sample is timed on its own (image read plus label
        # read) and recorded in milliseconds.
        for idx in random_indices:
            tick = time.perf_counter()
            img = f['images'][idx]
            label = f['labels'][idx]
            hdf5_random_times.append((time.perf_counter() - tick) * 1000)
    print(f'‚úì HDF5: Mean latency {np.mean(hdf5_random_times):.3f}ms')
    print(f'  P95 latency: {np.percentile(hdf5_random_times, 95):.3f}ms')

In [None]:
# Cell 22: TFRecord Random Access Note
# No TFRecord random-access benchmark is run; this box explains why. The first
# argument is the message body and the second is passed as its heading text.
info_box(
    'TFRecord is optimized for sequential access and does not support '
    'efficient random access. This is a key architectural difference - '
    'TFRecord requires reading from the beginning to reach a specific record.',
    'TFRecord Limitation',
)

In [None]:
# Cell 23: Random Access Results Visualization
# Mean per-sample latency in milliseconds for every backend that produced
# measurements; an empty timing list means the backend was skipped.
random_latencies = {
    system: np.mean(latencies_ms)
    for system, latencies_ms in (
        ('SynaDB', synadb_random_times),
        ('HDF5', hdf5_random_times),
    )
    if latencies_ms
}

if not random_latencies:
    print('No random access results to display.')
else:
    fig = bar_comparison(
        random_latencies,
        title='Random Access Latency',
        ylabel='Latency (ms)',
        lower_is_better=True,
    )
    plt.show()

## 🔄 Benchmark: PyTorch DataLoader <a id="benchmark-dataloader"></a>

Let's measure training iteration throughput using PyTorch DataLoader with each format.

In [None]:
# Cell 25: PyTorch DataLoader Setup
BATCH_SIZE = 64
NUM_WORKERS = 0  # Single-threaded for fair comparison
NUM_BATCHES = 100  # Number of batches to measure

# The constants above are defined unconditionally because the results cells
# read them even when PyTorch is missing.
if not HAS_TORCH:
    print('‚ö†Ô∏è PyTorch not available, skipping DataLoader benchmarks...')
else:
    import torch
    from torch.utils.data import Dataset, DataLoader
    print(f'PyTorch DataLoader benchmark: batch_size={BATCH_SIZE}, num_batches={NUM_BATCHES}')

In [None]:
# Cell 26: SynaDB PyTorch Dataset
synadb_dataloader_time = None

if HAS_TORCH and HAS_SYNADB:
    from itertools import islice
    from synadb import SynaDB

    class SynaDBDataset(Dataset):
        """Map-style dataset reading one image/label pair per index from SynaDB.

        Args:
            db_path: Path of the SynaDB file written earlier in the notebook.
            num_samples: Number of samples stored; reported by ``__len__``.
            img_shape: Shape each decoded uint8 image is reshaped to.
        """

        def __init__(self, db_path, num_samples, img_shape):
            self.db_path = db_path
            self.num_samples = num_samples
            self.img_shape = img_shape
            # The handle stays open for the dataset's lifetime; call close().
            self.db = SynaDB(db_path)

        def __len__(self):
            return self.num_samples

        def __getitem__(self, idx):
            """Return (float image tensor, label tensor) for sample ``idx``."""
            img_bytes = self.db.get_bytes(f'image/{idx}')
            label = self.db.get_int(f'label/{idx}')
            img = np.frombuffer(img_bytes, dtype=np.uint8).reshape(self.img_shape)
            # copy() hands torch an array that owns its memory instead of a
            # view over the bytes returned by the database.
            return torch.from_numpy(img.copy()).float(), torch.tensor(label)

        def close(self):
            """Close the underlying database handle."""
            self.db.close()

    print('Benchmarking SynaDB DataLoader...')
    dataset = SynaDBDataset(synadb_path, NUM_SAMPLES, (IMAGE_HEIGHT, IMAGE_WIDTH))
    try:
        dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)

        # Warm up
        for _warm_batch in islice(dataloader, 5):
            pass

        # Benchmark. islice stops after exactly NUM_BATCHES batches. The
        # previous enumerate/break loop fetched one extra batch before
        # breaking, so the timed region covered NUM_BATCHES + 1 batches while
        # the throughput below divides by NUM_BATCHES.
        start = time.perf_counter()
        for imgs, lbls in islice(dataloader, NUM_BATCHES):
            pass
        synadb_dataloader_time = time.perf_counter() - start
    finally:
        # Release the database handle even if loading fails part-way.
        dataset.close()

    print(f'‚úì SynaDB DataLoader: {NUM_BATCHES} batches in {synadb_dataloader_time:.2f}s')
    print(f'  Throughput: {NUM_BATCHES * BATCH_SIZE / synadb_dataloader_time:,.0f} samples/sec')

In [None]:
# Cell 27: HDF5 PyTorch Dataset
hdf5_dataloader_time = None

if HAS_TORCH and HAS_H5PY:
    from itertools import islice
    import h5py

    class HDF5Dataset(Dataset):
        """Map-style dataset reading one image/label pair per index from HDF5.

        Args:
            h5_path: Path of the HDF5 file holding the ``images`` and
                ``labels`` datasets written earlier in the notebook.
        """

        def __init__(self, h5_path):
            self.h5_path = h5_path
            # The file stays open for the dataset's lifetime; call close().
            self.file = h5py.File(h5_path, 'r')
            self.images = self.file['images']
            self.labels = self.file['labels']

        def __len__(self):
            return len(self.images)

        def __getitem__(self, idx):
            """Return (float image tensor, label tensor) for sample ``idx``."""
            img = self.images[idx]
            label = self.labels[idx]
            return torch.from_numpy(img.copy()).float(), torch.tensor(label)

        def close(self):
            """Close the underlying HDF5 file."""
            self.file.close()

    print('Benchmarking HDF5 DataLoader...')
    dataset = HDF5Dataset(hdf5_path)
    try:
        dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)

        # Warm up
        for _warm_batch in islice(dataloader, 5):
            pass

        # Benchmark. islice stops after exactly NUM_BATCHES batches. The
        # previous enumerate/break loop fetched one extra batch before
        # breaking, so the timed region covered NUM_BATCHES + 1 batches while
        # the throughput below divides by NUM_BATCHES.
        start = time.perf_counter()
        for imgs, lbls in islice(dataloader, NUM_BATCHES):
            pass
        hdf5_dataloader_time = time.perf_counter() - start
    finally:
        # Release the file handle even if loading fails part-way.
        dataset.close()

    print(f'‚úì HDF5 DataLoader: {NUM_BATCHES} batches in {hdf5_dataloader_time:.2f}s')
    print(f'  Throughput: {NUM_BATCHES * BATCH_SIZE / hdf5_dataloader_time:,.0f} samples/sec')

In [None]:
# Cell 28: DataLoader Results Visualization
# Samples/second over the timed batches; backends whose benchmark did not run
# keep a None timing and are left out.
dataloader_throughput = {
    system: NUM_BATCHES * BATCH_SIZE / elapsed
    for system, elapsed in (
        ('SynaDB', synadb_dataloader_time),
        ('HDF5', hdf5_dataloader_time),
    )
    if elapsed
}

if not dataloader_throughput:
    print('No DataLoader results to display.')
else:
    fig = throughput_comparison(
        dataloader_throughput,
        title='PyTorch DataLoader Throughput',
        ylabel='Samples/second',
    )
    plt.show()

## 🧮 Tensor Extraction Comparison <a id="tensor-extraction"></a>

SynaDB provides native tensor extraction capabilities. Let's compare how each format handles extracting data as numpy arrays.

In [None]:
# Cell 30: Tensor Extraction Comparison
print('Tensor Extraction Comparison\n', '=' * 60, sep='\n')

# SynaDB tensor extraction
if HAS_SYNADB:
    from synadb import SynaDB
    print(
        '\nüì¶ SynaDB Tensor Extraction:',
        '  - Native get_history_tensor() for time-series data',
        '  - Direct numpy array output',
        '  - Schema-free: mix different data types',
        sep='\n',
    )
    with SynaDB(synadb_path) as db:
        # Store some float values for tensor extraction demo
        # NOTE(review): these writes go into the benchmark database itself, so
        # re-running this cell presumably adds another 100 values under the
        # same key -- confirm put_float semantics.
        for step in range(1, 101):
            db.put_float('metrics/loss', 1.0 / step)
        # Extract as tensor
        loss_tensor = db.get_history_tensor('metrics/loss')
        print(f'  ‚úì Extracted tensor shape: {loss_tensor.shape}')

# HDF5 tensor extraction
if HAS_H5PY:
    import h5py
    print(
        '\nüì¶ HDF5 Tensor Extraction:',
        '  - Direct array slicing with [:] syntax',
        '  - Chunked access for large datasets',
        '  - Fixed schema required',
        sep='\n',
    )
    with h5py.File(hdf5_path, 'r') as f:
        sample_tensor = f['images'][:100]
        print(f'  ‚úì Extracted tensor shape: {sample_tensor.shape}')

# TFRecord tensor extraction
if HAS_TF:
    print(
        '\nüì¶ TFRecord Tensor Extraction:',
        '  - Requires parsing through tf.data pipeline',
        '  - Optimized for streaming, not random access',
        '  - Best for TensorFlow training pipelines',
        sep='\n',
    )

print('\n' + '=' * 60)

## 🔄 Schema Flexibility Demo <a id="schema-flexibility"></a>

One of SynaDB's key advantages is schema flexibility - you can add new data types without migration.

In [None]:
# Cell 32: Schema Flexibility Demo
print('Schema Flexibility Demonstration\n', '=' * 60, sep='\n')

if HAS_SYNADB:
    from synadb import SynaDB
    print('\nüì¶ SynaDB Schema Flexibility:')
    with SynaDB(synadb_path) as db:
        # Add new data types without any schema changes: four differently
        # typed values are written next to the image/label keys.
        db.put_text('metadata/description', 'MNIST-like synthetic dataset')
        db.put_int('metadata/num_samples', NUM_SAMPLES)
        db.put_float('metadata/version', 1.0)
        db.put_bytes('metadata/config', b'{"augmentation": true}')

        print(
            '  ‚úì Added text metadata',
            '  ‚úì Added integer metadata',
            '  ‚úì Added float metadata',
            '  ‚úì Added bytes metadata (JSON config)',
            '  ‚Üí No schema migration required!',
            sep='\n',
        )

if HAS_H5PY:
    import h5py
    print('\nüì¶ HDF5 Schema Changes:')
    with h5py.File(hdf5_path, 'a') as f:
        # Adding new datasets requires explicit creation. The membership test
        # keeps a second run of this cell from creating the group again.
        if 'metadata' not in f:
            meta = f.create_group('metadata')
            for attr_name, attr_value in (
                ('description', 'MNIST-like synthetic dataset'),
                ('num_samples', NUM_SAMPLES),
            ):
                meta.attrs[attr_name] = attr_value
        print(
            '  ‚úì Added metadata group with attributes',
            '  ‚Üí Requires explicit dataset/group creation',
            sep='\n',
        )

if HAS_TF:
    print(
        '\nüì¶ TFRecord Schema Changes:',
        '  ‚ö†Ô∏è Adding new fields requires rewriting the entire file',
        '  ‚Üí Schema changes are expensive',
        sep='\n',
    )

print('\n' + '=' * 60)

## 📊 Results Summary <a id="results"></a>

Let's summarize all benchmark results.

In [None]:
# Cell 34: Results Summary Table
from IPython.display import display, Markdown, HTML

# Build summary table
# Each measured row shows 'N/A' for a backend whose benchmark was skipped
# (timing left at None, or an empty latency list).
summary_md = '''\n| Metric | SynaDB | HDF5 | TFRecord |\n|--------|--------|------|----------|\n'''

# Write time
synadb_wt = f'{synadb_write_time:.2f}s' if synadb_write_time else 'N/A'
hdf5_wt = f'{hdf5_write_time:.2f}s' if hdf5_write_time else 'N/A'
tf_wt = f'{tfrecord_write_time:.2f}s' if tfrecord_write_time else 'N/A'
summary_md += f'| Write Time | {synadb_wt} | {hdf5_wt} | {tf_wt} |\n'

# Sequential load
synadb_sl = f'{NUM_SAMPLES/synadb_seq_time:,.0f}/s' if synadb_seq_time else 'N/A'
hdf5_sl = f'{NUM_SAMPLES/hdf5_seq_time:,.0f}/s' if hdf5_seq_time else 'N/A'
tf_sl = f'{NUM_SAMPLES/tfrecord_seq_time:,.0f}/s' if tfrecord_seq_time else 'N/A'
summary_md += f'| Sequential Load | {synadb_sl} | {hdf5_sl} | {tf_sl} |\n'

# Random access latency (mean over the sampled indices). The label says
# "Latency" so this row is not confused with the capability row at the bottom
# of the table, which previously carried the identical name "Random Access".
synadb_ra = f'{np.mean(synadb_random_times):.3f}ms' if synadb_random_times else 'N/A'
hdf5_ra = f'{np.mean(hdf5_random_times):.3f}ms' if hdf5_random_times else 'N/A'
summary_md += f'| Random Access Latency | {synadb_ra} | {hdf5_ra} | Not supported |\n'

# DataLoader throughput
synadb_dl = f'{NUM_BATCHES*BATCH_SIZE/synadb_dataloader_time:,.0f}/s' if synadb_dataloader_time else 'N/A'
hdf5_dl = f'{NUM_BATCHES*BATCH_SIZE/hdf5_dataloader_time:,.0f}/s' if hdf5_dataloader_time else 'N/A'
summary_md += f'| DataLoader | {synadb_dl} | {hdf5_dl} | TF native |\n'

# Features (qualitative rows, not measured by this notebook)
summary_md += '| Schema Flexibility | ‚úì Excellent | ‚óã Limited | ‚úó Fixed |\n'
summary_md += '| Single File | ‚úì Yes | ‚úì Yes | ‚úì Yes |\n'
summary_md += '| Random Access Support | ‚úì Yes | ‚úì Yes | ‚úó No |\n'

display(Markdown(summary_md))

In [None]:
# Cell 35: File Size Comparison
def get_file_size(path):
    """Return the size of ``path`` in mebibytes, or 0 when it does not exist."""
    if not os.path.exists(path):
        return 0
    bytes_per_mb = 1024 * 1024
    return os.path.getsize(path) / bytes_per_mb

# Only formats whose data file was actually written appear in the chart.
file_sizes = {
    system: get_file_size(data_path)
    for system, data_path in (
        ('SynaDB', synadb_path),
        ('HDF5', hdf5_path),
        ('TFRecord', tfrecord_path),
    )
    if os.path.exists(data_path)
}

if file_sizes:
    fig = memory_comparison(file_sizes, title='Storage Size Comparison', ylabel='Size (MB)')
    plt.show()
    print('\nFile sizes:')
    for system, size_mb in file_sizes.items():
        print(f'  {system}: {size_mb:.1f} MB')

## 🎯 Conclusions <a id="conclusions"></a>

In [None]:
# Cell 37: Conclusions
# The takeaways and the closing recommendation are kept as plain data so the
# call below stays short.
key_takeaways = [
    'SynaDB offers excellent schema flexibility - add new data types without migration',
    'HDF5 excels at bulk sequential loading with its optimized chunked storage',
    'TFRecord is best for TensorFlow pipelines but lacks random access',
    'SynaDB provides a good balance of flexibility and performance for ML workflows',
    "For mixed workloads (training + experimentation), SynaDB's schema-free design shines",
]
closing_summary = (
    'Choose based on your workflow: HDF5 for pure performance, TFRecord for '
    'TensorFlow, SynaDB for flexibility and unified data management.'
)
conclusion_box(title='Key Takeaways', points=key_takeaways, summary=closing_summary)

In [None]:
# Cell 38: Cleanup
import shutil
# Best-effort removal of the scratch directory created in Cell 8. A failure is
# reported but does not abort the notebook.
try:
    shutil.rmtree(temp_dir)
    cleanup_message = f'‚úì Cleaned up temp directory: {temp_dir}'
except Exception as cleanup_error:
    cleanup_message = f'‚ö†Ô∏è Could not clean up: {cleanup_error}'
print(cleanup_message)