In [11]:
import os
import tempfile
import time

import h5py
import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.compute as pc

def generate_data(rows, cols, seed=None):
    """Return a ``(rows, cols)`` array of uniform random floats in [0, 1).

    Args:
        rows: Number of rows in the generated array.
        cols: Number of columns in the generated array.
        seed: Optional seed. When given, the data comes from a dedicated
            ``numpy.random.Generator`` so repeated calls with the same seed
            produce identical input. When ``None`` (the default) the global
            NumPy random state is used.

    Returns:
        A 2-D ``numpy.ndarray`` of float64 values.
    """
    if seed is None:
        return np.random.rand(rows, cols)
    return np.random.default_rng(seed).random((rows, cols))

def benchmark_pandas(data):
    """Time the reference workload implemented with pandas.

    The workload builds a DataFrame from ``data``, appends a ``new_col``
    column holding each row's sum, then groups rows by ``index % 10`` and
    takes the mean of every column. The aggregated frame is discarded; only
    the timing is of interest.

    Args:
        data: 2-D NumPy array used as the DataFrame contents.

    Returns:
        Elapsed time in seconds as a float.
    """
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted.
    start = time.perf_counter()
    df = pd.DataFrame(data)
    df['new_col'] = df.sum(axis=1)
    df.groupby(df.index % 10).mean()
    return time.perf_counter() - start

def benchmark_polars(data):
    """Time the reference workload implemented with polars.

    The workload builds a DataFrame from ``data``, appends a ``new_col``
    column holding the horizontal sum of all columns, then groups rows by
    ``row_number % 10`` and takes the mean of every column. The aggregated
    frame is discarded; only the timing is of interest.

    Args:
        data: 2-D NumPy array whose first axis is rows.

    Returns:
        Elapsed time in seconds as a float.
    """
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted.
    start = time.perf_counter()
    # State the orientation explicitly so the array is always read as
    # rows x columns, matching what the pandas benchmark does.
    df = pl.DataFrame(data, orient="row")
    df = df.with_columns(pl.sum_horizontal(df.columns).alias('new_col'))
    df.group_by(pl.arange(0, df.height) % 10).mean()
    return time.perf_counter() - start

def benchmark_arrow(data):
    """Time the reference workload implemented with PyArrow.

    The workload builds a Table with one ``col_<i>`` column per column of
    ``data``, appends a ``new_col`` column of row sums (computed with NumPy),
    prepends a ``group`` column equal to ``row_number % 10``, then groups by
    it and takes the mean of every value column. The aggregated table is
    discarded; only the timing is of interest.

    Args:
        data: 2-D NumPy array; each column becomes one Arrow array.

    Returns:
        Elapsed time in seconds as a float.
    """
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted.
    start = time.perf_counter()

    # Build the column names once; they are needed for both the table
    # construction and the aggregation spec.
    value_names = [f'col_{i}' for i in range(data.shape[1])]
    table = pa.Table.from_arrays(
        [pa.array(col) for col in data.T],
        names=value_names,
    )
    table = table.append_column('new_col', pa.array(np.sum(data, axis=1)))

    # Create a new column for grouping
    group_column = pa.array(np.arange(table.num_rows) % 10)
    table = table.add_column(0, 'group', group_column)

    # Group and aggregate
    aggregations = [(name, 'mean') for name in value_names]
    aggregations.append(('new_col', 'mean'))
    table.group_by('group').aggregate(aggregations)

    return time.perf_counter() - start

def benchmark_hdf5(data):
    """Time an HDF5 round trip followed by the NumPy reference workload.

    The workload writes ``data`` to an HDF5 dataset named ``data``, reads it
    back in full, computes row sums, and computes the column means of each of
    the ten ``row_number % 10`` groups. The computed values are discarded;
    only the timing is of interest.

    The file is written inside a uniquely named scratch directory under the
    current working directory, so an existing ``test.h5`` is never
    overwritten and nothing is left behind. Creating and removing the
    scratch directory is not included in the measured time.

    Args:
        data: NumPy array to store and reload.

    Returns:
        Elapsed time in seconds as a float.
    """
    with tempfile.TemporaryDirectory(dir=os.getcwd()) as scratch_dir:
        h5_path = os.path.join(scratch_dir, 'test.h5')

        # perf_counter is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted.
        start = time.perf_counter()
        with h5py.File(h5_path, 'w') as f:
            f.create_dataset('data', data=data)
        with h5py.File(h5_path, 'r') as f:
            loaded_data = f['data'][:]
        # Both results are intentionally unused: they exist only so the
        # timed region does the same kind of work as the other benchmarks.
        new_col = np.sum(loaded_data, axis=1)
        result = np.array([loaded_data[i::10].mean(axis=0) for i in range(10)])
        elapsed = time.perf_counter() - start

    return elapsed

def run_benchmark(rows, cols, runs=5):
    """Run every library benchmark on the same data and print average times.

    One random ``rows`` x ``cols`` array is generated and passed to each
    benchmark. Within each run the benchmarks execute in the order Pandas,
    Polars, Arrow, HDF5, and one summary line per library is printed at the
    end.

    Args:
        rows: Number of rows in the generated data.
        cols: Number of columns in the generated data.
        runs: How many times each benchmark is executed; must be >= 1.

    Raises:
        ValueError: If ``runs`` is less than 1 (an average over zero runs is
            undefined).
    """
    # Fail early with a clear message instead of a ZeroDivisionError after
    # the data has already been generated.
    if runs < 1:
        raise ValueError(f"runs must be at least 1, got {runs}")

    data = generate_data(rows, cols)
    benchmarks = {
        'Pandas': benchmark_pandas,
        'Polars': benchmark_polars,
        'Arrow': benchmark_arrow,
        'HDF5': benchmark_hdf5,
    }
    results = {lib: [] for lib in benchmarks}

    # Interleave the libraries within each run so that drift in machine load
    # affects all of them similarly.
    for _ in range(runs):
        for lib, bench in benchmarks.items():
            results[lib].append(bench(data))

    for lib, times in results.items():
        avg_time = sum(times) / len(times)
        print(f"{lib}: Average time over {runs} runs: {avg_time:.4f} seconds")

# Kick off the comparison on a one-million-row, ten-column dataset.
run_benchmark(rows=1_000_000, cols=10)

Pandas: Average time over 5 runs: 0.2224 seconds
Polars: Average time over 5 runs: 0.2448 seconds
Arrow: Average time over 5 runs: 0.2067 seconds
HDF5: Average time over 5 runs: 0.2610 seconds
