# Performance Benchmarks: Polars vs Pandas

This notebook compares the performance of Polars and Pandas for common data processing operations using the same datasets.

In [None]:
import os
import polars as pl
import pandas as pd
import numpy as np
import time


## 1. Locate Datasets
Set up the path to the datasets directory.

In [None]:
# Absolute path of the sibling "datasets" directory, resolved relative to the
# current working directory at the time this cell runs.
datasets_dir = os.path.abspath(os.path.join(os.pardir, 'datasets'))


## 2. Define Benchmark Function
A helper function that runs each implementation once as a warm-up, then compares the average execution times of Polars and Pandas over several repeated runs.

In [None]:
def benchmark(name, polars_fn, pandas_fn, repeat=5):
    """Time ``polars_fn`` against ``pandas_fn`` and print/return the averages.

    Each callable is invoked once untimed as a warm-up, then ``repeat`` times
    under a timer. All Polars runs are timed first, followed by all Pandas
    runs.

    Args:
        name: Label used in the printed report and in the returned dict.
        polars_fn: Zero-argument callable exercising the Polars code path.
        pandas_fn: Zero-argument callable exercising the Pandas code path.
        repeat: Number of timed runs per callable; must be at least 1.

    Returns:
        A dict with keys ``"name"``, ``"polars_time"`` and ``"pandas_time"``
        (average seconds per run) and ``"speedup"`` (Pandas average divided
        by Polars average; ``inf`` if the Polars average measured as zero).

    Raises:
        ValueError: If ``repeat`` is less than 1.
    """
    if repeat < 1:
        raise ValueError(f'repeat must be at least 1, got {repeat}')

    def _average_runtime(fn):
        """Return the mean duration in seconds of ``repeat`` calls to ``fn``."""
        durations = []
        for _ in range(repeat):
            # perf_counter is a monotonic, high-resolution clock intended for
            # measuring short intervals; time.time() can be coarse and may
            # jump when the system clock is adjusted.
            start = time.perf_counter()
            fn()
            durations.append(time.perf_counter() - start)
        return sum(durations) / repeat

    # Untimed warm-up so one-off costs on the first call do not skew the
    # measurements.
    polars_fn()
    pandas_fn()

    polars_avg = _average_runtime(polars_fn)
    pandas_avg = _average_runtime(pandas_fn)

    # A very fast operation can measure as exactly zero; avoid dividing by it.
    speedup = pandas_avg / polars_avg if polars_avg > 0 else float('inf')

    print(f'\n{name} Benchmark:')
    print(f'Polars: {polars_avg:.4f}s')
    print(f'Pandas: {pandas_avg:.4f}s')
    print(f'Speedup: {speedup:.2f}x')
    return {"name": name, "polars_time": polars_avg, "pandas_time": pandas_avg, "speedup": speedup}


## 3. Benchmark 1: Data Loading
Compare the time to load a CSV file with Polars and Pandas.

In [None]:
def polars_load():
    """Read transactions.csv from the datasets directory with Polars."""
    return pl.read_csv(f'{datasets_dir}/transactions.csv')


def pandas_load():
    """Read transactions.csv from the datasets directory with Pandas."""
    return pd.read_csv(f'{datasets_dir}/transactions.csv')


# Time both loaders against the same CSV file.
load_result = benchmark('Data Loading', polars_load, pandas_load)


## 4. Benchmark 2: Filtering
Compare filtering rows with Polars and Pandas.

In [None]:
# Load the transactions once up front so only the filter itself is timed.
polars_df = pl.read_csv(f'{datasets_dir}/transactions.csv')
pandas_df = pd.read_csv(f'{datasets_dir}/transactions.csv')


def polars_filter():
    """Keep the Polars rows whose 'amount' is greater than 5000."""
    predicate = pl.col('amount') > 5000
    return polars_df.filter(predicate)


def pandas_filter():
    """Keep the Pandas rows whose 'amount' is greater than 5000."""
    mask = pandas_df['amount'] > 5000
    return pandas_df[mask]


filter_result = benchmark('Filtering', polars_filter, pandas_filter)


## 5. Benchmark 3: GroupBy & Aggregation
Compare groupby and aggregation performance.

In [None]:
def polars_groupby():
    """Sum 'amount' per 'client_id' with Polars, as column 'total_amount'."""
    total_amount = pl.sum('amount').alias('total_amount')
    return polars_df.group_by('client_id').agg(total_amount)


def pandas_groupby():
    """Sum 'amount' per 'client_id' with Pandas, with 'client_id' as a column."""
    amounts_by_client = pandas_df.groupby('client_id')['amount']
    return amounts_by_client.sum().reset_index()


groupby_result = benchmark('GroupBy & Aggregation', polars_groupby, pandas_groupby)

# One output row per group, so the row count is the number of distinct clients.
group_res = polars_groupby()
print(f'\nUnique groups: {group_res.shape[0]} clients')


## 6. Benchmark 4: Join Operation
Compare join performance between Polars and Pandas.

In [None]:
# Load the clients table once in each library so only the join is timed.
clients_pl = pl.read_csv(f'{datasets_dir}/clients.csv')
clients_pd = pd.read_csv(f'{datasets_dir}/clients.csv')


def polars_join():
    """Join the Polars transactions with the clients table on 'client_id'."""
    joined = polars_df.join(clients_pl, on='client_id')
    return joined


def pandas_join():
    """Merge the Pandas transactions with the clients table on 'client_id'."""
    merged = pandas_df.merge(clients_pd, on='client_id')
    return merged


join_result = benchmark('Join Operation', polars_join, pandas_join)


## 7. Results Summary
We collect all benchmark results, display them as a table, and save them to `benchmark_results.csv` in the datasets directory.

In [None]:
# Gather the per-benchmark result dicts in the order the benchmarks were run.
results = [load_result, filter_result, groupby_result, join_result]

# Pull each field out of the result dicts into its own parallel list.
names, polars_times, pandas_times, speedups = (
    [r[key] for r in results]
    for key in ('name', 'polars_time', 'pandas_time', 'speedup')
)

# One row per benchmark; the 'name' field is exposed as the 'operation' column.
results_df = pl.DataFrame(
    dict(
        operation=names,
        polars_time=polars_times,
        pandas_time=pandas_times,
        speedup=speedups,
    )
)

print('Benchmark Results Summary:')
print(results_df)

# Persist the summary next to the input datasets.
results_df.write_csv(f'{datasets_dir}/benchmark_results.csv')
