# Why arrow?

This notebook explores the benefits and drawbacks of using the [Apache Arrow](https://arrow.apache.org) in-memory data format relative to other in-memory formats such as pandas DataFrames.

In [1]:
# Switch ipyflow into its "reactive" mode via the `%flow` line magic.
# NOTE(review): this magic presumably requires the ipyflow kernel/extension, and
# reactive mode presumably re-runs dependent cells automatically -- confirm
# against the ipyflow documentation.
%flow mode reactive

In [2]:
import os
import pathlib
import time

import duckdb
import numpy as np
import pandas as pd
import plotly.express as px
import polars as pl
import pyarrow as pa
import pyarrow.parquet as parquet
from pympler.asizeof import asizeof
from utilities import timer

In [3]:
# Name of the scratch parquet file: the cells below delete, write and read this
# path, which is resolved relative to the current working directory.
parquet_name = "example.parquet"

In [4]:
# Start from a clean slate: delete any parquet file left over from an earlier run.
# missing_ok=True makes this a no-op when the file does not exist.
pathlib.Path(parquet_name).unlink(missing_ok=True)

In [5]:
# Benchmark reading the same parquet file with pandas, pyarrow, polars and
# duckdb (arrow result) across increasingly large dataframes, recording the
# read time and the in-memory size reported for each library.

# starting row and column counts; both are doubled before every measurement,
# so the first measured dataframe is (20, 10)
nrows = 10
ncols = 5

# how many times the dataframe is doubled (one result row per doubling)
n_doublings = 8

# seeded generator so every run of this cell benchmarks identical data
rng = np.random.default_rng(seed=0)

# query shared by both duckdb measurements; it only depends on the file name,
# so it is built once instead of on every iteration
duckdb_query = f"SELECT * FROM read_parquet('{parquet_name}')"

# result list: one dict of measurements per dataframe size
results = []

# NOTE(review): `timer` comes from the local `utilities` module; it is assumed
# to call the given function with the keyword arguments, optionally followed by
# the method named in `method_chain`, and to return elapsed seconds -- confirm.
for _ in range(n_doublings):
    # increase the size of the dataframe
    nrows *= 2
    ncols *= 2

    # form a dataframe of uniform random floats in [0, 1)
    df = pd.DataFrame(
        rng.random((nrows, ncols)),
        columns=[f"col_{num}" for num in range(ncols)],
    )
    # write to parquet so that every library below reads the same file
    df.to_parquet(path=parquet_name, compression="snappy")

    # append data to the result list
    results.append(
        {
            # general information about the dataframe
            "dataframe_shape (rows, cols)": str(df.shape),
            # information about pandas
            "pandas_read_time (secs)": timer(pd.read_parquet, path=parquet_name),
            "pandas_size (bytes)": asizeof(pd.read_parquet(path=parquet_name)),
            # information about pyarrow
            "pyarrow_read_time (secs)": timer(parquet.read_table, source=parquet_name),
            "pyarrow_size (bytes)": asizeof(parquet.read_table(source=parquet_name)),
            # information about polars (lazy scan, materialized with collect);
            # size comes from polars' own estimate rather than asizeof
            "polars_read_time (secs)": timer(
                pl.scan_parquet, source=parquet_name, method_chain="collect"
            ),
            "polars_size (bytes)": pl.scan_parquet(source=parquet_name)
            .collect()
            .estimated_size(),
            # information about duckdb returning an arrow result; the
            # connection is created before `timer` is called, so connection
            # setup is not part of the measured call
            "duckdb_arrow_read_time (secs)": timer(
                duckdb.connect().execute,
                query=duckdb_query,
                method_chain="arrow",
            ),
            "duckdb_arrow_size (bytes)": asizeof(
                duckdb.connect().execute(query=duckdb_query).arrow()
            ),
        }
    )

    # remove the parquet file so the next iteration starts from scratch
    pathlib.Path(parquet_name).unlink(missing_ok=True)


# one row per dataframe size, one column per measurement
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,"dataframe_shape (rows, cols)",pandas_read_time (secs),pandas_size (bytes),pyarrow_read_time (secs),pyarrow_size (bytes),polars_read_time (secs),polars_size (bytes),duckdb_arrow_read_time (secs),duckdb_arrow_size (bytes)
0,"(20, 10)",0.016199,22776,0.001583,1688,0.002296,1600,0.001721,1688
1,"(40, 20)",0.005236,49376,0.001915,6560,0.000687,6400,0.001705,6560
2,"(80, 40)",0.003975,121752,0.002724,26056,0.000701,25600,0.003609,26056
3,"(160, 80)",0.006643,343328,0.004465,104056,0.0009,102400,0.007884,104056
4,"(320, 160)",0.011236,1093912,0.00828,416056,0.001514,409600,0.021519,416056
5,"(640, 320)",0.021552,3823648,0.015445,1664056,0.003038,1638400,0.069813,1664056
6,"(1280, 640)",0.040374,14198296,0.030953,6656056,0.006458,6553600,0.244854,6656056
7,"(2560, 1280)",0.085547,54610656,0.068986,26624056,0.017791,26214400,0.943557,26624056


In [None]:
# read times barchart: for every dataframe shape, one horizontal bar per
# library showing how long reading the parquet file took
read_time_columns = [
    "pandas_read_time (secs)",
    "pyarrow_read_time (secs)",
    "polars_read_time (secs)",
    "duckdb_arrow_read_time (secs)",
]
# friendlier axis titles for the shape column and the measured values
read_time_labels = {
    "dataframe_shape (rows, cols)": "DataFrame Shape",
    "value": "Seconds",
}
fig = px.bar(
    data_frame=df_results,
    y="dataframe_shape (rows, cols)",
    x=read_time_columns,
    title="How long are read times for different formats?",
    labels=read_time_labels,
    barmode="group",
    orientation="h",
)
fig.show()

In [None]:
# memory size barchart: for every dataframe shape, one horizontal bar per
# library showing the in-memory size recorded after reading the parquet file
memory_size_columns = [
    "pandas_size (bytes)",
    "pyarrow_size (bytes)",
    "polars_size (bytes)",
    "duckdb_arrow_size (bytes)",
]
# friendlier axis titles for the shape column and the measured values
memory_size_labels = {
    "dataframe_shape (rows, cols)": "DataFrame Shape",
    "value": "Bytes",
}
fig = px.bar(
    data_frame=df_results,
    y="dataframe_shape (rows, cols)",
    x=memory_size_columns,
    title="What is the memory size for different formats?",
    labels=memory_size_labels,
    barmode="group",
    orientation="h",
)
fig.show()