## Arrow Client

In [1]:
import time
import requests
import pyarrow as pa

In [5]:
def read_arrow_stream_from_url_batches(
    url: str, chunk_size: int = 130_058_908
) -> pa.Table:
    """
    Download an Arrow IPC stream over HTTP and return it as a pyarrow Table.

    The response body is pulled in pieces of at most ``chunk_size`` bytes and
    accumulated in memory. Once the download is complete the HTTP connection
    is released and the buffered payload is decoded with
    ``pa.ipc.open_stream``; all record batches are read into a single table.

    Parameters
    ----------
    url : str
        Endpoint that serves the Arrow stream.
    chunk_size : int, optional
        Maximum number of bytes requested from the response per iteration.
        The default (130_058_908) is the value originally hard-coded here,
        chosen to match the size of a single Arrow RecordBatch.

    Returns
    -------
    pa.Table
        Table holding every record batch found in the stream.

    Raises
    ------
    requests.HTTPError
        If the server replies with an error status (``raise_for_status()``).
    """
    payload = bytearray()
    with requests.get(url, stream=True) as response:
        response.raise_for_status()

        # Accumulate the raw binary body; empty keep-alive chunks are skipped.
        for chunk in response.iter_content(chunk_size=chunk_size):
            if chunk:
                payload.extend(chunk)

    # The connection is closed at this point; decoding only needs the bytes.
    # memoryview lets pyarrow read the bytearray without copying it first.
    reader = pa.ipc.open_stream(memoryview(payload))
    return reader.read_all()

In [115]:
# Time one Arrow download of `nrows` lineitem rows and keep the resulting table.
nrows = 20_000_000
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the wall clock and can jump.
t1 = time.perf_counter()
arrow_table = read_arrow_stream_from_url_batches(
    f"http://localhost:8000/rows/arrow/lineitem?nrows={nrows}"
)
t2 = time.perf_counter()
print(f"Time taken to read {nrows} from Arrow stream: {t2 - t1:.2f} seconds")
# Last expression of the cell: display the table's schema/preview.
arrow_table

Time taken to read 20000000 from Arrow stream: 10.87 seconds


pyarrow.Table
l_orderkey: int64
l_partkey: int64
l_suppkey: int64
l_linenumber: int64
l_quantity: decimal128(15, 2)
l_extendedprice: decimal128(15, 2)
l_discount: decimal128(15, 2)
l_tax: decimal128(15, 2)
l_returnflag: string
l_linestatus: string
l_shipdate: date32[day]
l_commitdate: date32[day]
l_receiptdate: date32[day]
l_shipinstruct: string
l_shipmode: string
l_comment: string
----
l_orderkey: [[1,1,1,1,1,...,999939,999939,999939,999939,999939],[999939,999939,999940,999940,999940,...,1999524,1999524,1999524,1999525,1999525],...,[18003427,18003428,18003428,18003429,18003429,...,19004199,19004199,19004199,19004224,19004224],[19004225,19004225,19004225,19004225,19004225,...,20005348,20005348,20005349,20005349,20005349]]
l_partkey: [[1551894,673091,636998,21315,240267,...,34552,711982,272928,1099732,1185143],[683313,439625,766792,1461447,528198,...,1637358,1060259,1770665,1557031,1003407],...,[208066,1193316,58728,951117,1409583,...,1449090,840729,121354,1237352,160542],[328310,190540

In [113]:
# Show the table's column count, row count and nbytes as one tuple.
arrow_table.num_columns, arrow_table.num_rows, arrow_table.nbytes

(16, 15000000, 2591777478)

## JSON Client

In [2]:
def json_request_handler(url: str) -> dict:
    """
    Issue a GET request to ``url`` and return the decoded JSON body.

    An error status reported by the server is surfaced through
    ``raise_for_status()`` before any decoding is attempted.
    """
    reply = requests.get(url)
    reply.raise_for_status()
    # .json() already parses the JSON response body into Python objects,
    # so no separate decoding step is needed.
    parsed_body = reply.json()
    return parsed_body

In [85]:
# Time one JSON download of `nrows` orders rows and keep the parsed response.
nrows = 1_000_000
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the wall clock and can jump.
t1 = time.perf_counter()
data = json_request_handler(f"http://localhost:8000/rows/json/orders?nrows={nrows}")
t2 = time.perf_counter()
print(f"Time taken to process {nrows} with JSON: {t2 - t1:.2f} seconds")

Time taken to process 1000000 with JSON: 10.46 seconds


In [9]:
# Sanity check: number of values returned under the "o_orderkey" key
# (presumably one per requested row if the response is column-oriented —
# TODO confirm against the server).
len(data["o_orderkey"])

1000000

## JSON vs Arrow with Lineitem data

In [3]:
def benchmark(
    row_counts=(1_000, 10_000, 100_000, 1_000_000, 10_000_000),
    base_url="http://localhost:8000",
):
    """
    Time Arrow vs JSON downloads of the lineitem table for several sizes.

    For each entry in ``row_counts`` the Arrow endpoint and then the JSON
    endpoint are requested once, the elapsed time of each request is printed,
    and one record per size is collected.

    Parameters
    ----------
    row_counts : iterable of int, optional
        Values passed as the ``nrows`` query parameter, in the order they are
        benchmarked. Defaults to the sizes originally hard-coded here.
    base_url : str, optional
        Scheme/host/port prefix of the server exposing ``/rows/arrow/lineitem``
        and ``/rows/json/lineitem``.

    Returns
    -------
    list of dict
        One ``{"nrows", "time_arrow", "time_json"}`` record per row count,
        with times in seconds. Empty when ``row_counts`` is empty.
    """
    benchmark_data = []
    for nrows in row_counts:
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval cannot be skewed by wall-clock adjustments.
        start = time.perf_counter()
        read_arrow_stream_from_url_batches(
            f"{base_url}/rows/arrow/lineitem?nrows={nrows}"
        )
        time_arrow = time.perf_counter() - start
        print(f"Time taken to read {nrows} from Arrow stream: {time_arrow:.2f} seconds")

        start = time.perf_counter()
        json_request_handler(f"{base_url}/rows/json/lineitem?nrows={nrows}")
        time_json = time.perf_counter() - start
        print(f"Time taken to read {nrows} with JSON: {time_json:.2f} seconds")

        benchmark_data.append(
            {"nrows": nrows, "time_arrow": time_arrow, "time_json": time_json}
        )
    return benchmark_data

In [6]:
# Run the Arrow-vs-JSON benchmark and keep its timing records.
# Note: this rebinds `data`, which an earlier cell used for a JSON response.
data = benchmark()

Time taken to read 1000 from Arrow stream: 0.05 seconds
Time taken to read 1000 with JSON: 0.04 seconds
Time taken to read 10000 from Arrow stream: 0.04 seconds
Time taken to read 10000 with JSON: 0.12 seconds
Time taken to read 100000 from Arrow stream: 0.14 seconds
Time taken to read 100000 with JSON: 1.01 seconds
Time taken to read 1000000 from Arrow stream: 1.18 seconds
Time taken to read 1000000 with JSON: 10.21 seconds
Time taken to read 10000000 from Arrow stream: 5.22 seconds
Time taken to read 10000000 with JSON: 104.76 seconds


In [None]:
import pandas as pd
import plotly.express as px

# Tabulate the benchmark records so each timing series becomes a column.
df = pd.DataFrame(data)

# Columns drawn as separate lines, and the axis captions shown on the figure.
timing_columns = ["time_arrow", "time_json"]
axis_labels = {"nrows": "log(nrows)", "value": "Time (seconds)"}

# Line chart of both timings against nrows, with a logarithmic x axis.
fig = px.line(
    data_frame=df,
    x="nrows",
    y=timing_columns,
    log_x=True,
    labels=axis_labels,
    title="Arrow vs JSON Performance Benchmark",
    width=400,
)
fig.show()