## Arrow Client

In [12]:
import requests
import pyarrow as pa

def read_arrow_stream_from_url(url: str) -> pa.Table:
    """Download an Arrow IPC stream over HTTP and load it into a table.

    The whole response body is collected in memory before it is parsed, so
    this is not an incremental (batch-by-batch) reader.

    Args:
        url: Endpoint that responds with an Arrow IPC stream.

    Returns:
        A ``pyarrow.Table`` built from every record batch in the stream.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    payload = bytearray()
    with requests.get(url, stream=True) as response:
        response.raise_for_status()

        # Accumulate the raw body chunk by chunk. Appending an empty chunk is
        # a no-op, so no explicit emptiness check is needed.
        for chunk in response.iter_content(chunk_size=8192):
            payload += chunk

    # Wrap the bytearray directly through the buffer protocol. Converting it
    # to ``bytes`` first would duplicate the entire payload in memory.
    reader = pa.ipc.open_stream(pa.py_buffer(payload))
    return reader.read_all()

In [17]:
import time

In [21]:
# Benchmark: fetch `nrows` rows from the Arrow endpoint and time the round trip.
nrows = 1_000_000
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
t1 = time.perf_counter()
arrow_table = read_arrow_stream_from_url(f"http://localhost:8000/rows/arrow/orders?nrows={nrows}")
t2 = time.perf_counter()
print(f"Time taken to read {nrows} from Arrow stream: {t2 - t1:.2f} seconds")
arrow_table

Time taken to read 1000000 from Arrow stream: 1.01 seconds


pyarrow.Table
o_orderkey: int64
o_custkey: int64
o_orderstatus: string
o_totalprice: decimal128(15, 2)
o_orderdate: date32[day]
o_orderpriority: string
o_clerk: string
o_shippriority: int32
o_comment: string
----
o_orderkey: [[1,2,3,4,5,...,3999972,3999973,3999974,3999975,4000000]]
o_custkey: [[369001,780017,1233140,1367761,444848,...,444874,993742,814084,782699,1317229]]
o_orderstatus: [["O","O","F","O","F",...,"O","F","F","O","F"]]
o_totalprice: [[186600.18,66219.63,270741.97,41714.38,122444.33,...,87908.80,225217.42,167696.63,82458.82,80168.65]]
o_orderdate: [[1996-01-02,1996-12-01,1993-10-14,1995-10-11,1994-07-30,...,1998-04-22,1992-05-20,1994-08-24,1996-05-06,1994-10-10]]
o_orderpriority: [["5-LOW","1-URGENT","5-LOW","5-LOW","5-LOW",...,"2-HIGH","1-URGENT","2-HIGH","5-LOW","2-HIGH"]]
o_clerk: [["Clerk#000009506","Clerk#000008792","Clerk#000009543","Clerk#000001234","Clerk#000009248",...,"Clerk#000005354","Clerk#000003672","Clerk#000006593","Clerk#000005436","Clerk#000009756"]]
o_s

In [22]:
# Sanity check: (number of columns, number of rows) of the fetched Arrow table.
arrow_table.num_columns, arrow_table.num_rows

(9, 1000000)

## JSON Client

In [23]:
def json_request_handler(url: str) -> dict:
    """Issue a GET request to ``url`` and return the decoded JSON body.

    Args:
        url: Endpoint that responds with a JSON document.

    Returns:
        The Python object produced by ``response.json()``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status()``).
    """
    reply = requests.get(url)
    # Fail loudly on an error status before attempting to decode the body.
    reply.raise_for_status()
    # reply.json() parses the JSON text into native Python objects.
    parsed = reply.json()
    return parsed


In [24]:
# Benchmark: fetch `nrows` rows from the JSON endpoint and time the round trip
# (HTTP transfer plus JSON decoding).
nrows = 1_000_000
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
t1 = time.perf_counter()
data = json_request_handler(f"http://localhost:8000/rows/json/orders?nrows={nrows}")
t2 = time.perf_counter()
print(f"Time taken to process {nrows} with JSON: {t2 - t1:.2f} seconds")
data

Time taken to process 1000000 with JSON: 5.40 seconds


{'o_orderkey': [1,
  2,
  3,
  4,
  5,
  6,
  7,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  160,
  161,
  162,
  163,
  164,
  165,
  166,
  167,
  192,
  193,
  194,
  195,
  196,
  197,
  198,
  199,
  224,
  225,
  226,
  227,
  228,
  229,
  230,
  231,
  256,
  257,
  258,
  259,
  260,
  261,
  262,
  263,
  288,
  289,
  290,
  291,
  292,
  293,
  294,
  295,
  320,
  321,
  322,
  323,
  324,
  325,
  326,
  327,
  352,
  353,
  354,
  355,
  356,
  357,
  358,
  359,
  384,
  385,
  386,
  387,
  388,
  389,
  390,
  391,
  416,
  417,
  418,
  419,
  420,
  421,
  422,
  423,
  448,
  449,
  450,
  451,
  452,
  453,
  454,
  455,
  480,
  481,
  482,
  483,
  484,
  485,
  486,
  487,
  512,
  513,
  514,
  515,
  516,
  517,
  518,
  519,
  544,
  545,
  546,
  547,
  548,
  549,
  550,
  551,
  576,
  577,
  578

In [25]:
# Sanity check: number of values returned for the 'o_orderkey' column.
len(data['o_orderkey'])

1000000