In [1]:
import pyarrow as pa

In [2]:
# Build the test data: a single record batch with three identical columns
# (x, y, z), each holding the integers 1 through 999,999.
column_names = ["x", "y", "z"]
batch = pa.RecordBatch.from_arrays(
    [range(1, 1_000_000) for _name in column_names],
    names=column_names,
)
# Size of one batch in MiB.
batch.nbytes / 1024**2

22.888160705566406

In [3]:
# Write the same batch 50 times into one Arrow IPC file, "test.arrow".
with pa.ipc.new_file("test.arrow", schema=batch.schema) as writer:
    for batch_index in range(50):
        writer.write_batch(batch)

In [5]:
# List the working directory (all entries, human-readable sizes) to check the size of test.arrow.
!ls -lah # the file we created (1.2 GB) is larger than our memory (512 MB)

total 1.4G
drwxr-xr-x 3 1001 1002 4.0K Oct  7 17:37 .
drwxr-xr-x 1 root root 4.0K Oct  7 04:48 ..
drwxr-xr-x 2 1001 1002 4.0K Oct  7 04:50 .ipynb_checkpoints
-rw-r--r-- 1 1001 1002  19K Oct  5 23:10 9-15_lecture.ipynb
-rw-r--r-- 1 1001 1002  54K Oct  5 23:10 9-18_lecture.ipynb
-rw-r--r-- 1 1001 1002 5.9K Oct  6 00:26 Caching1.ipynb
-rw-r--r-- 1 root root 3.8K Oct  6 16:32 Caching2.ipynb
-rw-r--r-- 1 root root  26K Oct  7 03:59 Caching3_PyArrow.ipynb
-rw-r--r-- 1 root root 3.0K Oct  7 17:37 Mmap.ipynb
-rw-r--r-- 1 1001 1002 2.3K Oct  5 23:10 Threading1.ipynb
-rw-r--r-- 1 1001 1002 7.3K Oct  5 23:10 Threading2.ipynb
-rw-r----- 1 root root 167M Nov  1  2022 hdma-wi-2021.csv
-rw-r--r-- 1 root root  21M Jan  5  2023 hdma-wi-2021.zip
-rw-r--r-- 1 root root 1.2G Oct  7 17:36 test.arrow


In [None]:
# This code crashes the kernel: it tries to load the entire file (1.2 GB) into physical memory (0.5 GB).
# with pa.ipc.open_file("test.arrow") as f:
#     tbl = f.read_all()

In [7]:
import mmap

# General form: memory_region = mmap.mmap(FILE, SIZE, access=ACCESS)
# This creates a region of virtual memory mapped to the file on disk.
# SIZE=0 means "the whole file", so the region is 1.2 GB here.
with open("test.arrow", "rb") as arrow_file:
    mm = mmap.mmap(
        arrow_file.fileno(),
        0,
        access=mmap.ACCESS_READ,
    )

In [8]:
# The repr shows the mapped region's length in bytes: about 1.2 GB, i.e. the whole file.
mm

<mmap.mmap closed=False, access=ACCESS_READ, length=1200012490, pos=0, offset=0>

In [9]:
# We can read some of the file's contents (e.g. the first 10 bytes -- the slice returns raw bytes, not lines),
# which triggers pulling that part of the file into physical memory.
mm[:10]

b'ARROW1\x00\x00\xff\xff'

In [10]:
# Open the Arrow IPC file through the mmap region rather than by path:
# file contents are read lazily into the region as we access them.
with pa.ipc.open_file(mm) as reader:
    tbl = reader.read_all()

In [11]:
import pyarrow.compute as pc

# With mmap we load only the contents we need: summing one column
# reads just that column from disk, which saves on I/O.
pc.sum(tbl.column("x"))

<pyarrow.Int64Scalar: 24999975000000>

In [13]:
# The columns definitely can't all fit into physical memory at once,
# but with mmap we can still run the calculation on each of them.
# Only the last expression in a cell is displayed, so the sum of "y" is
# computed but not shown; the output is the sum of "z".
pc.sum(tbl.column("y"))
pc.sum(tbl.column("z"))

<pyarrow.Int64Scalar: 24999975000000>