# Mini Demo of Ray Data parallel read considerations

Compare reading a single monolithic Parquet file with reading a directory of multiple Parquet files, using both Ray Data and Modin.

The file in question is smaller than the RAM of a single machine but larger than one third of it, so once read overhead is accounted for it cannot be read as a single large chunk.

(Spoiler: Modin can read both, but Ray Data can only read the split dataset, because its options for read parallelism don't work on the monolith file.)

In [1]:
import numpy as np
import pandas as pd
import modin.pandas as mpd
import pyarrow.parquet as pq
import pyarrow.dataset as pds
import pyarrow as pa

In [2]:
import ray
import os

# Connect to the Ray cluster through a `ray://` address unless this kernel is
# already connected. The head node's host and port are read from environment
# variables; a missing variable raises KeyError.
if not ray.is_initialized():
    service_host = os.environ["RAY_HEAD_SERVICE_HOST"]
    service_port = os.environ["RAY_HEAD_SERVICE_PORT"]
    ray.init(f"ray://{service_host}:{service_port}")

## The monolith file

Generated with code like the following, which writes a single file that is split internally into row groups.

```
arr = np.random.standard_normal((n_rows, n_columns))
df = pd.DataFrame(arr, columns = [str(i) for i in range(n_columns)])
df.to_parquet(dummy_file_path, row_group_size = n_rows // 10)
```

In [3]:
# Path to the single ("monolith") Parquet file, which is split internally into row groups.
monolith_file = '/domino/datasets/local/Ray-Tutorial/medium-split.parquet'

In [4]:
# Report the on-disk size of the monolith file in human-readable units.
!du -h $monolith_file

1.6G	/domino/datasets/local/Ray-Tutorial/medium-split.parquet


## The split file

Generated with code like the following (requires pyarrow 7.0), which actually splits the data into separate files.

```
data_columns = {f"col_{i}": np.random.standard_normal(n_rows) for i in range(n_columns)}
n_per = n_rows // n_parts
pds.write_dataset(
    pa.Table.from_pydict(data_columns),
    dummy_file_root,
    format='parquet',
    max_rows_per_file = n_per,
    max_rows_per_group = n_per
)
```

In [5]:
# Directory containing the dataset written as multiple Parquet part files.
split_file = '/domino/datasets/local/Ray-Tutorial/medium-filesplit'

In [6]:
# Report the total on-disk size of the split dataset directory.
!du -h $split_file

1.6G	/domino/datasets/local/Ray-Tutorial/medium-filesplit


In [7]:
# List the individual part files and their sizes.
!ls -lh $split_file

total 1.6G
-rw-rw-r-- 1 root root 158M Aug  9 00:07 part-0.parquet
-rw-rw-r-- 1 root root 160M Aug  9 00:08 part-1.parquet
-rw-rw-r-- 1 root root 162M Aug  9 00:08 part-2.parquet
-rw-rw-r-- 1 root root 164M Aug  9 00:08 part-3.parquet
-rw-rw-r-- 1 root root 164M Aug  9 00:08 part-4.parquet
-rw-rw-r-- 1 root root 164M Aug  9 00:08 part-5.parquet
-rw-rw-r-- 1 root root 164M Aug  9 00:08 part-6.parquet
-rw-rw-r-- 1 root root 164M Aug  9 00:08 part-7.parquet
-rw-rw-r-- 1 root root 164M Aug  9 00:08 part-8.parquet
-rw-rw-r-- 1 root root 164M Aug  9 00:08 part-9.parquet


## Read split file with Modin

In [8]:
# Point Modin at the directory of part files rather than at a single file.
mdf = mpd.read_parquet(split_file)

In [9]:
# Preview the first three rows to confirm the read succeeded.
mdf.head(3)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19
0,1.158893,0.602489,0.854727,-0.771795,1.327245,1.676098,0.851014,-0.610542,-0.898746,0.193705,-0.401126,-1.142757,-2.035543,-1.60622,-1.762634,-0.595791,1.244053,0.084862,-0.92786,0.891158
1,-1.158531,0.155682,-0.103746,0.730911,-0.844126,-0.442617,2.145428,0.773411,0.152442,0.534158,0.213542,0.479042,0.039539,-0.135565,0.027108,-1.388288,-0.023594,0.45148,-0.622062,-1.617443
2,-0.827434,-0.103443,0.532639,-1.020834,0.024577,-2.515021,0.091969,0.661409,0.566055,0.288772,0.15065,-0.606403,0.781512,0.762483,0.344753,0.048367,-1.749009,1.317534,0.12975,0.416652


In [10]:
# Dummy workload: sum across the columns of each row and keep the result as a one-column frame.
mdf2 = mdf.sum(axis=1).to_frame()

In [11]:
# Preview the first three row sums.
mdf2.head(3)

Unnamed: 0,0
0,-1.868769
1,-0.633229
2,-0.705018


## Read monolith file with Modin

In [12]:
# Disconnect from Ray so that the next section begins with a new connection.
ray.shutdown()

In [13]:
# Reconnect to the Ray cluster after the shutdown in the previous cell.
# Host and port of the head node are read from environment variables; a
# missing variable raises KeyError.
if not ray.is_initialized():
    service_host = os.environ["RAY_HEAD_SERVICE_HOST"]
    service_port = os.environ["RAY_HEAD_SERVICE_PORT"]
    ray.init(f"ray://{service_host}:{service_port}")

In [14]:
# Read the single monolith file with Modin.
mdf = mpd.read_parquet(monolith_file)

In [15]:
# Preview the first three rows to confirm the read succeeded.
mdf.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-3.232196,-1.11419,0.711632,-0.25022,-0.131877,-0.005184,1.031817,-0.945482,0.975556,-0.457545,-0.30441,0.378853,-0.273116,1.059342,-0.274358,-0.367763,1.003205,0.236463,1.227877,0.469126
1,-0.597349,1.118023,-2.043759,-2.229171,0.46143,0.526196,0.472792,-0.135688,0.3809,0.876594,-0.769614,-0.386359,-0.58475,-1.915788,0.79827,-0.972866,-0.121678,0.538673,-0.934768,-0.706762
2,-1.092904,-0.689833,0.870981,1.322129,-1.732093,-1.638802,0.964642,0.418966,1.118773,-1.23363,-0.298513,1.907154,1.092906,1.630934,0.050813,-0.977698,-0.223904,-0.063423,-1.126671,-1.975544


In [16]:
# Same dummy workload as for the split data: per-row sum kept as a one-column frame.
mdf2 = mdf.sum(axis=1).to_frame()

In [17]:
# Preview the first three row sums.
mdf2.head(3)

Unnamed: 0,0
0,-0.26247
1,-6.225675
2,-1.675714


## Read split file with Ray Data

In [18]:
# Disconnect from Ray so that the Ray Data section begins with a new connection.
ray.shutdown()

In [19]:
# Reconnect to the Ray cluster after the shutdown in the previous cell.
# Host and port of the head node are read from environment variables; a
# missing variable raises KeyError.
if not ray.is_initialized():
    service_host = os.environ["RAY_HEAD_SERVICE_HOST"]
    service_port = os.environ["RAY_HEAD_SERVICE_PORT"]
    ray.init(f"ray://{service_host}:{service_port}")

In [20]:
# Request a read parallelism of 10, matching the 10 part files in the directory.
ds = ray.data.read_parquet(split_file, parallelism=10)

In [21]:
# Print the first three records of the dataset.
ds.show(3)

{'col_0': 1.158893411011255, 'col_1': 0.6024888481795232, 'col_2': 0.8547269707463037, 'col_3': -0.7717945711432682, 'col_4': 1.327244959107909, 'col_5': 1.6760976533642786, 'col_6': 0.851013834218383, 'col_7': -0.6105415319538923, 'col_8': -0.8987464297029039, 'col_9': 0.1937053286234583, 'col_10': -0.4011255531356019, 'col_11': -1.142756618209515, 'col_12': -2.0355433205668514, 'col_13': -1.6062197382186973, 'col_14': -1.7626335335372112, 'col_15': -0.5957913560997585, 'col_16': 1.2440529127597844, 'col_17': 0.08486158782383446, 'col_18': -0.9278595131767658, 'col_19': 0.8911579179072887}
{'col_0': -1.1585311566375038, 'col_1': 0.155681761397261, 'col_2': -0.10374564341978187, 'col_3': 0.7309110889294081, 'col_4': -0.8441261132098692, 'col_5': -0.4426172541321336, 'col_6': 2.145427727628284, 'col_7': 0.7734108353639534, 'col_8': 0.15244208585701527, 'col_9': 0.5341579368697362, 'col_10': 0.21354196252623878, 'col_11': 0.47904246676231227, 'col_12': 0.03953915050317926, 'col_13': -0.1

In [22]:
def dummy_transform_batch(t: pa.Table) -> pd.DataFrame:
    """Collapse a batch to a single column holding the sum of each row.

    Args:
        t: Batch to transform; it is converted to pandas via ``to_pandas()``.

    Returns:
        A one-column DataFrame with one row-wise total per input row.
    """
    batch_df = t.to_pandas()
    row_totals = batch_df.sum(axis=1)
    return row_totals.to_frame()

In [23]:
# Apply the row-sum transform to the dataset batch by batch.
ds2 = ds.map_batches(dummy_transform_batch)

Map Progress: 100%|██████████| 10/10 [00:05<00:00,  1.91it/s]


In [24]:
# Print the first three row sums.
ds2.show(3)

{'0': -1.868768742002446}
{'0': -0.6332290800922385}
{'0': -0.7050184444842438}


## Read monolith file with Ray Data - it crashes!

In [25]:
# Start from a fresh Ray connection here so the monolith read is a clean test.
ray.shutdown()

In [26]:
# Reconnect to the Ray cluster after the shutdown in the previous cell.
# Host and port of the head node are read from environment variables; a
# missing variable raises KeyError.
if not ray.is_initialized():
    service_host = os.environ["RAY_HEAD_SERVICE_HOST"]
    service_port = os.environ["RAY_HEAD_SERVICE_PORT"]
    ray.init(f"ray://{service_host}:{service_port}")

In [8]:
# Same parallelism request as for the split dataset, now against the single monolith file.
ds = ray.data.read_parquet(monolith_file, parallelism=10)

In [9]:
# This is the call that fails: see the ConnectionError in this cell's output.
ds.show(3)

2022-08-23 16:18:46,529	ERROR dataclient.py:150 -- Unrecoverable error in data channel.


ConnectionError: GRPC connection failed: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.NOT_FOUND
	details = "Failed to serialize response!"
	debug_error_string = "{"created":"@1661271527.030029018","description":"Error received from peer ipv4:172.20.187.153:10001","file":"src/core/lib/surface/call.cc","file_line":1074,"grpc_message":"Failed to serialize response!","grpc_status":5}"
>

## Read split file with Ray Data and no explicit parallelism
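It's fine, because the default parallelism is 200. The read still ends up with one block per part file, as the `Map Progress` output below shows (10 blocks).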

In [8]:
# No `parallelism` argument here, so Ray Data uses its default value.
ds = ray.data.read_parquet(split_file)

In [9]:
# Print the first three records of the dataset.
ds.show(3)

{'col_0': 1.158893411011255, 'col_1': 0.6024888481795232, 'col_2': 0.8547269707463037, 'col_3': -0.7717945711432682, 'col_4': 1.327244959107909, 'col_5': 1.6760976533642786, 'col_6': 0.851013834218383, 'col_7': -0.6105415319538923, 'col_8': -0.8987464297029039, 'col_9': 0.1937053286234583, 'col_10': -0.4011255531356019, 'col_11': -1.142756618209515, 'col_12': -2.0355433205668514, 'col_13': -1.6062197382186973, 'col_14': -1.7626335335372112, 'col_15': -0.5957913560997585, 'col_16': 1.2440529127597844, 'col_17': 0.08486158782383446, 'col_18': -0.9278595131767658, 'col_19': 0.8911579179072887}
{'col_0': -1.1585311566375038, 'col_1': 0.155681761397261, 'col_2': -0.10374564341978187, 'col_3': 0.7309110889294081, 'col_4': -0.8441261132098692, 'col_5': -0.4426172541321336, 'col_6': 2.145427727628284, 'col_7': 0.7734108353639534, 'col_8': 0.15244208585701527, 'col_9': 0.5341579368697362, 'col_10': 0.21354196252623878, 'col_11': 0.47904246676231227, 'col_12': 0.03953915050317926, 'col_13': -0.1

In [10]:
def dummy_transform_batch(t: pa.Table) -> pd.DataFrame:
    """Reduce a batch to one column containing each row's total.

    Args:
        t: Incoming batch; turned into a pandas DataFrame with ``to_pandas()``.

    Returns:
        A DataFrame with a single column of row-wise sums, one per input row.
    """
    as_pandas = t.to_pandas()
    per_row_sum = as_pandas.sum(axis=1)
    return per_row_sum.to_frame()

In [11]:
# Apply the row-sum transform to the dataset batch by batch.
ds2 = ds.map_batches(dummy_transform_batch)

Map Progress: 100%|██████████| 10/10 [00:04<00:00,  2.03it/s]


In [12]:
# Print the first three row sums.
ds2.show(3)

{'0': -1.868768742002446}
{'0': -0.6332290800922385}
{'0': -0.7050184444842438}
