# Experimentation and troubleshooting for Dan

Any important learnings should get documented/folded into the tutorial somehow!

In [None]:
#ray.shutdown()

In [1]:
import ray
import os

# os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION']='python'

# Connect only when this kernel has no Ray session yet, so re-running the cell
# does not call ray.init a second time.
if not ray.is_initialized():
    # The head address comes from the RAY_HEAD_SERVICE_HOST / RAY_HEAD_SERVICE_PORT
    # environment variables; a missing variable raises KeyError here.
    service_host = os.environ["RAY_HEAD_SERVICE_HOST"]
    service_port = os.environ["RAY_HEAD_SERVICE_PORT"]
    ray.init(f"ray://{service_host}:{service_port}")

In [2]:
import numpy as np
import pandas as pd
import modin.pandas as mpd
import pyarrow.parquet as pq
import pyarrow as pa

In [3]:
# Generate some dummy data following Ben's example
default_dataset_path = f"/domino/datasets/local/{os.environ['DOMINO_PROJECT_NAME']}"

def generate_dummy_data(n_rows, n_columns, name, seed=None):
    """Write a Parquet file of standard-normal floats and return its path.

    The file is written under ``default_dataset_path`` as ``{name}.parquet``
    and its on-disk size is printed in GiB.

    Args:
        n_rows: Number of rows to generate.
        n_columns: Number of columns to generate; columns are named "0", "1", ...
        name: File stem of the Parquet file.
        seed: Optional seed for a dedicated NumPy generator, so the same file
            can be regenerated exactly. When None (the default) the values are
            drawn from NumPy's global random state, as before.

    Returns:
        Path of the Parquet file that was written.
    """
    dummy_file_path = os.path.join(default_dataset_path, f"{name}.parquet")
    if seed is None:
        arr = np.random.standard_normal((n_rows, n_columns))
    else:
        # A local generator keeps seeded runs independent of the global state.
        arr = np.random.default_rng(seed).standard_normal((n_rows, n_columns))
    df = pd.DataFrame(arr, columns=[str(i) for i in range(n_columns)])
    df.to_parquet(dummy_file_path)
    size_gib = os.path.getsize(dummy_file_path) / 1024**3
    print(f"With {n_rows} rows and {n_columns} columns the new file is {size_gib} GiB on disk")
    return dummy_file_path

### Super small data
Output from the first cell when I actually ran it:
```
With 50000 rows and 2 columns the new file is 0.0009336844086647034 GiB on disk
```

In [4]:
# No fancy way to save my "inventory" of dummy files for future runs yet
#smallest_file = generate_dummy_data(5*10**4, 2, "smallest")

In [5]:
smallest_file = os.path.join(default_dataset_path, "smallest.parquet")

#### Try with ray data

In [6]:
ds = ray.data.read_parquet(smallest_file)

In [7]:
ds.show(3)

{'0': -0.8455730423087089, '1': -0.23197710872317917}
{'0': 1.57761716047386, '1': 0.2675177338719423}
{'0': -0.8469077067951065, '1': 0.465946115922862}


In [8]:
def dummy_transform_batch(t: pa.Table) -> pd.DataFrame:
    """Collapse one Arrow batch into a single-column frame of per-row sums.

    Args:
        t: One batch of the dataset as a pyarrow Table.

    Returns:
        A one-column pandas DataFrame holding, for each row of the batch,
        the sum across all of its columns.
    """
    return t.to_pandas().sum(axis=1).to_frame()

# Request Arrow batches explicitly: the transform calls `Table.to_pandas()`, so it
# should not depend on whichever batch format map_batches uses by default.
ds2 = ds.map_batches(dummy_transform_batch, batch_format="pyarrow")

Map Progress: 100%|██████████| 1/1 [00:00<00:00,  4.82it/s]


In [9]:
ds2.show(3)

{'0': -1.0775501510318881}
{'0': 1.8451348943458024}
{'0': -0.3809615908722445}


#### Try with Modin

In [10]:
mdf = mpd.read_parquet(smallest_file)

In [11]:
mdf.head(3)

Unnamed: 0,0,1
0,-0.845573,-0.231977
1,1.577617,0.267518
2,-0.846908,0.465946


In [12]:
mdf2 = mdf.sum(axis=1).to_frame()

In [13]:
mdf2.head(3)

Unnamed: 0,0
0,-1.07755
1,1.845135
2,-0.380962


#### Try with pyarrow

In [14]:
pqt = pq.read_table(smallest_file)

In [15]:
pqdf = pqt.to_pandas()

In [16]:
pqdf.head(3)

Unnamed: 0,0,1
0,-0.845573,-0.231977
1,1.577617,0.267518
2,-0.846908,0.465946


In [17]:
pqdf2 = pqdf.sum(axis=1).to_frame()

In [18]:
pqdf2.head(3)

Unnamed: 0,0
0,-1.07755
1,1.845135
2,-0.380962


### Smallish data
Output from the first cell when I actually ran it:
```
With 500000 rows and 20 columns the new file is 0.0797198498621583 GiB on disk
```

In [4]:
# No fancy way to save my "inventory" of dummy files for future runs yet
#smallish_file = generate_dummy_data(5*10**5, 20, "smallish")

With 500000 rows and 20 columns the new file is 0.07971985079348087 GiB on disk


In [5]:
smallish_file = os.path.join(default_dataset_path, "smallish.parquet")

#### Try with ray data

In [6]:
ds = ray.data.read_parquet(smallish_file)

In [7]:
ds.show(3)

{'0': -0.6858277450841938, '1': -0.8967399942464025, '2': 1.5118259559221787, '3': -0.3045707242693841, '4': -0.08883015096473798, '5': -0.7973339363812424, '6': 0.9879107736937893, '7': 1.1198899511152156, '8': 0.6474586884068384, '9': 0.3484700545960988, '10': 0.7464934706681945, '11': -1.0705226111663273, '12': -0.3331889925060301, '13': -0.4273963997493709, '14': -0.20254530606073146, '15': -0.9165467006341906, '16': -1.029116484190101, '17': 0.4311535562076728, '18': 0.8211583048767542, '19': 0.895051761233368}
{'0': 0.08011940033581648, '1': -2.0967639895546957, '2': 0.7819836159845024, '3': 1.1102997165910116, '4': 1.6944586039741485, '5': -0.2943824272050674, '6': 1.2227018429505547, '7': 0.2710947377399597, '8': -1.2160816525955218, '9': -0.8286494705257316, '10': 0.37416255426565037, '11': 0.05105107872403199, '12': 0.679997969929679, '13': 0.8369786928032972, '14': -0.4301666769043128, '15': 1.1321151018722833, '16': 1.0964944468617461, '17': 0.6075235338225544, '18': -0.779

In [8]:
def dummy_transform_batch(t: pa.Table) -> pd.DataFrame:
    """Collapse one Arrow batch into a single-column frame of per-row sums.

    Args:
        t: One batch of the dataset as a pyarrow Table.

    Returns:
        A one-column pandas DataFrame holding, for each row of the batch,
        the sum across all of its columns.
    """
    return t.to_pandas().sum(axis=1).to_frame()

# Request Arrow batches explicitly: the transform calls `Table.to_pandas()`, so it
# should not depend on whichever batch format map_batches uses by default.
ds2 = ds.map_batches(dummy_transform_batch, batch_format="pyarrow")

Map Progress: 100%|██████████| 1/1 [00:00<00:00,  3.26it/s]


In [9]:
ds2.show(3)

{'0': 0.7567934714673982}
{'0': 2.232542627884457}
{'0': -6.317743194212464}


#### Try with Modin

In [10]:
mdf = mpd.read_parquet(smallish_file)

In [11]:
mdf.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.685828,-0.89674,1.511826,-0.304571,-0.08883,-0.797334,0.987911,1.11989,0.647459,0.34847,0.746493,-1.070523,-0.333189,-0.427396,-0.202545,-0.916547,-1.029116,0.431154,0.821158,0.895052
1,0.080119,-2.096764,0.781984,1.1103,1.694459,-0.294382,1.222702,0.271095,-1.216082,-0.828649,0.374163,0.051051,0.679998,0.836979,-0.430167,1.132115,1.096494,0.607524,-0.779317,-2.061078
2,-0.200059,0.787717,1.303924,1.167607,-2.663148,-1.100231,0.64334,-0.033566,-0.167996,-1.008125,-1.369326,0.747986,1.918263,-0.702239,-0.559032,-1.012587,-1.426052,-1.483214,0.493003,-1.654009


In [12]:
mdf2 = mdf.sum(axis=1).to_frame()

In [13]:
mdf2.head(3)

Unnamed: 0,0
0,0.756793
1,2.232543
2,-6.317743


#### Findings for smallish:
* Baseline is a single worker (the head worker) at 96 MiB RAM before starting
* Ray Data as follows:
    * As of `ray.data.read_parquet` I get a second worker at 96 MiB, plus the original goes up to 218 MiB, and Plasma (total only) goes up to 77 MiB. This happens as soon as it reads; the subsequent `show` didn't seem to change anything.
    * Sum operation causes Plasma to go up to 81 MiB.
* Modin as follows:
    * As of `mpd.read_parquet` I get a third worker at 96 MiB, plus the original goes up from 218 to 358 MiB, and Plasma goes up from 77 to 157 MiB.
    * Sum operation gives me a fourth worker at 70 MiB, then all four immediately go up by between 20 and 40 MiB, and plasma goes up to 184 MiB.

In sum, memory consumption actually looks pretty nice

### Medium data
Deliberately max out what I can create this way. (Twice the number of rows killed the kernel on a small HWT; I believe the dataframe conversion makes a copy and so requires twice the memory.)

Output from the first cell when I actually ran it:
```
With 10000000 rows and 20 columns the new file is 1.495505265891552 GiB on disk
```

In [5]:
# No fancy way to save my "inventory" of dummy files for future runs yet
#medium_file = generate_dummy_data(10**7, 20, "medium")

In [5]:
medium_file = os.path.join(default_dataset_path, "medium.parquet")

#### Try with ray data

In [5]:
ds = ray.data.read_parquet(medium_file)

In [6]:
ds.show(3)

2022-07-29 22:45:01,304	ERROR dataclient.py:150 -- Unrecoverable error in data channel.


ConnectionError: GRPC connection failed: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.NOT_FOUND
	details = "Failed to serialize response!"
	debug_error_string = "{"created":"@1659134701.618941918","description":"Error received from peer ipv4:172.20.212.219:10001","file":"src/core/lib/surface/call.cc","file_line":1074,"grpc_message":"Failed to serialize response!","grpc_status":5}"
>

In [9]:
# Couldn't get this far
#def dummy_transform_batch(t: pa.Table) -> pd.DataFrame:
#    return t.to_pandas().sum(axis=1).to_frame()

#ds2 = ds.map_batches(dummy_transform_batch)

In [8]:
#ds2.show(3)

#### Try with Modin

In [6]:
mdf = mpd.read_parquet(medium_file)

In [7]:
mdf.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.560002,-0.308564,-1.049303,0.008382,-0.853563,-0.258357,-1.656964,-0.014819,-0.415261,1.81215,-1.527844,1.68893,0.130555,0.34087,-0.863005,-1.829299,0.866442,1.732221,0.611116,-0.659617
1,1.137965,-0.280707,0.458817,-0.635319,-0.731099,-0.399083,-0.841399,0.839758,1.282331,0.30618,-0.445659,-0.818173,-0.176238,0.885542,-0.963602,-1.345789,0.5687,0.447883,0.750788,-0.715429
2,-1.083661,-0.385706,-0.220577,0.090654,-0.166137,0.388106,-1.007656,-2.202053,-0.196738,0.21293,1.546228,-1.774291,0.198617,0.223779,0.432537,0.760189,2.399902,0.221714,0.285044,-0.249088


In [8]:
mdf2 = mdf.sum(axis=1).to_frame()

In [9]:
mdf2.head(3)

Unnamed: 0,0
0,-2.805931
1,-0.674532
2,-0.526206


#### Findings for medium:
* Baseline is a single worker (the head worker) at 96 MiB RAM before starting
* Ray Data as follows:
    * The read operation did seem to work, and spiked up to ~3 GiB RAM on a single process before settling back to about 1.6 GiB (i.e. from roughly twice the data size down to about once the data size). Also saw about 1.6 GiB in Plasma.
    * However `show` killed it.
* Modin as follows:
    * As of `mpd.read_parquet` I get even RAM usage across 3 workers, 600-675 MiB apiece, plus 1.6 GiB in Plasma. (I am confused about the process RAM usage vs host RAM usage - the host shows about double; I'm not sure it actually works out to process + Plasma, but maybe approximately that?)
    * Show is fine
    * Sum operation seems to increase all the memory usage by like 10-20% or so, all still fine.

In sum, Modin is Better and Smarter at big un-subdivided parquet files.

### Goldilocks data
Let's try to find out exactly where ray.data kicks the bucket.
```
With 5000000 rows and 20 columns the new file is 0.7503557354211807 GiB on disk
```

In [4]:
# No fancy way to save my "inventory" of dummy files for future runs yet
#goldilocks_file = generate_dummy_data(5*10**6, 20, "goldilocks")

In [5]:
goldilocks_file = os.path.join(default_dataset_path, "goldilocks.parquet")

In [7]:
ds = ray.data.read_parquet(goldilocks_file)

In [8]:
ds.show(3)

{'0': 0.8486725261757402, '1': -1.6249991954356522, '2': 0.3491570059240676, '3': -0.3845927848310576, '4': -1.7535548830533385, '5': -1.7415539866200016, '6': -0.06332342022296654, '7': -1.2783767682189873, '8': -1.6925961944833545, '9': 1.3495421016506435, '10': 0.5923182238453245, '11': 0.4341769539541526, '12': 2.2231331215606147, '13': 1.1358900966802883, '14': -0.14083763849360784, '15': -1.7700207958491847, '16': 1.4353060846880865, '17': -0.8130187065371839, '18': -0.8799310305137245, '19': -2.0199181029533215}
{'0': 1.3195784578309429, '1': -0.37837809775452397, '2': -0.2789803290710002, '3': -1.1319832239231604, '4': -0.532566660761417, '5': -0.530425986153597, '6': 0.64005241852152, '7': -0.09528005746394203, '8': 1.3393781704897154, '9': 0.5457471334720579, '10': 0.6154668627082043, '11': 0.07025999845420669, '12': 3.373880186735437, '13': 0.36443392326697027, '14': -0.36758903818152144, '15': 1.1488883352672523, '16': 0.37827831033563736, '17': -1.016511292241403, '18': 1.

In [9]:
# Unlike the medium file, this one does make it through the transform.
def dummy_transform_batch(t: pa.Table) -> pd.DataFrame:
    """Collapse one Arrow batch into a single-column frame of per-row sums.

    Args:
        t: One batch of the dataset as a pyarrow Table.

    Returns:
        A one-column pandas DataFrame holding, for each row of the batch,
        the sum across all of its columns.
    """
    return t.to_pandas().sum(axis=1).to_frame()

# Request Arrow batches explicitly: the transform calls `Table.to_pandas()`, so it
# should not depend on whichever batch format map_batches uses by default.
ds2 = ds.map_batches(dummy_transform_batch, batch_format="pyarrow")

Map Progress: 100%|██████████| 1/1 [00:01<00:00,  1.21s/it]


In [10]:
ds2.show(3)

{'0': -5.794527392733462}
{'0': 7.862591306479691}
{'0': 2.9912896373131694}


### Findings for goldilocks

Similar pattern to what I described for the medium data; I saw the head node total RAM go up to 3.6 GiB out of 4.0 GiB for just a second during `ds.show`; I think ray.data barely survived this. Looks like 4x data size is roughly the limit here.

## Next steps

Would be nice to see what happens to Modin with a file so big it cannot fit in memory on ALL the workers; maybe try spinning up some large HWT workspace with a tiny Ray cluster to test this out. Would especially like to see if I can make some demonstration of object spillage to disk - might need the "local storage" option enabled for that?