In [7]:
!pip install pandas==1.3.1 --quiet
!pip install 'ray[default]'


Collecting colorful
  Downloading colorful-0.5.4-py2.py3-none-any.whl (201 kB)
[K     |████████████████████████████████| 201 kB 4.6 MB/s eta 0:00:01
Collecting google-auth<2.0dev,>=1.25.0
  Downloading google_auth-1.35.0-py2.py3-none-any.whl (152 kB)
[K     |████████████████████████████████| 152 kB 75.6 MB/s eta 0:00:01
Installing collected packages: google-auth, colorful
  Attempting uninstall: google-auth
    Found existing installation: google-auth 2.0.1
    Uninstalling google-auth-2.0.1:
      Successfully uninstalled google-auth-2.0.1
Successfully installed colorful-0.5.4 google-auth-1.35.0


In [1]:
import pandas as pd
import ray
# Print the versions of the imported libraries so the outputs below can be
# tied to a known environment.
print(f'ray version {ray.__version__}')
print(f'pandas version {pd.__version__}')

ray version 1.5.2
pandas version 1.3.1


In [2]:
# Start Ray locally. To attach to an existing cluster instead, pass the head
# node's address, e.g. ray.init("ray://<head-node-ip>:10001").
# ignore_reinit_error=True keeps this cell safe to re-run in a live kernel:
# without it a second ray.init() call raises instead of reusing the session.
ray.init(ignore_reinit_error=True)


2021-08-22 04:37:01,509	INFO services.py:1245 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '10.1.86.2',
 'raylet_ip_address': '10.1.86.2',
 'redis_address': '10.1.86.2:6379',
 'object_store_address': '/tmp/ray/session_2021-08-22_04-36-59_407257_1448/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-08-22_04-36-59_407257_1448/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-08-22_04-36-59_407257_1448',
 'metrics_export_port': 64862,
 'node_id': '9446f47e8221c356577cc33660ab282edecc4333493f657f3f06f04a'}

In [3]:
@ray.remote
def f(x):
    """Return ``x`` multiplied by itself.

    Decorated with ``ray.remote``, so it is invoked as ``f.remote(x)`` and the
    value is retrieved with ``ray.get``.
    """
    squared = x * x
    return squared

# Submit one task per input first, then block once for all of the results.
futures = [f.remote(value) for value in range(4)]
print(ray.get(futures)) # [0, 1, 4, 9]

[0, 1, 4, 9]


In [4]:
@ray.remote
class Counter:
    """Minimal stateful counter, instantiated through ``Counter.remote()``."""

    def __init__(self):
        # Every instance starts counting from zero.
        self.n = 0

    def increment(self):
        """Advance the count by one."""
        self.n = self.n + 1

    def read(self):
        """Return the current count."""
        return self.n

counters = [Counter.remote() for _ in range(4)]
# The increment calls are issued only for their side effect; the references
# they return are not needed.
for counter in counters:
    counter.increment.remote()
futures = [counter.read.remote() for counter in counters]
print(ray.get(futures)) # [1, 1, 1, 1]



[1, 1, 1, 1]


In [5]:
import time
@ray.remote
def do_some_work(x):
    """Sleep for one second as a stand-in for real work, then return ``x``."""
    simulated_seconds = 1  # Replace this with the work you need to do.
    time.sleep(simulated_seconds)
    return x

start = time.time()
# Submit all four tasks before blocking, then wait for every result at once.
results = ray.get([do_some_work.remote(task_input) for task_input in range(4)])
print("duration =", time.time() - start)
print("results = ", results)



duration = 4.6006152629852295
results =  [0, 1, 2, 3]


In [6]:
def tiny_work(x):
    """Sleep for 0.1 ms as a stand-in for a very small unit of work; return ``x``."""
    pause_seconds = 0.0001  # Replace this with the work you need to do.
    time.sleep(pause_seconds)
    return x

@ray.remote
def mega_work(start, end):
    """Run ``tiny_work`` on every integer in ``range(start, end)`` within a
    single remote call and return the results as a list, in order."""
    outputs = []
    for item in range(start, end):
        outputs.append(tiny_work(item))
    return outputs

start = time.time()
# Batch the tiny tasks: 100 remote calls, each covering 1000 consecutive
# inputs. The list of object references is built directly instead of calling
# append() inside a comprehension, which only produced a throwaway list of None.
result_ids = [mega_work.remote(x * 1000, (x + 1) * 1000) for x in range(100)]
results = ray.get(result_ids)
print("duration =", time.time() - start)

duration = 18.31840491294861


## Measure the per-task overhead of a task that does no work

In [7]:
@ray.remote
def no_work(x):
    """Return ``x`` unchanged.

    A do-nothing remote function: timing calls to it measures the cost of
    submitting a task and fetching its result rather than any real work.
    """
    return x

start = time.time()
num_calls = 1000
# Each call is submitted and fetched before the next one starts, so the
# elapsed time divided by num_calls is the round-trip cost of one task.
for call_index in range(num_calls):
    ray.get(no_work.remote(call_index))
print("per task overhead (ms) =", (time.time() - start)*1000/num_calls)


per task overhead (ms) = 2.185699701309204


## Pass large data by object reference (`ray.put`) instead of by value

In [10]:
@ray.remote
def no_work(a):
    """Accept ``a``, ignore it, and return ``None``.

    Used to time how long it takes to hand an argument to a remote task.
    """
    return None

import numpy as np
start = time.time()
a = np.zeros(shape=(5000, 5000))
# The array itself is passed as the argument to each of the ten calls.
result_ids = [no_work.remote(a) for _ in range(10)]
results = ray.get(result_ids)
print("duration =", time.time() - start)

duration = 2.2644155025482178


In [11]:
start = time.time()
# Store the array once with ray.put and pass the returned reference to every
# call instead of the array itself.
a_id = ray.put(np.zeros(shape=(5000, 5000)))
result_ids = [no_work.remote(a_id) for _ in range(10)]
results = ray.get(result_ids)
print("duration =", time.time() - start)

duration = 0.6708011627197266


## use ray.wait to process data as soon as it's ready

In [13]:
## Don't do this: ray.get on the whole list blocks until every task has finished before any result is processed
import random

@ray.remote
def do_some_work(x):
    """Sleep for a random 0-4 seconds as a stand-in for real work, then return ``x``."""
    simulated_seconds = random.uniform(0, 4)  # Replace this with the work you need to do.
    time.sleep(simulated_seconds)
    return x

def process_results(results):
    """Add up the values in ``results``, sleeping one second per value as a
    stand-in for per-item processing.

    Returns 0 when ``results`` is empty.
    """
    total = 0
    for value in results:
        time.sleep(1)  # Replace this with some processing code.
        total = total + value
    return total

start = time.time()
# ray.get on the full list returns only after all four tasks have finished,
# so processing cannot start until the slowest task is done.
data_list = ray.get([do_some_work.remote(x) for x in range(4)])
# Named `total` rather than `sum`: a top-level `sum` would shadow the builtin
# for every later cell in the kernel session.
total = process_results(data_list)
print("duration =", time.time() - start, "\nresult = ", total)

duration = 14.641751527786255 
result =  6


In [14]:
@ray.remote
def do_some_work(x):
    """Sleep for a random 0-4 seconds as a stand-in for real work, then return ``x``."""
    simulated_seconds = random.uniform(0, 4)  # Replace this with the work you need to do.
    time.sleep(simulated_seconds)
    return x

def process_incremental(sum, result):
    """Fold one ``result`` into the running total ``sum`` and return the new total.

    Sleeps for one second as a stand-in for per-result processing.
    """
    time.sleep(1)  # Replace this with some processing code.
    updated_total = sum + result
    return updated_total

start = time.time()
result_ids = [do_some_work.remote(x) for x in range(4)]
# Named `total` rather than `sum`: a top-level `sum` would shadow the builtin
# for every later cell in the kernel session.
total = 0
while result_ids:
    # ray.wait splits the references into those that are ready and those still
    # pending; each pass consumes the first ready one and keeps waiting on the
    # rest, so processing overlaps with the tasks that are still running.
    done_id, result_ids = ray.wait(result_ids)
    total = process_incremental(total, ray.get(done_id[0]))
print("duration =", time.time() - start, "\nresult = ", total)

duration = 13.566273927688599 
result =  6


In [2]:
!pip install 'ray[default]'

Collecting colorful
  Downloading colorful-0.5.4-py2.py3-none-any.whl (201 kB)
[K     |████████████████████████████████| 201 kB 4.4 MB/s eta 0:00:01
Collecting google-auth<2.0dev,>=1.25.0
  Downloading google_auth-1.35.0-py2.py3-none-any.whl (152 kB)
[K     |████████████████████████████████| 152 kB 52.7 MB/s eta 0:00:01
Installing collected packages: google-auth, colorful
  Attempting uninstall: google-auth
    Found existing installation: google-auth 2.0.1
    Uninstalling google-auth-2.0.1:
      Successfully uninstalled google-auth-2.0.1
Successfully installed colorful-0.5.4 google-auth-1.35.0


In [3]:
import ray
from ray.util.dask import ray_dask_get
import dask
import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd

In [4]:
# ignore_reinit_error=True keeps this cell safe to re-run, and safe to run in
# a kernel where Ray was already started: without it a second ray.init()
# call raises instead of reusing the running session.
ray.init(ignore_reinit_error=True)

2021-08-22 19:04:55,977	INFO services.py:1245 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '10.1.103.2',
 'raylet_ip_address': '10.1.103.2',
 'redis_address': '10.1.103.2:6379',
 'object_store_address': '/tmp/ray/session_2021-08-22_19-04-51_748685_71/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-08-22_19-04-51_748685_71/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-08-22_19-04-51_748685_71',
 'metrics_export_port': 64140,
 'node_id': '7afdb126790f3c0978b01bb0280df49cdaf9de2647ecb264cca49a1a'}

In [6]:
# Wrap a 256x256 NumPy array of random integers drawn from [0, 1000) in a Dask array.
d_arr = da.from_array(np.random.randint(low=0, high=1000, size=(256, 256)))

In [7]:
# Compute the mean of the Dask array, passing ray_dask_get as the scheduler
# for this one call so that the underlying Dask task graph is submitted to Ray.
d_arr.mean().compute(scheduler=ray_dask_get)

498.50010681152344

In [8]:
# Register ray_dask_get as the scheduler in the Dask config so that later
# compute() calls do not have to pass scheduler= explicitly.
dask.config.set(scheduler=ray_dask_get)

<dask.config.set at 0x7f101acf1700>

In [9]:
# Build a two-partition Dask DataFrame from 1024 rows of random integers in
# [0, 100), then compute the mean grade per age. No scheduler is passed to
# compute() in this cell.
df = dd.from_pandas(
    pd.DataFrame(
        data=np.random.randint(low=0, high=100, size=(1024, 2)),
        columns=["age", "grade"],
    ),
    npartitions=2,
)
df.groupby(["age"]).mean().compute()

Unnamed: 0_level_0,grade
age,Unnamed: 1_level_1
0,48.777778
1,65.625000
2,43.875000
3,45.000000
4,52.733333
...,...
95,48.125000
96,66.375000
97,53.928571
98,53.153846


In [10]:
# Shut down the Ray session started by ray.init() now that the notebook is done.
ray.shutdown()